Add spring-configurable Tika-powered metadata extractor, content transformer and extractor

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@22683 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
2025-07-31 17:39:05 +00:00 · 2010-09-24 11:02:22 +00:00
parent f95cb3c51b
commit 1b62e9bc01
4 changed files with 536 additions and 0 deletions
--- a/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.java
@@ -0,0 +1,116 @@
 /*
 * Copyright (C) 2005-2010 Alfresco Software Limited.
 *
 * This file is part of Alfresco
 *
 * Alfresco is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Alfresco is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
 */
 package org.alfresco.repo.content.metadata;
 import java.util.ArrayList;
 import java.util.HashSet;
 import org.alfresco.error.AlfrescoRuntimeException;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 /**
 * A Metadata Extractor which makes use of Apache Tika,
 *  and allows the selection of the Tika parser to be
 *  sprung-in to extract the metadata from your document.
 * This is typically used with custom Tika Parsers.
 * The parser may be supplied either as a ready-made instance
 *  (see {@link #setTikaParser(Parser)}) or as a class name
 *  (see {@link #setTikaParserName(String)}).
 * <pre>
 *   <b>author:</b>                 --      cm:author
 *   <b>title:</b>                  --      cm:title
 *   <b>subject:</b>                --      cm:description
 *   <b>created:</b>                --      cm:created
 *   <b>comments:</b>
 *   <b>geo:lat:</b>                --      cm:latitude
 *   <b>geo:long:</b>               --      cm:longitude
 * </pre>
 *
 * @author Nick Burch
 */
 public class TikaSpringConfiguredMetadataExtracter extends TikaPoweredMetadataExtracter
 {
    protected static Log logger = LogFactory.getLog(TikaSpringConfiguredMetadataExtracter.class);
    /** The parser to use; either injected directly, or created from the configured class */
    private Parser tikaParser;
    /** The class name most recently passed to {@link #setTikaParserName(String)} */
    private String tikaParserClassName;
    /** The loaded class for {@link #tikaParserClassName} */
    private Class<? extends Parser> tikaParserClass;
    /**
     * Injects the name of the Tika parser to use.
     * The class is loaded, and the parser returned by {@link #getParser()}
     *  is then passed to {@link #setTikaParser(Parser)}. Note that if a
     *  parser instance has already been injected, {@link #getParser()}
     *  returns that instance rather than creating one from this class.
     *
     * @param className the fully qualified class name of the Tika parser
     * @throws AlfrescoRuntimeException if the class cannot be found, is not
     *          a Tika {@link Parser}, or cannot be instantiated
     */
    public void setTikaParserName(String className)
    {
       tikaParserClassName = className;
       // Load the class, keeping the underlying cause if it can't be found
       Class<?> loadedClass;
       try {
          loadedClass = Class.forName(tikaParserClassName);
       } catch(ClassNotFoundException e) {
          throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' not found", e);
       }
       // Check the type up front, so that a mis-configured class name is
       //  reported clearly and the wrong class is never instantiated
       try {
          tikaParserClass = loadedClass.asSubclass(Parser.class);
       } catch(ClassCastException e) {
          throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' does not implement " + Parser.class.getName(), e);
       }
       setTikaParser(getParser());
    }
    /**
     * Injects the Tika parser to use, and registers the types which
     *  the parser reports that it supports as our supported mimetypes.
     *
     * @param tikaParser the Tika parser to use, must not be null
     */
    public void setTikaParser(Parser tikaParser)
    {
       this.tikaParser = tikaParser;
       // Build the mime types, updating the copy our parent
       //  holds for us as we go along
       ArrayList<String> mimetypes = new ArrayList<String>();
       for(MediaType mt : tikaParser.getSupportedTypes(new ParseContext()))
       {
          mimetypes.add( mt.toString() );
       }
       super.setSupportedMimetypes(mimetypes);
    }
    /**
     * Creates the extracter with no supported mimetypes. These are
     *  filled in once a parser has been injected.
     */
    public TikaSpringConfiguredMetadataExtracter()
    {
       super(new HashSet<String>());
    }
    /**
     * Returns the Tika parser. This is the injected parser if there is
     *  one, otherwise a new instance of the configured parser class.
     *
     * @return the Tika parser to use
     * @throws AlfrescoRuntimeException if the configured parser class
     *          cannot be instantiated
     */
    protected Parser getParser()
    {
       // If we were given a whole parser, return it
       if(tikaParser != null)
       {
          return tikaParser;
       }
       // Otherwise create a new one
       try {
          return tikaParserClass.newInstance();
       } catch (InstantiationException e) {
          throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
       } catch (IllegalAccessException e) {
          throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
       }
    }
 }
--- a/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.properties
+++ b/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.properties
@@ -0,0 +1,20 @@
 #
 # TikaSpringConfiguredMetadataExtracter.properties - default mapping
 #
 # Maps metadata keys from the Tika and standard namespaces
 #  onto your content model. These mappings apply to custom Tika
 #  parsers; this one file is shared across all custom parsers.
 #
 # author: Nick Burch
 # Namespaces
 namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
 # Mappings
 author=cm:author
 title=cm:title
 description=cm:description
 created=cm:created
 geo\:lat=cm:latitude
 geo\:long=cm:longitude
--- a/source/java/org/alfresco/repo/content/transform/TikaPoweredContainerExtractor.java
+++ b/source/java/org/alfresco/repo/content/transform/TikaPoweredContainerExtractor.java
@@ -0,0 +1,300 @@
 /*
 * Copyright (C) 2005-2010 Alfresco Software Limited.
 *
 * This file is part of Alfresco
 *
 * Alfresco is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Alfresco is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
 */
 package org.alfresco.repo.content.transform;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.StringTokenizer;
 import org.alfresco.error.AlfrescoRuntimeException;
 import org.alfresco.model.ContentModel;
 import org.alfresco.repo.action.ParameterDefinitionImpl;
 import org.alfresco.repo.action.executer.ActionExecuterAbstractBase;
 import org.alfresco.service.cmr.action.Action;
 import org.alfresco.service.cmr.action.ParameterDefinition;
 import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
 import org.alfresco.service.cmr.repository.ContentReader;
 import org.alfresco.service.cmr.repository.ContentService;
 import org.alfresco.service.cmr.repository.ContentWriter;
 import org.alfresco.service.cmr.repository.NodeRef;
 import org.alfresco.service.cmr.repository.NodeService;
 import org.alfresco.service.namespace.QName;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.ContainerAwareDetector;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedResourceHandler;
 import org.apache.tika.extractor.ParserContainerExtractor;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 /**
 * Warning - this is a prototype service, and will likely change dramatically
 *  in Alfresco 4.0!
 *
 * This proto-service provides a way to have Apache Tika extract out
 *  certain kinds of embedded resources from within a container file.
 *
 * One use might be to extract all the images in a zip file, another might
 *  be to fetch all the Word Documents embedded in an Excel Spreadsheet.
 *
 * Uses the Apache Tika ContainerExtractor framework, along with the
 *  Apache Tika Auto-Parser.
 *
 * Each extracted resource is saved as a new content node in the
 *  primary parent folder of the source node.
 *
 * Not sprung-in by default, you will need to manually list this in
 *  an extension context file. Sample bean definitions and message
 *  bundle entries are given in the comments at the end of this class.
 *
 * @author Nick Burch
 */
 public class TikaPoweredContainerExtractor
 {
    private static final Log logger = LogFactory.getLog(TikaPoweredContainerExtractor.class);
    private NodeService nodeService;
    private ContentService contentService;
    private final AutoDetectParser parser;
    private final Detector detector;
    /**
     * Builds the container aware detector from the default Tika
     *  config, and an auto-detecting parser which uses it.
     */
    public TikaPoweredContainerExtractor() 
    {
       TikaConfig config = TikaConfig.getDefaultConfig();
       detector = new ContainerAwareDetector(
             config.getMimeRepository()
       );
       parser = new AutoDetectParser(detector);
    }
    /**
     * Injects the nodeService bean.
     * 
     * @param nodeService the nodeService.
     */
    public void setNodeService(NodeService nodeService)
    {
        this.nodeService = nodeService;
    }
    /**
     * Injects the contentService bean.
     * 
     * @param contentService the contentService.
     */
    public void setContentService(ContentService contentService)
    {
        this.contentService = contentService;
    }
    /**
     * Extracts out all the entries from the container
     *  that match the supplied list of mime types.
     * If no mime types are specified, extracts all
     *  available embedded resources. 
     *
     * @param source the node holding the container file
     * @param mimetypes the mime types to extract, or null / empty for all
     * @return the nodes created for the extracted resources
     * @throws AlfrescoRuntimeException if Tika fails to process the container
     */
    public List<NodeRef> extract(NodeRef source, List<String> mimetypes)
    {
       // Grab the directory to put the nodes into
       // Will be the parent folder of the source
       NodeRef folder = nodeService.getPrimaryParent(source).getParentRef();
       // Build the recursing parser
       Extractor handler = new Extractor(folder, mimetypes);
       // Have Tika look for things
       ParserContainerExtractor extractor = new ParserContainerExtractor(
             parser, detector
       );
       // Get the contents. This is done last, so that nothing can
       //  fail between opening the stream and the try block below
       ContentReader reader = contentService.getReader(source, ContentModel.PROP_CONTENT);
       TikaInputStream stream = TikaInputStream.get(reader.getContentInputStream());
       try {
          logger.info("Beginning extraction of " + source.toString());
          extractor.extract(stream, null, handler);
          logger.info("Completed extraction of " + source.toString());
       } catch(TikaException te) {
          throw new AlfrescoRuntimeException("Extraction Failed", te);
       } catch(IOException ie) {
          throw new AlfrescoRuntimeException("Extraction Failed", ie);
       } finally {
          // Tidy up, even if the extraction failed part way through
          try {
             stream.close();
          } catch(IOException e) {
             // Closing is best-effort, so don't fail the extraction over it
             logger.debug("Unable to close the content stream of " + source.toString(), e);
          }
       }
       // All done
       return handler.extracted;
    }
    /**
     * This EmbeddedResourceHandler is called by Tika for each
     *  embedded resource. It decides if the resource is to
     *  be extracted or not, and if it is, saves it into the
     *  specified folder.
     */
    private class Extractor implements EmbeddedResourceHandler
    {
       /** The nodes created so far, one for each extracted resource */
       private final List<NodeRef> extracted;
       /** The types to extract, or null if everything is wanted */
       private Set<MediaType> acceptTypes;
       /** The folder in which the new nodes are created */
       private final NodeRef folder;
       /** How many resources without a filename have been seen so far */
       private int anonymousCount = 0;
       /**
        * @param folder the folder to create the extracted nodes in
        * @param types the mime types to accept, or null / empty to accept all
        */
       private Extractor(NodeRef folder, List<String> types)
       {
          this.folder = folder;
          this.extracted = new ArrayList<NodeRef>();
          if(types != null && types.size() > 0)
          {
             acceptTypes = new HashSet<MediaType>();
             for(String type : types)
             {
                acceptTypes.add(MediaType.parse(type));
             }
          }
       }
       /**
        * Saves the embedded resource as a new content node if its
        *  type is wanted, otherwise skips over it.
        *
        * @param filename the name of the resource, or null if it has none
        * @param mediaType the type of the resource
        * @param stream the contents of the resource
        */
       @Override
       public void handle(String filename, MediaType mediaType,
             InputStream stream) {
          // Do we want it?
          if(acceptTypes == null || acceptTypes.contains(mediaType)) 
          {
             // Ensure we have a filename
             if(filename == null) 
             {
                anonymousCount++;
                filename = "embedded"+anonymousCount+"."+mediaType.getSubtype();
             }
             logger.info("Extracting embedded " + mediaType +  " entry " + filename);
             // Save it
             Map<QName,Serializable> properties = new HashMap<QName,Serializable>();
             properties.put(ContentModel.PROP_NAME, filename);
             NodeRef node = nodeService.createNode(
                   folder,
                   ContentModel.ASSOC_CONTAINS,
                   QName.createQName(filename),
                   ContentModel.TYPE_CONTENT,
                   properties
             ).getChildRef();
             ContentWriter writer = contentService.getWriter(
                   node, ContentModel.PROP_CONTENT, true
             );
             writer.setMimetype(mediaType.toString());
             writer.putContent(stream);
             // Record the new node, so that extract() is able to return it
             extracted.add(node);
          }
          else
          {
             logger.info("Skipping embedded " + mediaType +  " entry " + filename);
          }
       }
    }
    /**
     * This action executor allows you to trigger extraction as an
     *  action, perhaps from a rule. 
     * 
     * Not sprung-in by default, you will need to manually list this in
     *  an extension context file. You will also need to add properties
     *  files entries.
     */
    public static class ExtractorActionExecutor extends ActionExecuterAbstractBase
    {
       public static final String NAME = "extractEmbeddedResources";
       /** Optional parameter holding a comma separated list of mime types to extract */
       public static final String PARAM_MIME_TYPES = "mime-types";
       private TikaPoweredContainerExtractor extractor;
       /**
        * Injects the extractor which does the real work.
        *
        * @param extractor the container extractor to delegate to
        */
       public void setTikaPoweredContainerExtractor(TikaPoweredContainerExtractor extractor)
       {
          this.extractor = extractor;
       }
       /**
        * Registers the single, optional, text parameter {@link #PARAM_MIME_TYPES}.
        */
       @Override
       protected void addParameterDefinitions(List<ParameterDefinition> paramList) {
          paramList.add(new ParameterDefinitionImpl(
                PARAM_MIME_TYPES,
                DataTypeDefinition.TEXT,
                false,
                getParamDisplayLabel(PARAM_MIME_TYPES)
          ));
       }
       /**
        * Splits the comma separated mime types parameter, if given, and
        *  runs the extraction against the actioned upon node. With no
        *  parameter value, all embedded resources are extracted.
        */
       @Override
       protected void executeImpl(Action action, NodeRef actionedUponNodeRef) {
          List<String> mimeTypes = null;
          String rawTypes = (String)action.getParameterValue(PARAM_MIME_TYPES);
          if(rawTypes != null && rawTypes.length() > 0)
          {
             mimeTypes = new ArrayList<String>();
             StringTokenizer st = new StringTokenizer(rawTypes, ",");
             while(st.hasMoreTokens())
             {
                mimeTypes.add( st.nextToken().trim() );
             }
          }
          extractor.extract(actionedUponNodeRef, mimeTypes);
       }
    }
 /*
 Sample extension context file, defining the extractor and its action:
 <?xml version='1.0' encoding='UTF-8'?>
 <!DOCTYPE beans PUBLIC '-//SPRING//DTD BEAN//EN' 'http://www.springframework.org/dtd/spring-beans.dtd'>
 <beans>
   <bean id="tikaPoweredContainerExtractor" class="org.alfresco.repo.content.transform.TikaPoweredContainerExtractor">
       <property name="nodeService">
          <ref bean="NodeService" />
       </property>
       <property name="contentService">
          <ref bean="ContentService" />
       </property>
   </bean>
   <bean id="extractEmbeddedResources" class="org.alfresco.repo.content.transform.TikaPoweredContainerExtractor$ExtractorActionExecutor" parent="action-executer">
       <property name="tikaPoweredContainerExtractor">
          <ref bean="tikaPoweredContainerExtractor" />
       </property>
   </bean>
   <bean id="extractEmbeddedResources-action-messages" class="org.alfresco.i18n.ResourceBundleBootstrapComponent">
       <property name="resourceBundles">
          <list>
            <value>alfresco.extension.extractor-action-messages</value>
          </list>
        </property>
   </bean>
 </beans> 
 */
 /*
 Sample entries for the message bundle listed in the context file above:
 extractEmbeddedResources.title=Extract embedded resources
 extractEmbeddedResources.description=Extract resources from within container files, such as .zip or .docx
 extractEmbeddedResources.param_mime-types.display-label=Mime Types
 */
 }
--- a/source/java/org/alfresco/repo/content/transform/TikaSpringConfiguredContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/TikaSpringConfiguredContentTransformer.java
@@ -0,0 +1,100 @@
 /*
 * Copyright (C) 2005-2010 Alfresco Software Limited.
 *
 * This file is part of Alfresco
 *
 * Alfresco is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Alfresco is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
 */
 package org.alfresco.repo.content.transform;
 import java.util.ArrayList;
 import org.alfresco.error.AlfrescoRuntimeException;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 /**
 * A Content Extractor for XML, HTML and Text, which makes
 *  use of Apache Tika, and allows the selection of the
 *  Tika parser to be sprung-in.
 * Using spring, you list the Tika parser to use, which
 *  may well not be a standard Tika one. You should specify
 *  either a spring created bean (see {@link #setTikaParser(Parser)}),
 *  or a parser class name (see {@link #setTikaParserName(String)}).
 *
 * @author Nick Burch
 */
 public class TikaSpringConfiguredContentTransformer extends TikaPoweredContentTransformer
 {
    /** The parser to use; either injected directly, or created from the configured class */
    private Parser tikaParser;
    /** The class name most recently passed to {@link #setTikaParserName(String)} */
    private String tikaParserClassName;
    /** The loaded class for {@link #tikaParserClassName} */
    private Class<? extends Parser> tikaParserClass;
    /**
     * Injects the name of the Tika parser to use.
     * The class is loaded, and the parser returned by {@link #getParser()}
     *  is then passed to {@link #setTikaParser(Parser)}. Note that if a
     *  parser instance has already been injected, {@link #getParser()}
     *  returns that instance rather than creating one from this class.
     *
     * @param className the fully qualified class name of the Tika parser
     * @throws AlfrescoRuntimeException if the class cannot be found, is not
     *          a Tika {@link Parser}, or cannot be instantiated
     */
    public void setTikaParserName(String className)
    {
       tikaParserClassName = className;
       // Load the class, keeping the underlying cause if it can't be found
       Class<?> loadedClass;
       try {
          loadedClass = Class.forName(tikaParserClassName);
       } catch(ClassNotFoundException e) {
          throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' not found", e);
       }
       // Check the type up front, so that a mis-configured class name is
       //  reported clearly and the wrong class is never instantiated
       try {
          tikaParserClass = loadedClass.asSubclass(Parser.class);
       } catch(ClassCastException e) {
          throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' does not implement " + Parser.class.getName(), e);
       }
       setTikaParser(getParser());
    }
    /**
     * Injects the Tika parser to use, and adds the types which the
     *  parser reports that it supports to our source mime types.
     *
     * @param tikaParser the Tika parser to use, must not be null
     */
    public void setTikaParser(Parser tikaParser)
    {
       this.tikaParser = tikaParser;
       // Build the mime types, updating the copy our parent
       //  holds for us as we go along
       for(MediaType mt : tikaParser.getSupportedTypes(new ParseContext()))
       {
          super.sourceMimeTypes.add( mt.toString() );
       }
    }
    /**
     * Creates the transformer with no source mime types. These are
     *  filled in once a parser has been injected.
     */
    public TikaSpringConfiguredContentTransformer() {
       super(new ArrayList<String>());
    }
    /**
     * Returns the Tika parser. This is the injected parser if there is
     *  one, otherwise a new instance of the configured parser class.
     *
     * @return the Tika parser to use
     * @throws AlfrescoRuntimeException if the configured parser class
     *          cannot be instantiated
     */
    protected Parser getParser()
    {
       // If we were given a whole parser, return it
       if(tikaParser != null)
       {
          return tikaParser;
       }
       // Otherwise create a new one
       try {
          return tikaParserClass.newInstance();
       } catch (InstantiationException e) {
          throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
       } catch (IllegalAccessException e) {
          throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
       }
    }
 }