Add spring-configurable Tika-powered metadata extractor, content transformer and extractor

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@22683 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
2025-07-31 17:39:05 +00:00 · 2010-09-24 11:02:22 +00:00
parent f95cb3c51b
commit 1b62e9bc01
4 changed files with 536 additions and 0 deletions
--- a/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.java
@@ -0,0 +1,116 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.metadata;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+
+import org.alfresco.error.AlfrescoRuntimeException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+
+/**
+ * A Metadata Extractor which makes use of Apache Tika,
+ *  and allows the selection of the Tika parser to be
+ *  sprung-in to extract the metadata from your document.
+ * This is typically used with custom Tika Parsers.
+ *
+ * <pre>
+ *   <b>author:</b>                 --      cm:author
+ *   <b>title:</b>                  --      cm:title
+ *   <b>subject:</b>                --      cm:description
+ *   <b>created:</b>                --      cm:created
+ *   <b>comments:</b>
+ *   <b>geo:lat:</b>                --      cm:latitude
+ *   <b>geo:long:</b>               --      cm:longitude
+ * </pre>
+ * 
+ * @author Nick Burch
+ */
+public class TikaSpringConfiguredMetadataExtracter extends TikaPoweredMetadataExtracter
+{
+    protected static Log logger = LogFactory.getLog(TikaSpringConfiguredMetadataExtracter.class);
+
+    private Parser tikaParser;
+    private String tikaParserClassName;
+    private Class<? extends Parser> tikaParserClass;
+   
+    /**
+     * Injects the name of the Tika parser class to use. The class is
+     *  loaded straight away, and the parser obtained from
+     *  {@link #getParser()} is then passed to {@link #setTikaParser(Parser)}.
+     * 
+     * @param className the fully qualified class name of the Tika parser
+     * @throws AlfrescoRuntimeException if the named class cannot be found
+     */
+    @SuppressWarnings("unchecked")
+    public void setTikaParserName(String className)
+    {
+       tikaParserClassName = className;
+       
+       // Load the class
+       try {
+          tikaParserClass = (Class<? extends Parser>)Class.forName(tikaParserClassName);
+       } catch(ClassNotFoundException e) {
+          // Keep the original exception as the cause, so the class
+          //  loading failure is visible in the stack trace
+          throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' not found", e);
+       }
+       
+       // Register the parser (and with it the supported mime types)
+       setTikaParser(getParser());
+    }
+    
+    /**
+     * Injects the Tika parser to use, and hands every media type
+     *  the parser reports as supported on to our parent.
+     * 
+     * @param tikaParser the Tika parser instance to use
+     */
+    public void setTikaParser(Parser tikaParser)
+    {
+       this.tikaParser = tikaParser;
+       
+       // Ask the parser what it can handle, and collect the
+       //  string form of each media type for our parent
+       ParseContext context = new ParseContext();
+       ArrayList<String> supported = new ArrayList<String>();
+       for(MediaType mediaType : tikaParser.getSupportedTypes(context))
+       {
+          String mimetype = mediaType.toString();
+          supported.add(mimetype);
+       }
+       super.setSupportedMimetypes(supported);
+    }
+    
+    /**
+     * Creates the extracter, passing an empty set to the parent
+     *  constructor (presumably the supported mimetypes - to be confirmed
+     *  against the parent class). The mimetypes of the configured parser
+     *  are supplied when {@link #setTikaParser(Parser)} is called.
+     */
+    public TikaSpringConfiguredMetadataExtracter()
+    {
+       super(new HashSet<String>());
+    }
+    
+    /**
+     * Returns the Tika parser to use. If a parser instance has been
+     *  injected, that instance is returned. Otherwise a new instance
+     *  of the configured parser class is created.
+     * 
+     * @return the Tika parser
+     * @throws AlfrescoRuntimeException if neither a parser nor a parser
+     *          class has been configured, or if the configured class
+     *          cannot be instantiated
+     */
+    protected Parser getParser()
+    {
+       // If we were given a whole parser, return it
+       if(tikaParser != null)
+       {
+          return tikaParser;
+       }
+       
+       // Fail with a clear message, rather than a NullPointerException,
+       //  if the bean was given neither a parser nor a parser class name
+       if(tikaParserClass == null)
+       {
+          throw new AlfrescoRuntimeException("No Tika Parser or Tika Parser class name has been configured");
+       }
+       
+       // Otherwise create a new one
+       try {
+          return tikaParserClass.newInstance();
+       } catch (InstantiationException e) {
+          throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
+       } catch (IllegalAccessException e) {
+          throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
+       }
+    }
+}
--- a/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.properties
+++ b/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.properties
@@ -0,0 +1,20 @@
+#
+# TikaSpringConfiguredMetadataExtracter.properties - default mapping
+#
+# This is used to map from the Tika and standard namespaces
+#  onto your content model. This is used for custom tika parsers,
+#  but one file is used across all custom parsers.
+#
+# author: Nick Burch
+
+# Namespaces
+namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
+
+# Mappings
+author=cm:author
+title=cm:title
+description=cm:description
+created=cm:created
+
+geo\:lat=cm:latitude
+geo\:long=cm:longitude
--- a/source/java/org/alfresco/repo/content/transform/TikaPoweredContainerExtractor.java
+++ b/source/java/org/alfresco/repo/content/transform/TikaPoweredContainerExtractor.java
@@ -0,0 +1,300 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.transform;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.StringTokenizer;
+
+import org.alfresco.error.AlfrescoRuntimeException;
+import org.alfresco.model.ContentModel;
+import org.alfresco.repo.action.ParameterDefinitionImpl;
+import org.alfresco.repo.action.executer.ActionExecuterAbstractBase;
+import org.alfresco.service.cmr.action.Action;
+import org.alfresco.service.cmr.action.ParameterDefinition;
+import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
+import org.alfresco.service.cmr.repository.ContentReader;
+import org.alfresco.service.cmr.repository.ContentService;
+import org.alfresco.service.cmr.repository.ContentWriter;
+import org.alfresco.service.cmr.repository.NodeRef;
+import org.alfresco.service.cmr.repository.NodeService;
+import org.alfresco.service.namespace.QName;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.ContainerAwareDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedResourceHandler;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+
+/**
+ * Warning - this is a prototype service, and will likely change dramatically
+ *  in Alfresco 4.0!
+ * 
+ * This proto-service provides a way to have Apache Tika extract out
+ *  certain kinds of embedded resources from within a container file.
+ * 
+ * One use might be to extract all the images in a zip file, another might
+ *  be to fetch all the Word Documents embedded in an Excel Spreadsheet.  
+ *
+ * Uses the Apache Tika ContainerExtractor framework, along with the
+ *  Apache Tika Auto-Parser.
+ *  
+ * Not sprung-in by default, you will need to manually list this in
+ *  an extension context file.
+ * 
+ * @author Nick Burch
+ */
+public class TikaPoweredContainerExtractor
+{
+    private static final Log logger = LogFactory.getLog(TikaPoweredContainerExtractor.class);
+    
+    private NodeService nodeService;
+    private ContentService contentService;
+    
+    private AutoDetectParser parser;
+    private Detector detector;
+
+    /**
+     * Builds the container aware detector from the mime repository
+     *  of the default Tika configuration, along with an auto-detecting
+     *  parser that uses that detector.
+     */
+    public TikaPoweredContainerExtractor()
+    {
+       TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
+       this.detector = new ContainerAwareDetector(tikaConfig.getMimeRepository());
+       this.parser = new AutoDetectParser(this.detector);
+    }
+    
+    /**
+     * Injects the nodeService bean, which is used to find the parent
+     *  folder of the source node and to create the nodes that hold
+     *  the extracted resources.
+     * 
+     * @param nodeService the nodeService.
+     */
+    public void setNodeService(NodeService nodeService)
+    {
+        this.nodeService = nodeService;
+    }
+
+    /**
+     * Injects the contentService bean, which is used to read the
+     *  content of the source node and to write the content of the
+     *  extracted resources.
+     * 
+     * @param contentService the contentService.
+     */
+    public void setContentService(ContentService contentService)
+    {
+        this.contentService = contentService;
+    }
+
+    /**
+     * Extracts out all the entries from the container
+     *  that match the supplied list of mime types.
+     * If no mime types are specified, extracts all
+     *  available embedded resources.
+     * The target folder handed to the embedded resource handler is
+     *  the primary parent of the source node.
+     * 
+     * @param source the node holding the container file
+     * @param mimetypes the mime types to extract, or null / empty for all
+     * @return the list of nodes recorded by the embedded resource handler
+     * @throws AlfrescoRuntimeException if the source node has no content,
+     *          or if Tika fails to process the content
+     */
+    public List<NodeRef> extract(NodeRef source, List<String> mimetypes)
+    {
+       // Grab the directory to put the nodes into
+       // Will be the parent folder of the source
+       NodeRef folder = nodeService.getPrimaryParent(source).getParentRef();
+       
+       // Get the contents, giving a clear error if there aren't any
+       ContentReader reader = contentService.getReader(source, ContentModel.PROP_CONTENT);
+       if(reader == null)
+       {
+          throw new AlfrescoRuntimeException("Extraction Failed - no content found for " + source);
+       }
+       TikaInputStream stream = TikaInputStream.get(reader.getContentInputStream());
+
+       // Build the recursing parser
+       Extractor handler = new Extractor(folder, mimetypes);
+       
+       // Have Tika look for things
+       ParserContainerExtractor extractor = new ParserContainerExtractor(
+             parser, detector
+       );
+       try {
+          logger.info("Beginning extraction of " + source.toString());
+          extractor.extract(stream, null, handler);
+          logger.info("Completed extraction of " + source.toString());
+       } catch(TikaException te) {
+          throw new AlfrescoRuntimeException("Extraction Failed", te);
+       } catch(IOException ie) {
+          throw new AlfrescoRuntimeException("Extraction Failed", ie);
+       } finally {
+          // Tidy up, even when the extraction failed, so that the
+          //  underlying content stream is never left open
+          try {
+             stream.close();
+          } catch(IOException e) {
+             // Nothing more can be done, but don't hide the problem
+             logger.warn("Unable to close the content stream of " + source.toString(), e);
+          }
+       }
+       
+       // All done
+       return handler.extracted;
+    }
+    
+    /**
+     * This EmbeddedResourceHandler is called by Tika for each
+     *  embedded resource. It decides if the resource is to
+     *  be extracted or not, and if it is, saves it into the
+     *  specified folder and records the new node in
+     *  {@link #extracted}.
+     */
+    private class Extractor implements EmbeddedResourceHandler
+    {
+       /** The nodes created so far, in the order they were extracted */
+       private List<NodeRef> extracted;
+       /** The media types to extract, or null to extract everything */
+       private Set<MediaType> acceptTypes;
+       /** The folder that the new nodes are created in */
+       private NodeRef folder;
+       /** How many resources without a filename have been seen so far */
+       private int anonymousCount = 0;
+       
+       /**
+        * @param folder the folder to create the extracted nodes in
+        * @param types the mime types to accept, or null / empty to
+        *         accept every embedded resource
+        */
+       private Extractor(NodeRef folder, List<String> types)
+       {
+          this.folder = folder;
+          this.extracted = new ArrayList<NodeRef>();
+          
+          // Only build a filter if we were asked for specific types
+          if(types != null && types.size() > 0)
+          {
+             acceptTypes = new HashSet<MediaType>();
+             for(String type : types)
+             {
+                acceptTypes.add(MediaType.parse(type));
+             }
+          }
+       }
+       
+       /**
+        * Saves the embedded resource as a new content node if its
+        *  media type is wanted, otherwise skips it.
+        * 
+        * @param filename the name of the embedded resource, may be null
+        * @param mediaType the media type of the embedded resource
+        * @param stream the contents of the embedded resource
+        */
+       @Override
+       public void handle(String filename, MediaType mediaType,
+             InputStream stream) {
+          // Do we want it?
+          if(acceptTypes == null || acceptTypes.contains(mediaType)) 
+          {
+             // Ensure we have a filename
+             if(filename == null) 
+             {
+                anonymousCount++;
+                filename = "embedded"+anonymousCount+"."+mediaType.getSubtype();
+             }
+             
+             logger.info("Extracting embedded " + mediaType +  " entry " + filename);
+             
+             // Save it
+             Map<QName,Serializable> properties = new HashMap<QName,Serializable>();
+             properties.put(ContentModel.PROP_NAME, filename);
+             NodeRef node = nodeService.createNode(
+                   folder,
+                   ContentModel.ASSOC_CONTAINS,
+                   QName.createQName(filename),
+                   ContentModel.TYPE_CONTENT,
+                   properties
+             ).getChildRef();
+             
+             ContentWriter writer = contentService.getWriter(
+                   node, ContentModel.PROP_CONTENT, true
+             );
+             writer.setMimetype(mediaType.toString());
+             writer.putContent(stream);
+             
+             // Record the new node, so that extract() can report it.
+             // Without this the returned list would always be empty
+             extracted.add(node);
+          }
+          else
+          {
+             logger.info("Skipping embedded " + mediaType +  " entry " + filename);
+          }
+       }
+    }
+
+    /**
+     * This action executor allows you to trigger extraction as an
+     *  action, perhaps from a rule. 
+     * 
+     * Not sprung-in by default, you will need to manually list this in
+     *  an extension context file. You will also need to add properties
+     *  files entries.
+     */
+    public static class ExtractorActionExecutor extends ActionExecuterAbstractBase
+    {
+      /** The name of the action */
+      public static final String NAME = "extractEmbeddedResources";
+      /** Optional parameter holding a comma separated list of mime types */
+      public static final String PARAM_MIME_TYPES = "mime-types";
+
+      private TikaPoweredContainerExtractor extractor;
+      
+      /**
+       * Injects the extractor which does the actual work.
+       * 
+       * @param extractor the container extractor to delegate to
+       */
+      public void setTikaPoweredContainerExtractor(TikaPoweredContainerExtractor extractor)
+      {
+         this.extractor = extractor;
+      }
+      
+      /**
+       * Registers the single, optional, text parameter which
+       *  holds the mime types to extract.
+       */
+      @Override
+      protected void addParameterDefinitions(List<ParameterDefinition> paramList) {
+         paramList.add(new ParameterDefinitionImpl(
+               PARAM_MIME_TYPES,
+               DataTypeDefinition.TEXT,
+               false,
+               getParamDisplayLabel(PARAM_MIME_TYPES)
+         ));
+      }
+      
+      /**
+       * Splits the comma separated mime types parameter (if given), and
+       *  runs the extraction against the actioned upon node. If the
+       *  parameter is missing or empty, null is passed as the list of
+       *  mime types.
+       */
+      @Override
+      protected void executeImpl(Action action, NodeRef actionedUponNodeRef) {
+         List<String> mimeTypes = null;
+         String rawTypes = (String)action.getParameterValue(PARAM_MIME_TYPES);
+         if(rawTypes != null && rawTypes.length() > 0)
+         {
+            mimeTypes = new ArrayList<String>();
+            StringTokenizer st = new StringTokenizer(rawTypes, ",");
+            while(st.hasMoreTokens())
+            {
+               // Skip blank entries (eg from "a, ,b" or a value that is
+               //  only whitespace), so that an empty string is never
+               //  handed on as a mime type to filter by
+               String type = st.nextToken().trim();
+               if(type.length() > 0)
+               {
+                  mimeTypes.add(type);
+               }
+            }
+         }
+            
+         extractor.extract(actionedUponNodeRef, mimeTypes);
+      }
+    }
+/*
+<?xml version='1.0' encoding='UTF-8'?>
+<!DOCTYPE beans PUBLIC '-//SPRING//DTD BEAN//EN' 'http://www.springframework.org/dtd/spring-beans.dtd'>
+<beans>
+   <bean id="tikaPoweredContainerExtractor" class="org.alfresco.repo.content.transform.TikaPoweredContainerExtractor">
+       <property name="nodeService">
+          <ref bean="NodeService" />
+       </property>
+       <property name="contentService">
+          <ref bean="ContentService" />
+       </property>
+   </bean>
+   <bean id="extractEmbeddedResources" class="org.alfresco.repo.content.transform.TikaPoweredContainerExtractor$ExtractorActionExecutor" parent="action-executer">
+       <property name="tikaPoweredContainerExtractor">
+          <ref bean="tikaPoweredContainerExtractor" />
+       </property>
+   </bean>
+   <bean id="extractEmbeddedResources-action-messages" class="org.alfresco.i18n.ResourceBundleBootstrapComponent">
+       <property name="resourceBundles">
+          <list>
+            <value>alfresco.extension.extractor-action-messages</value>
+          </list>
+        </property>
+   </bean>
+</beans> 
+ */
+/*
+extractEmbeddedResources.title=Extract embedded resources
+extractEmbeddedResources.description=Extract resources from within container files, such as .zip or .docx
+extractEmbeddedResources.param_mime-types.display-label=Mime Types
+ */
+}
--- a/source/java/org/alfresco/repo/content/transform/TikaSpringConfiguredContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/TikaSpringConfiguredContentTransformer.java
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.transform;
+
+import java.util.ArrayList;
+
+import org.alfresco.error.AlfrescoRuntimeException;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+
+/**
+ * A Content Extractor for XML, HTML and Text, which makes
+ *  use of Apache Tika, and allows the selection of the
+ *  Tika parser to be sprung-in.
+ * Using spring, you list the Tika parser to use, which
+ *  may well not be a standard Tika one. You should specify
+ *  either a spring created bean, or a parser class name.
+ * 
+ * @author Nick Burch
+ */
+public class TikaSpringConfiguredContentTransformer extends TikaPoweredContentTransformer
+{
+    private Parser tikaParser;
+    private String tikaParserClassName;
+    private Class<? extends Parser> tikaParserClass;
+   
+    /**
+     * Injects the name of the Tika parser class to use. The class is
+     *  loaded straight away, and the parser obtained from
+     *  {@link #getParser()} is then passed to {@link #setTikaParser(Parser)}.
+     * 
+     * @param className the fully qualified class name of the Tika parser
+     * @throws AlfrescoRuntimeException if the named class cannot be found
+     */
+    @SuppressWarnings("unchecked")
+    public void setTikaParserName(String className)
+    {
+       tikaParserClassName = className;
+       
+       // Load the class
+       try {
+          tikaParserClass = (Class<? extends Parser>)Class.forName(tikaParserClassName);
+       } catch(ClassNotFoundException e) {
+          // Keep the original exception as the cause, so the class
+          //  loading failure is visible in the stack trace
+          throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' not found", e);
+       }
+       
+       // Register the parser (and with it the source mime types)
+       setTikaParser(getParser());
+    }
+    
+    /**
+     * Injects the Tika parser to use, and adds every media type the
+     *  parser reports as supported to the source mime types held
+     *  by our parent.
+     * 
+     * @param tikaParser the Tika parser instance to use
+     */
+    public void setTikaParser(Parser tikaParser)
+    {
+       this.tikaParser = tikaParser;
+       
+       // Ask the parser what it can handle, and record the string
+       //  form of each media type in the list our parent holds
+       ParseContext context = new ParseContext();
+       for(MediaType mediaType : tikaParser.getSupportedTypes(context))
+       {
+          String mimetype = mediaType.toString();
+          super.sourceMimeTypes.add(mimetype);
+       }
+    }
+   
+    /**
+     * Creates the transformer, passing an empty list to the parent
+     *  constructor. The mime types of the configured parser are added
+     *  to the parent's source mime types when
+     *  {@link #setTikaParser(Parser)} is called.
+     */
+    public TikaSpringConfiguredContentTransformer() {
+       super(new ArrayList<String>());
+    }
+    
+    /**
+     * Returns the Tika parser to use. If a parser instance has been
+     *  injected, that instance is returned. Otherwise a new instance
+     *  of the configured parser class is created.
+     * 
+     * @return the Tika parser
+     * @throws AlfrescoRuntimeException if neither a parser nor a parser
+     *          class has been configured, or if the configured class
+     *          cannot be instantiated
+     */
+    protected Parser getParser()
+    {
+       // If we were given a whole parser, return it
+       if(tikaParser != null)
+       {
+          return tikaParser;
+       }
+       
+       // Fail with a clear message, rather than a NullPointerException,
+       //  if the bean was given neither a parser nor a parser class name
+       if(tikaParserClass == null)
+       {
+          throw new AlfrescoRuntimeException("No Tika Parser or Tika Parser class name has been configured");
+       }
+       
+       // Otherwise create a new one
+       try {
+          return tikaParserClass.newInstance();
+       } catch (InstantiationException e) {
+          throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
+       } catch (IllegalAccessException e) {
+          throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
+       }
+    }
+}