diff --git a/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.java new file mode 100644 index 0000000000..6e4447d6c9 --- /dev/null +++ b/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.java @@ -0,0 +1,116 @@ +/* + * Copyright (C) 2005-2010 Alfresco Software Limited. + * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see <http://www.gnu.org/licenses/>. + */ +package org.alfresco.repo.content.metadata; + +import java.util.ArrayList; +import java.util.HashSet; + +import org.alfresco.error.AlfrescoRuntimeException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; + +/** + * A Metadata Extractor which makes use of Apache Tika, + * and allows the selection of the Tika parser to be + * sprung-in to extract the metadata from your document. + * This is typically used with custom Tika Parsers. + + *
+ *   author:                 --      cm:author
+ *   title:                  --      cm:title
+ *   subject:                --      cm:description
+ *   created:                --      cm:created
+ *   comments:
+ *   geo:lat:                --      cm:latitude
+ *   geo:long:               --      cm:longitude
+ *
+ * + * @author Nick Burch + */
public class TikaSpringConfiguredMetadataExtracter extends TikaPoweredMetadataExtracter
{
    protected static Log logger = LogFactory.getLog(TikaSpringConfiguredMetadataExtracter.class);

    /** The parser in use: either injected directly, or created from {@link #tikaParserClass}. */
    private Parser tikaParser;
    /** Class name of the parser, as given to {@link #setTikaParserName(String)}. */
    private String tikaParserClassName;
    /** The loaded parser class; null when only a parser instance was injected. */
    private Class<? extends Parser> tikaParserClass;

    /**
     * Injects the name of the Tika parser to use.
     * The class is loaded and a parser is obtained from it straight away,
     * so that the mimetypes it supports can be registered. If a parser
     * instance has already been injected, that instance is kept.
     *
     * @param className the fully qualified class name of the Tika parser
     * @throws AlfrescoRuntimeException if the class can't be found or instantiated
     * @throws ClassCastException if the class is not a Tika {@link Parser}
     */
    public void setTikaParserName(String className)
    {
        tikaParserClassName = className;

        // Load the class, checking that it really is a Parser
        try
        {
            tikaParserClass = Class.forName(tikaParserClassName).asSubclass(Parser.class);
        }
        catch(ClassNotFoundException e)
        {
            // Keep the cause, so the class loading problem can be diagnosed
            throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' not found", e);
        }

        setTikaParser(getParser());
    }

    /**
     * Injects the Tika parser to use, and registers the
     * mimetypes it supports with the parent class.
     *
     * @param tikaParser the parser to use
     */
    public void setTikaParser(Parser tikaParser)
    {
        this.tikaParser = tikaParser;

        // Build the mime types, updating the copy our parent
        //  holds for us as we go along
        ArrayList<String> mimetypes = new ArrayList<String>();
        for(MediaType mt : tikaParser.getSupportedTypes(new ParseContext()))
        {
            mimetypes.add( mt.toString() );
        }
        super.setSupportedMimetypes(mimetypes);
    }

    /**
     * Creates the extracter with no supported mimetypes; these are
     * filled in once a parser has been injected.
     */
    public TikaSpringConfiguredMetadataExtracter()
    {
        super(new HashSet<String>());
    }

    /**
     * Returns the Tika parser.
     *
     * @return the injected parser if there is one, otherwise a new
     *         instance of the configured parser class
     * @throws AlfrescoRuntimeException if no parser or parser class has been
     *         configured, or the parser class can't be instantiated
     */
    protected Parser getParser()
    {
        // If we were given a whole parser, return it
        if(tikaParser != null)
        {
            return tikaParser;
        }

        // Fail with a clear message rather than a NullPointerException
        if(tikaParserClass == null)
        {
            throw new AlfrescoRuntimeException("No Tika Parser configured, set either tikaParser or tikaParserName");
        }

        // Otherwise create a new one
        try
        {
            return tikaParserClass.newInstance();
        }
        catch (InstantiationException e)
        {
            throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
        }
        catch (IllegalAccessException e)
        {
            throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
        }
    }
}
diff --git a/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.properties
b/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.properties new file mode 100644 index 0000000000..5a323c946f --- /dev/null +++ b/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.properties @@ -0,0 +1,20 @@ +# +# TikaSpringConfiguredMetadataExtracter.properties - default mapping +# +# This is used to map from the Tika and standard namespaces +# onto your content model. This is used for custom tika parsers, +# but one file is used across all custom parsers. +# +# author: Nick Burch + +# Namespaces +namespace.prefix.cm=http://www.alfresco.org/model/content/1.0 + +# Mappings +author=cm:author +title=cm:title +description=cm:description +created=cm:created + +geo\:lat=cm:latitude +geo\:long=cm:longitude diff --git a/source/java/org/alfresco/repo/content/transform/TikaPoweredContainerExtractor.java b/source/java/org/alfresco/repo/content/transform/TikaPoweredContainerExtractor.java new file mode 100644 index 0000000000..9ad65aa033 --- /dev/null +++ b/source/java/org/alfresco/repo/content/transform/TikaPoweredContainerExtractor.java @@ -0,0 +1,300 @@ +/* + * Copyright (C) 2005-2010 Alfresco Software Limited. + * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */ +package org.alfresco.repo.content.transform; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.StringTokenizer; + +import org.alfresco.error.AlfrescoRuntimeException; +import org.alfresco.model.ContentModel; +import org.alfresco.repo.action.ParameterDefinitionImpl; +import org.alfresco.repo.action.executer.ActionExecuterAbstractBase; +import org.alfresco.service.cmr.action.Action; +import org.alfresco.service.cmr.action.ParameterDefinition; +import org.alfresco.service.cmr.dictionary.DataTypeDefinition; +import org.alfresco.service.cmr.repository.ContentReader; +import org.alfresco.service.cmr.repository.ContentService; +import org.alfresco.service.cmr.repository.ContentWriter; +import org.alfresco.service.cmr.repository.NodeRef; +import org.alfresco.service.cmr.repository.NodeService; +import org.alfresco.service.namespace.QName; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.detect.ContainerAwareDetector; +import org.apache.tika.detect.Detector; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedResourceHandler; +import org.apache.tika.extractor.ParserContainerExtractor; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; + +/** + * Warning - this is a prototype service, and will likely change dramatically + * in Alfresco 4.0! + * + * This proto-service provides a way to have Apache Tika extract out + * certain kinds of embedded resources from within a container file. + * + * One use might be to extract all the images in a zip file, another might + * be to fetch all the Word Documents embedded in an Excel Spreadsheet. 
+ * + * Uses the Apache Tika ContainerExtractor framework, along with the + * Apache Tika Auto-Parser. + * + * Not sprung-in by default, you will need to manually list this in + * an extension context file. + * + * @author Nick Burch + */ +public class TikaPoweredContainerExtractor +{ + private static final Log logger = LogFactory.getLog(TikaPoweredContainerExtractor.class); + + private NodeService nodeService; + private ContentService contentService; + + private AutoDetectParser parser; + private Detector detector; + + public TikaPoweredContainerExtractor() + { + TikaConfig config = TikaConfig.getDefaultConfig(); + detector = new ContainerAwareDetector( + config.getMimeRepository() + ); + parser = new AutoDetectParser(detector); + } + + /** + * Injects the nodeService bean. + * + * @param nodeService the nodeService. + */ + public void setNodeService(NodeService nodeService) + { + this.nodeService = nodeService; + } + + /** + * Injects the contentService bean. + * + * @param contentService the contentService. + */ + public void setContentService(ContentService contentService) + { + this.contentService = contentService; + } + + /** + * Extracts out all the entries from the container + * that match the supplied list of mime types. + * If no mime types are specified, extracts all + * available embedded resources. 
+ */ + public List extract(NodeRef source, List mimetypes) + { + // Grab the directory to put the nodes into + // Will be the parent folder of the source + NodeRef folder = nodeService.getPrimaryParent(source).getParentRef(); + + // Get the contents + ContentReader reader = contentService.getReader(source, ContentModel.PROP_CONTENT); + TikaInputStream stream = TikaInputStream.get(reader.getContentInputStream()); + + // Build the recursing parser + Extractor handler = new Extractor(folder, mimetypes); + + // Have Tika look for things + ParserContainerExtractor extractor = new ParserContainerExtractor( + parser, detector + ); + try { + logger.info("Beginning extraction of " + source.toString()); + extractor.extract(stream, null, handler); + logger.info("Completed extraction of " + source.toString()); + } catch(TikaException te) { + throw new AlfrescoRuntimeException("Extraction Failed", te); + } catch(IOException ie) { + throw new AlfrescoRuntimeException("Extraction Failed", ie); + } + + // Tidy up + try { + stream.close(); + } catch(IOException e) {} + + // All done + return handler.extracted; + } + + /** + * This EmbeddedResourceHandler is called by Tika for each + * embedded resource. It decides if the resource is to + * be extracted or not, and if it is, saves it into the + * specified folder. + */ + private class Extractor implements EmbeddedResourceHandler + { + private List extracted; + private Set acceptTypes; + private NodeRef folder; + private int anonymousCount = 0; + + private Extractor(NodeRef folder, List types) + { + this.folder = folder; + this.extracted = new ArrayList(); + + if(types != null && types.size() > 0) + { + acceptTypes = new HashSet(); + for(String type : types) + { + acceptTypes.add(MediaType.parse(type)); + } + } + } + + @Override + public void handle(String filename, MediaType mediaType, + InputStream stream) { + // Do we want it? 
+ if(acceptTypes == null || acceptTypes.contains(mediaType)) + { + // Ensure we have a filename + if(filename == null) + { + anonymousCount++; + filename = "embedded"+anonymousCount+"."+mediaType.getSubtype(); + } + + logger.info("Extracting embedded " + mediaType + " entry " + filename); + + // Save it + Map properties = new HashMap(); + properties.put(ContentModel.PROP_NAME, filename); + NodeRef node = nodeService.createNode( + folder, + ContentModel.ASSOC_CONTAINS, + QName.createQName(filename), + ContentModel.TYPE_CONTENT, + properties + ).getChildRef(); + + ContentWriter writer = contentService.getWriter( + node, ContentModel.PROP_CONTENT, true + ); + writer.setMimetype(mediaType.toString()); + writer.putContent(stream); + } + else + { + logger.info("Skipping embedded " + mediaType + " entry " + filename); + } + } + } + + /** + * This action executor allows you to trigger extraction as an + * action, perhaps from a rule. + * + * Not sprung-in by default, you will need to manually list this in + * an extension context file. You will also need to add properties + * files entries. 
+ */ + public static class ExtractorActionExecutor extends ActionExecuterAbstractBase + { + public static final String NAME = "extractEmbeddedResources"; + public static final String PARAM_MIME_TYPES = "mime-types"; + + private TikaPoweredContainerExtractor extractor; + public void setTikaPoweredContainerExtractor(TikaPoweredContainerExtractor extractor) + { + this.extractor = extractor; + } + + @Override + protected void addParameterDefinitions(List paramList) { + paramList.add(new ParameterDefinitionImpl( + PARAM_MIME_TYPES, + DataTypeDefinition.TEXT, + false, + getParamDisplayLabel(PARAM_MIME_TYPES) + )); + } + + @Override + protected void executeImpl(Action action, NodeRef actionedUponNodeRef) { + List mimeTypes = null; + String rawTypes = (String)action.getParameterValue(PARAM_MIME_TYPES); + if(rawTypes != null && rawTypes.length() > 0) + { + mimeTypes = new ArrayList(); + StringTokenizer st = new StringTokenizer(rawTypes, ","); + while(st.hasMoreTokens()) + { + mimeTypes.add( st.nextToken().trim() ); + } + } + + extractor.extract(actionedUponNodeRef, mimeTypes); + } + } +/* + + + + + + + + + + + + + + + + + + + + alfresco.extension.extractor-action-messages + + + + + */ +/* +extractEmbeddedResources.title=Extract embedded resources +extractEmbeddedResources.description=Extract resources from within container files, such as .zip or .docx +extractEmbeddedResources.param_mime-types.display-label=Mime Types + */ +} diff --git a/source/java/org/alfresco/repo/content/transform/TikaSpringConfiguredContentTransformer.java b/source/java/org/alfresco/repo/content/transform/TikaSpringConfiguredContentTransformer.java new file mode 100644 index 0000000000..bace2fde1e --- /dev/null +++ b/source/java/org/alfresco/repo/content/transform/TikaSpringConfiguredContentTransformer.java @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2005-2010 Alfresco Software Limited. 
+ * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see <http://www.gnu.org/licenses/>. + */ +package org.alfresco.repo.content.transform; + +import java.util.ArrayList; + +import org.alfresco.error.AlfrescoRuntimeException; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; + +/** + * A Content Extractor for XML, HTML and Text, which makes + * use of Apache Tika, and allows the selection of the + * Tika parser to be sprung-in. + * Using spring, you list the Tika parser to use, which + * may well not be a standard Tika one. You should specify + * either a spring created bean, or a parser class name.
+ * + * @author Nick Burch + */
public class TikaSpringConfiguredContentTransformer extends TikaPoweredContentTransformer
{
    /** The parser in use: either injected directly, or created from {@link #tikaParserClass}. */
    private Parser tikaParser;
    /** Class name of the parser, as given to {@link #setTikaParserName(String)}. */
    private String tikaParserClassName;
    /** The loaded parser class; null when only a parser instance was injected. */
    private Class<? extends Parser> tikaParserClass;

    /**
     * Injects the name of the Tika parser to use.
     * The class is loaded and a parser is obtained from it straight away,
     * so that the mimetypes it supports can be registered. If a parser
     * instance has already been injected, that instance is kept.
     *
     * @param className the fully qualified class name of the Tika parser
     * @throws AlfrescoRuntimeException if the class can't be found or instantiated
     * @throws ClassCastException if the class is not a Tika {@link Parser}
     */
    public void setTikaParserName(String className)
    {
        tikaParserClassName = className;

        // Load the class, checking that it really is a Parser
        try
        {
            tikaParserClass = Class.forName(tikaParserClassName).asSubclass(Parser.class);
        }
        catch(ClassNotFoundException e)
        {
            // Keep the cause, so the class loading problem can be diagnosed
            throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' not found", e);
        }

        setTikaParser(getParser());
    }

    /**
     * Injects the Tika parser to use, and adds the mimetypes
     * it supports to the source mimetypes held by the parent class.
     *
     * @param tikaParser the parser to use
     */
    public void setTikaParser(Parser tikaParser)
    {
        this.tikaParser = tikaParser;

        // Build the mime types, updating the copy our parent
        //  holds for us as we go along
        for(MediaType mt : tikaParser.getSupportedTypes(new ParseContext()))
        {
            super.sourceMimeTypes.add( mt.toString() );
        }
    }

    /**
     * Creates the transformer with no source mimetypes; these are
     * filled in once a parser has been injected.
     */
    public TikaSpringConfiguredContentTransformer()
    {
        super(new ArrayList<String>());
    }

    /**
     * Returns the Tika parser.
     *
     * @return the injected parser if there is one, otherwise a new
     *         instance of the configured parser class
     * @throws AlfrescoRuntimeException if no parser or parser class has been
     *         configured, or the parser class can't be instantiated
     */
    protected Parser getParser()
    {
        // If we were given a whole parser, return it
        if(tikaParser != null)
        {
            return tikaParser;
        }

        // Fail with a clear message rather than a NullPointerException
        if(tikaParserClass == null)
        {
            throw new AlfrescoRuntimeException("No Tika Parser configured, set either tikaParser or tikaParserName");
        }

        // Otherwise create a new one
        try
        {
            return tikaParserClass.newInstance();
        }
        catch (InstantiationException e)
        {
            throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
        }
        catch (IllegalAccessException e)
        {
            throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
        }
    }
}