Add spring-configurable Tika-powered metadata extractor, content transformer and extractor

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@22683 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-09-24 11:02:22 +00:00
parent f95cb3c51b
commit 1b62e9bc01
4 changed files with 536 additions and 0 deletions

View File

@@ -0,0 +1,116 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.metadata;
import java.util.ArrayList;
import java.util.HashSet;
import org.alfresco.error.AlfrescoRuntimeException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
/**
* A Metadata Extractor which makes use of Apache Tika,
* and allows the selection of the Tika parser to be
* sprung-in to extract the metadata from your document.
* This is typically used with custom Tika Parsers.
* <pre>
* <b>author:</b> -- cm:author
* <b>title:</b> -- cm:title
* <b>subject:</b> -- cm:description
* <b>created:</b> -- cm:created
* <b>comments:</b>
* <b>geo:lat:</b> -- cm:latitude
* <b>geo:long:</b> -- cm:longitude
* </pre>
*
* @author Nick Burch
*/
public class TikaSpringConfiguredMetadataExtracter extends TikaPoweredMetadataExtracter
{
    protected static Log logger = LogFactory.getLog(TikaSpringConfiguredMetadataExtracter.class);

    /** The parser in use: either injected directly, or created from the configured class name. */
    private Parser tikaParser;
    /** The class name last passed to {@link #setTikaParserName(String)}, kept for error messages. */
    private String tikaParserClassName;
    /** The class resolved from {@link #tikaParserClassName}. */
    private Class<? extends Parser> tikaParserClass;

    /**
     * Injects the name of the Tika parser class to use.
     * The class is loaded and checked to be a {@link Parser}, then
     * the result of {@link #getParser()} is registered through
     * {@link #setTikaParser(Parser)}. Note that if a parser instance
     * has already been injected, {@link #getParser()} returns that
     * instance rather than creating one from this class.
     *
     * @param className the fully qualified name of a {@link Parser} implementation
     * @throws AlfrescoRuntimeException if the class cannot be found, is not
     *         a {@link Parser}, or cannot be instantiated
     */
    public void setTikaParserName(String className)
    {
        tikaParserClassName = className;

        // Load the class, verifying up front that it really is a Parser
        try
        {
            tikaParserClass = Class.forName(tikaParserClassName).asSubclass(Parser.class);
        }
        catch (ClassNotFoundException e)
        {
            // Keep the cause, so the class loading failure remains diagnosable
            throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' not found", e);
        }
        catch (ClassCastException e)
        {
            throw new AlfrescoRuntimeException(
                    "Specified Tika Parser '" + tikaParserClassName + "' does not implement " + Parser.class.getName(), e);
        }

        setTikaParser(getParser());
    }

    /**
     * Injects the Tika parser to use, and replaces the supported
     * mimetypes with the types that the parser reports.
     *
     * @param tikaParser the parser to extract metadata with
     */
    public void setTikaParser(Parser tikaParser)
    {
        this.tikaParser = tikaParser;

        // Build the mime types, updating the copy our parent
        // holds for us as we go along
        ArrayList<String> mimetypes = new ArrayList<String>();
        for (MediaType mt : tikaParser.getSupportedTypes(new ParseContext()))
        {
            mimetypes.add(mt.toString());
        }
        super.setSupportedMimetypes(mimetypes);
    }

    /**
     * Creates the extracter with no supported mimetypes; these are
     * filled in once a parser or parser class name is injected.
     */
    public TikaSpringConfiguredMetadataExtracter()
    {
        super(new HashSet<String>());
    }

    /**
     * Returns the Tika parser. An injected (or previously created)
     * parser is returned as-is, otherwise a new instance of the
     * configured parser class is created.
     *
     * @return the parser to use
     * @throws AlfrescoRuntimeException if nothing has been configured,
     *         or the configured class cannot be instantiated
     */
    protected Parser getParser()
    {
        // If we were given a whole parser, return it
        if (tikaParser != null)
        {
            return tikaParser;
        }

        // Fail with a clear message rather than a NullPointerException
        if (tikaParserClass == null)
        {
            throw new AlfrescoRuntimeException("No Tika Parser or Tika Parser class name has been configured");
        }

        // Otherwise create a new one
        try
        {
            return tikaParserClass.newInstance();
        }
        catch (InstantiationException e)
        {
            throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
        }
        catch (IllegalAccessException e)
        {
            throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
        }
    }
}

View File

@@ -0,0 +1,20 @@
#
# TikaSpringConfiguredMetadataExtracter.properties - default mapping
#
# Maps metadata keys from the Tika and standard namespaces
# onto your content model. It applies to custom Tika parsers;
# a single file is shared across all custom parsers.
#
# author: Nick Burch
# Namespaces
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
# Mappings
author=cm:author
title=cm:title
description=cm:description
created=cm:created
geo\:lat=cm:latitude
geo\:long=cm:longitude

View File

@@ -0,0 +1,300 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.transform;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.action.ParameterDefinitionImpl;
import org.alfresco.repo.action.executer.ActionExecuterAbstractBase;
import org.alfresco.service.cmr.action.Action;
import org.alfresco.service.cmr.action.ParameterDefinition;
import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentService;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.NodeRef;
import org.alfresco.service.cmr.repository.NodeService;
import org.alfresco.service.namespace.QName;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.ContainerAwareDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedResourceHandler;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
/**
* Warning - this is a prototype service, and will likely change dramatically
* in Alfresco 4.0!
*
* This proto-service provides a way to have Apache Tika extract out
* certain kinds of embedded resources from within a container file.
*
* One use might be to extract all the images in a zip file, another might
* be to fetch all the Word Documents embedded in an Excel Spreadsheet.
*
* Uses the Apache Tika ContainerExtractor framework, along with the
* Apache Tika Auto-Parser.
*
* Not sprung-in by default, you will need to manually list this in
* an extension context file.
*
* @author Nick Burch
*/
public class TikaPoweredContainerExtractor
{
    private static final Log logger = LogFactory.getLog(TikaPoweredContainerExtractor.class);

    private NodeService nodeService;
    private ContentService contentService;

    private AutoDetectParser parser;
    private Detector detector;

    /**
     * Builds a container aware detector from the default Tika
     * configuration, and an auto-detecting parser that uses it.
     */
    public TikaPoweredContainerExtractor()
    {
        TikaConfig config = TikaConfig.getDefaultConfig();
        detector = new ContainerAwareDetector(
                config.getMimeRepository()
        );
        parser = new AutoDetectParser(detector);
    }

    /**
     * Injects the nodeService bean.
     *
     * @param nodeService the nodeService.
     */
    public void setNodeService(NodeService nodeService)
    {
        this.nodeService = nodeService;
    }

    /**
     * Injects the contentService bean.
     *
     * @param contentService the contentService.
     */
    public void setContentService(ContentService contentService)
    {
        this.contentService = contentService;
    }

    /**
     * Extracts out all the entries from the container
     * that match the supplied list of mime types.
     * If no mime types are specified, extracts all
     * available embedded resources.
     * The extracted entries are created as new nodes in the
     * primary parent folder of the source node.
     *
     * @param source the container node to extract from
     * @param mimetypes the mime types to extract, or null / empty for everything
     * @return the newly created nodes, empty if the source has no content
     * @throws AlfrescoRuntimeException if Tika fails to process the container
     */
    public List<NodeRef> extract(NodeRef source, List<String> mimetypes)
    {
        // Grab the directory to put the nodes into
        // Will be the parent folder of the source
        NodeRef folder = nodeService.getPrimaryParent(source).getParentRef();

        // Get the contents
        ContentReader reader = contentService.getReader(source, ContentModel.PROP_CONTENT);
        if (reader == null)
        {
            // No content on the node, so there is nothing to extract from
            logger.info("No content found to extract from for " + source.toString());
            return new ArrayList<NodeRef>();
        }
        TikaInputStream stream = TikaInputStream.get(reader.getContentInputStream());

        // Build the recursing parser
        Extractor handler = new Extractor(folder, mimetypes);

        // Have Tika look for things
        ParserContainerExtractor extractor = new ParserContainerExtractor(
                parser, detector
        );
        try
        {
            logger.info("Beginning extraction of " + source.toString());
            extractor.extract(stream, null, handler);
            logger.info("Completed extraction of " + source.toString());
        }
        catch (TikaException te)
        {
            throw new AlfrescoRuntimeException("Extraction Failed", te);
        }
        catch (IOException ie)
        {
            throw new AlfrescoRuntimeException("Extraction Failed", ie);
        }
        finally
        {
            // Tidy up, whether or not the extraction worked
            try
            {
                stream.close();
            }
            catch (IOException e)
            {
                // Best effort only, but leave a trace of the problem
                logger.warn("Failed to close content stream of " + source.toString(), e);
            }
        }

        // All done
        return handler.extracted;
    }

    /**
     * This EmbeddedResourceHandler is called by Tika for each
     * embedded resource. It decides if the resource is to
     * be extracted or not, and if it is, saves it into the
     * specified folder.
     */
    private class Extractor implements EmbeddedResourceHandler
    {
        /** The nodes created so far */
        private List<NodeRef> extracted;
        /** The types to extract, or null to extract everything */
        private Set<MediaType> acceptTypes;
        /** Where the new nodes are created */
        private NodeRef folder;
        /** Used to generate names for resources without a filename */
        private int anonymousCount = 0;

        private Extractor(NodeRef folder, List<String> types)
        {
            this.folder = folder;
            this.extracted = new ArrayList<NodeRef>();

            if (types != null && types.size() > 0)
            {
                acceptTypes = new HashSet<MediaType>();
                for (String type : types)
                {
                    MediaType accepted = MediaType.parse(type);
                    if (accepted == null)
                    {
                        // An unparseable type can never match, so report it
                        // rather than silently storing a null entry
                        logger.warn("Ignoring invalid mime type '" + type + "'");
                        continue;
                    }
                    acceptTypes.add(accepted);
                }
            }
        }

        /**
         * Saves the embedded resource as a new content node if its
         * type is wanted, otherwise skips over it.
         *
         * @param filename the name of the embedded resource, may be null
         * @param mediaType the detected type of the embedded resource
         * @param stream the contents of the embedded resource
         */
        @Override
        public void handle(String filename, MediaType mediaType,
                InputStream stream)
        {
            // Do we want it?
            if (acceptTypes == null || acceptTypes.contains(mediaType))
            {
                // Ensure we have a filename
                if (filename == null)
                {
                    anonymousCount++;
                    filename = "embedded" + anonymousCount + "." + mediaType.getSubtype();
                }

                logger.info("Extracting embedded " + mediaType + " entry " + filename);

                // Save it
                // NOTE(review): the raw filename is used both as cm:name and as the
                // association QName, with no namespace or character clean-up - confirm
                // that embedded names with unusual characters are acceptable here
                Map<QName,Serializable> properties = new HashMap<QName,Serializable>();
                properties.put(ContentModel.PROP_NAME, filename);
                NodeRef node = nodeService.createNode(
                        folder,
                        ContentModel.ASSOC_CONTAINS,
                        QName.createQName(filename),
                        ContentModel.TYPE_CONTENT,
                        properties
                ).getChildRef();

                ContentWriter writer = contentService.getWriter(
                        node, ContentModel.PROP_CONTENT, true
                );
                writer.setMimetype(mediaType.toString());
                writer.putContent(stream);
            }
            else
            {
                logger.info("Skipping embedded " + mediaType + " entry " + filename);
            }
        }
    }

    /**
     * This action executor allows you to trigger extraction as an
     * action, perhaps from a rule.
     *
     * Not sprung-in by default, you will need to manually list this in
     * an extension context file. You will also need to add properties
     * files entries.
     */
    public static class ExtractorActionExecutor extends ActionExecuterAbstractBase
    {
        public static final String NAME = "extractEmbeddedResources";
        public static final String PARAM_MIME_TYPES = "mime-types";

        private TikaPoweredContainerExtractor extractor;

        /**
         * Injects the extractor which does the real work.
         *
         * @param extractor the extractor to delegate to
         */
        public void setTikaPoweredContainerExtractor(TikaPoweredContainerExtractor extractor)
        {
            this.extractor = extractor;
        }

        /**
         * Declares the single, optional, text parameter holding
         * the mime types to extract.
         */
        @Override
        protected void addParameterDefinitions(List<ParameterDefinition> paramList)
        {
            paramList.add(new ParameterDefinitionImpl(
                    PARAM_MIME_TYPES,
                    DataTypeDefinition.TEXT,
                    false,
                    getParamDisplayLabel(PARAM_MIME_TYPES)
            ));
        }

        /**
         * Splits the comma separated mime types parameter, if given,
         * and runs the extraction against the actioned upon node.
         */
        @Override
        protected void executeImpl(Action action, NodeRef actionedUponNodeRef)
        {
            // Leave as null if no types were given, which means extract everything
            List<String> mimeTypes = null;
            String rawTypes = (String)action.getParameterValue(PARAM_MIME_TYPES);
            if (rawTypes != null && rawTypes.length() > 0)
            {
                mimeTypes = new ArrayList<String>();
                StringTokenizer st = new StringTokenizer(rawTypes, ",");
                while (st.hasMoreTokens())
                {
                    mimeTypes.add( st.nextToken().trim() );
                }
            }

            extractor.extract(actionedUponNodeRef, mimeTypes);
        }
    }

    /*
     Example extension context file:

    <?xml version='1.0' encoding='UTF-8'?>
    <!DOCTYPE beans PUBLIC '-//SPRING//DTD BEAN//EN' 'http://www.springframework.org/dtd/spring-beans.dtd'>
    <beans>
       <bean id="tikaPoweredContainerExtractor" class="org.alfresco.repo.content.transform.TikaPoweredContainerExtractor">
          <property name="nodeService">
             <ref bean="NodeService" />
          </property>
          <property name="contentService">
             <ref bean="ContentService" />
          </property>
       </bean>
       <bean id="extractEmbeddedResources" class="org.alfresco.repo.content.transform.TikaPoweredContainerExtractor$ExtractorActionExecutor" parent="action-executer">
          <property name="tikaPoweredContainerExtractor">
             <ref bean="tikaPoweredContainerExtractor" />
          </property>
       </bean>
       <bean id="extractEmbeddedResources-action-messages" class="org.alfresco.i18n.ResourceBundleBootstrapComponent">
          <property name="resourceBundles">
             <list>
                <value>alfresco.extension.extractor-action-messages</value>
             </list>
          </property>
       </bean>
    </beans>
    */

    /*
     Example action messages properties file entries:

    extractEmbeddedResources.title=Extract embedded resources
    extractEmbeddedResources.description=Extract resources from within container files, such as .zip or .docx
    extractEmbeddedResources.param_mime-types.display-label=Mime Types
    */
}

View File

@@ -0,0 +1,100 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.transform;
import java.util.ArrayList;
import org.alfresco.error.AlfrescoRuntimeException;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
/**
* A Content Extractor for XML, HTML and Text, which makes
* use of Apache Tika, and allows the selection of the
* Tika parser to be sprung-in.
* Using spring, you list the Tika parser to use, which
* may well not be a standard Tika one. You should specify
* either a spring created bean, or a parser class name.
*
* @author Nick Burch
*/
public class TikaSpringConfiguredContentTransformer extends TikaPoweredContentTransformer
{
    /** The parser in use: either injected directly, or created from the configured class name. */
    private Parser tikaParser;
    /** The class name last passed to {@link #setTikaParserName(String)}, kept for error messages. */
    private String tikaParserClassName;
    /** The class resolved from {@link #tikaParserClassName}. */
    private Class<? extends Parser> tikaParserClass;

    /**
     * Injects the name of the Tika parser class to use.
     * The class is loaded and checked to be a {@link Parser}, then
     * the result of {@link #getParser()} is registered through
     * {@link #setTikaParser(Parser)}. Note that if a parser instance
     * has already been injected, {@link #getParser()} returns that
     * instance rather than creating one from this class.
     *
     * @param className the fully qualified name of a {@link Parser} implementation
     * @throws AlfrescoRuntimeException if the class cannot be found, is not
     *         a {@link Parser}, or cannot be instantiated
     */
    public void setTikaParserName(String className)
    {
        tikaParserClassName = className;

        // Load the class, verifying up front that it really is a Parser
        try
        {
            tikaParserClass = Class.forName(tikaParserClassName).asSubclass(Parser.class);
        }
        catch (ClassNotFoundException e)
        {
            // Keep the cause, so the class loading failure remains diagnosable
            throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' not found", e);
        }
        catch (ClassCastException e)
        {
            throw new AlfrescoRuntimeException(
                    "Specified Tika Parser '" + tikaParserClassName + "' does not implement " + Parser.class.getName(), e);
        }

        setTikaParser(getParser());
    }

    /**
     * Injects the Tika parser to use, and adds the types that the
     * parser reports to the source mime types held by the parent.
     *
     * @param tikaParser the parser to transform content with
     */
    public void setTikaParser(Parser tikaParser)
    {
        this.tikaParser = tikaParser;

        // Build the mime types, updating the copy our parent
        // holds for us as we go along
        for (MediaType mt : tikaParser.getSupportedTypes(new ParseContext()))
        {
            super.sourceMimeTypes.add( mt.toString() );
        }
    }

    /**
     * Creates the transformer with no source mime types; these are
     * filled in once a parser or parser class name is injected.
     */
    public TikaSpringConfiguredContentTransformer()
    {
        super(new ArrayList<String>());
    }

    /**
     * Returns the Tika parser. An injected (or previously created)
     * parser is returned as-is, otherwise a new instance of the
     * configured parser class is created.
     *
     * @return the parser to use
     * @throws AlfrescoRuntimeException if nothing has been configured,
     *         or the configured class cannot be instantiated
     */
    protected Parser getParser()
    {
        // If we were given a whole parser, return it
        if (tikaParser != null)
        {
            return tikaParser;
        }

        // Fail with a clear message rather than a NullPointerException
        if (tikaParserClass == null)
        {
            throw new AlfrescoRuntimeException("No Tika Parser or Tika Parser class name has been configured");
        }

        // Otherwise create a new one
        try
        {
            return tikaParserClass.newInstance();
        }
        catch (InstantiationException e)
        {
            throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
        }
        catch (IllegalAccessException e)
        {
            throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
        }
    }
}