mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Add spring-configurable Tika-powered metadata extractor, content transformer and extractor
git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@22683 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -0,0 +1,116 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
|
||||
import org.alfresco.error.AlfrescoRuntimeException;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
|
||||
/**
|
||||
* A Metadata Extractor which makes use of Apache Tika,
|
||||
* and allows the selection of the Tika parser to be
|
||||
* sprung-in to extract the metadata from your document.
|
||||
* This is typically used with custom Tika Parsers.
|
||||
|
||||
* <pre>
|
||||
* <b>author:</b> -- cm:author
|
||||
* <b>title:</b> -- cm:title
|
||||
* <b>description:</b> -- cm:description
|
||||
* <b>created:</b> -- cm:created
|
||||
* <b>comments:</b>
|
||||
* <b>geo:lat:</b> -- cm:latitude
|
||||
* <b>geo:long:</b> -- cm:longitude
|
||||
* </pre>
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public class TikaSpringConfiguredMetadataExtracter extends TikaPoweredMetadataExtracter
|
||||
{
|
||||
protected static Log logger = LogFactory.getLog(TikaSpringConfiguredMetadataExtracter.class);
|
||||
|
||||
private Parser tikaParser;
|
||||
private String tikaParserClassName;
|
||||
private Class<? extends Parser> tikaParserClass;
|
||||
|
||||
/**
|
||||
* Injects the name of the Tika parser to use
|
||||
* @param className
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
public void setTikaParserName(String className)
|
||||
{
|
||||
tikaParserClassName = className;
|
||||
|
||||
// Load the class
|
||||
try {
|
||||
tikaParserClass = (Class<? extends Parser>)Class.forName(tikaParserClassName);
|
||||
setTikaParser(getParser());
|
||||
} catch(ClassNotFoundException e) {
|
||||
throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' not found");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Injects the Tika parser to use
|
||||
* @param parser
|
||||
*/
|
||||
public void setTikaParser(Parser tikaParser)
|
||||
{
|
||||
this.tikaParser = tikaParser;
|
||||
|
||||
// Build the mime types, updating the copy our parent
|
||||
// holds for us as we go along
|
||||
ArrayList<String> mimetypes = new ArrayList<String>();
|
||||
for(MediaType mt : tikaParser.getSupportedTypes(new ParseContext()))
|
||||
{
|
||||
mimetypes.add( mt.toString() );
|
||||
}
|
||||
super.setSupportedMimetypes(mimetypes);
|
||||
}
|
||||
|
||||
public TikaSpringConfiguredMetadataExtracter()
|
||||
{
|
||||
super(new HashSet<String>());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the Tika parser
|
||||
*/
|
||||
protected Parser getParser()
|
||||
{
|
||||
// If we were given a whole parser, return it
|
||||
if(tikaParser != null)
|
||||
return tikaParser;
|
||||
|
||||
// Otherwise create a new one
|
||||
try {
|
||||
return tikaParserClass.newInstance();
|
||||
} catch (InstantiationException e) {
|
||||
throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
|
||||
} catch (IllegalAccessException e) {
|
||||
throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,20 @@
|
||||
#
|
||||
# TikaSpringConfiguredMetadataExtracter.properties - default mapping
|
||||
#
|
||||
# This is used to map from the Tika and standard namespaces
|
||||
# onto your content model. This is used for custom tika parsers,
|
||||
# but one file is used across all custom parsers.
|
||||
#
|
||||
# author: Nick Burch
|
||||
|
||||
# Namespaces
|
||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
|
||||
# Mappings
|
||||
author=cm:author
|
||||
title=cm:title
|
||||
description=cm:description
|
||||
created=cm:created
|
||||
|
||||
geo\:lat=cm:latitude
|
||||
geo\:long=cm:longitude
|
@@ -0,0 +1,300 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
import org.alfresco.error.AlfrescoRuntimeException;
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.action.ParameterDefinitionImpl;
|
||||
import org.alfresco.repo.action.executer.ActionExecuterAbstractBase;
|
||||
import org.alfresco.service.cmr.action.Action;
|
||||
import org.alfresco.service.cmr.action.ParameterDefinition;
|
||||
import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentService;
|
||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||
import org.alfresco.service.cmr.repository.NodeRef;
|
||||
import org.alfresco.service.cmr.repository.NodeService;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.detect.ContainerAwareDetector;
|
||||
import org.apache.tika.detect.Detector;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.extractor.EmbeddedResourceHandler;
|
||||
import org.apache.tika.extractor.ParserContainerExtractor;
|
||||
import org.apache.tika.io.TikaInputStream;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
|
||||
/**
|
||||
* Warning - this is a prototype service, and will likely change dramatically
|
||||
* in Alfresco 4.0!
|
||||
*
|
||||
* This proto-service provides a way to have Apache Tika extract out
|
||||
* certain kinds of embedded resources from within a container file.
|
||||
*
|
||||
* One use might be to extract all the images in a zip file, another might
|
||||
* be to fetch all the Word Documents embedded in an Excel Spreadsheet.
|
||||
*
|
||||
* Uses the Apache Tika ContainerExtractor framework, along with the
|
||||
* Apache Tika Auto-Parser.
|
||||
*
|
||||
* Not sprung-in by default, you will need to manually list this in
|
||||
* an extension context file.
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public class TikaPoweredContainerExtractor
|
||||
{
|
||||
private static final Log logger = LogFactory.getLog(TikaPoweredContainerExtractor.class);
|
||||
|
||||
private NodeService nodeService;
|
||||
private ContentService contentService;
|
||||
|
||||
private AutoDetectParser parser;
|
||||
private Detector detector;
|
||||
|
||||
public TikaPoweredContainerExtractor()
|
||||
{
|
||||
TikaConfig config = TikaConfig.getDefaultConfig();
|
||||
detector = new ContainerAwareDetector(
|
||||
config.getMimeRepository()
|
||||
);
|
||||
parser = new AutoDetectParser(detector);
|
||||
}
|
||||
|
||||
/**
|
||||
* Injects the nodeService bean.
|
||||
*
|
||||
* @param nodeService the nodeService.
|
||||
*/
|
||||
public void setNodeService(NodeService nodeService)
|
||||
{
|
||||
this.nodeService = nodeService;
|
||||
}
|
||||
|
||||
/**
|
||||
* Injects the contentService bean.
|
||||
*
|
||||
* @param contentService the contentService.
|
||||
*/
|
||||
public void setContentService(ContentService contentService)
|
||||
{
|
||||
this.contentService = contentService;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts out all the entries from the container
|
||||
* that match the supplied list of mime types.
|
||||
* If no mime types are specified, extracts all
|
||||
* available embedded resources.
|
||||
*/
|
||||
public List<NodeRef> extract(NodeRef source, List<String> mimetypes)
|
||||
{
|
||||
// Grab the directory to put the nodes into
|
||||
// Will be the parent folder of the source
|
||||
NodeRef folder = nodeService.getPrimaryParent(source).getParentRef();
|
||||
|
||||
// Get the contents
|
||||
ContentReader reader = contentService.getReader(source, ContentModel.PROP_CONTENT);
|
||||
TikaInputStream stream = TikaInputStream.get(reader.getContentInputStream());
|
||||
|
||||
// Build the recursing parser
|
||||
Extractor handler = new Extractor(folder, mimetypes);
|
||||
|
||||
// Have Tika look for things
|
||||
ParserContainerExtractor extractor = new ParserContainerExtractor(
|
||||
parser, detector
|
||||
);
|
||||
try {
|
||||
logger.info("Beginning extraction of " + source.toString());
|
||||
extractor.extract(stream, null, handler);
|
||||
logger.info("Completed extraction of " + source.toString());
|
||||
} catch(TikaException te) {
|
||||
throw new AlfrescoRuntimeException("Extraction Failed", te);
|
||||
} catch(IOException ie) {
|
||||
throw new AlfrescoRuntimeException("Extraction Failed", ie);
|
||||
}
|
||||
|
||||
// Tidy up
|
||||
try {
|
||||
stream.close();
|
||||
} catch(IOException e) {}
|
||||
|
||||
// All done
|
||||
return handler.extracted;
|
||||
}
|
||||
|
||||
/**
|
||||
* This EmbeddedResourceHandler is called by Tika for each
|
||||
* embedded resource. It decides if the resource is to
|
||||
* be extracted or not, and if it is, saves it into the
|
||||
* specified folder.
|
||||
*/
|
||||
private class Extractor implements EmbeddedResourceHandler
|
||||
{
|
||||
private List<NodeRef> extracted;
|
||||
private Set<MediaType> acceptTypes;
|
||||
private NodeRef folder;
|
||||
private int anonymousCount = 0;
|
||||
|
||||
private Extractor(NodeRef folder, List<String> types)
|
||||
{
|
||||
this.folder = folder;
|
||||
this.extracted = new ArrayList<NodeRef>();
|
||||
|
||||
if(types != null && types.size() > 0)
|
||||
{
|
||||
acceptTypes = new HashSet<MediaType>();
|
||||
for(String type : types)
|
||||
{
|
||||
acceptTypes.add(MediaType.parse(type));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void handle(String filename, MediaType mediaType,
|
||||
InputStream stream) {
|
||||
// Do we want it?
|
||||
if(acceptTypes == null || acceptTypes.contains(mediaType))
|
||||
{
|
||||
// Ensure we have a filename
|
||||
if(filename == null)
|
||||
{
|
||||
anonymousCount++;
|
||||
filename = "embedded"+anonymousCount+"."+mediaType.getSubtype();
|
||||
}
|
||||
|
||||
logger.info("Extracting embedded " + mediaType + " entry " + filename);
|
||||
|
||||
// Save it
|
||||
Map<QName,Serializable> properties = new HashMap<QName,Serializable>();
|
||||
properties.put(ContentModel.PROP_NAME, filename);
|
||||
NodeRef node = nodeService.createNode(
|
||||
folder,
|
||||
ContentModel.ASSOC_CONTAINS,
|
||||
QName.createQName(filename),
|
||||
ContentModel.TYPE_CONTENT,
|
||||
properties
|
||||
).getChildRef();
|
||||
|
||||
ContentWriter writer = contentService.getWriter(
|
||||
node, ContentModel.PROP_CONTENT, true
|
||||
);
|
||||
writer.setMimetype(mediaType.toString());
|
||||
writer.putContent(stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
logger.info("Skipping embedded " + mediaType + " entry " + filename);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This action executor allows you to trigger extraction as an
|
||||
* action, perhaps from a rule.
|
||||
*
|
||||
* Not sprung-in by default, you will need to manually list this in
|
||||
* an extension context file. You will also need to add properties
|
||||
* files entries.
|
||||
*/
|
||||
public static class ExtractorActionExecutor extends ActionExecuterAbstractBase
|
||||
{
|
||||
public static final String NAME = "extractEmbeddedResources";
|
||||
public static final String PARAM_MIME_TYPES = "mime-types";
|
||||
|
||||
private TikaPoweredContainerExtractor extractor;
|
||||
public void setTikaPoweredContainerExtractor(TikaPoweredContainerExtractor extractor)
|
||||
{
|
||||
this.extractor = extractor;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void addParameterDefinitions(List<ParameterDefinition> paramList) {
|
||||
paramList.add(new ParameterDefinitionImpl(
|
||||
PARAM_MIME_TYPES,
|
||||
DataTypeDefinition.TEXT,
|
||||
false,
|
||||
getParamDisplayLabel(PARAM_MIME_TYPES)
|
||||
));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void executeImpl(Action action, NodeRef actionedUponNodeRef) {
|
||||
List<String> mimeTypes = null;
|
||||
String rawTypes = (String)action.getParameterValue(PARAM_MIME_TYPES);
|
||||
if(rawTypes != null && rawTypes.length() > 0)
|
||||
{
|
||||
mimeTypes = new ArrayList<String>();
|
||||
StringTokenizer st = new StringTokenizer(rawTypes, ",");
|
||||
while(st.hasMoreTokens())
|
||||
{
|
||||
mimeTypes.add( st.nextToken().trim() );
|
||||
}
|
||||
}
|
||||
|
||||
extractor.extract(actionedUponNodeRef, mimeTypes);
|
||||
}
|
||||
}
|
||||
/*
|
||||
<?xml version='1.0' encoding='UTF-8'?>
|
||||
<!DOCTYPE beans PUBLIC '-//SPRING//DTD BEAN//EN' 'http://www.springframework.org/dtd/spring-beans.dtd'>
|
||||
<beans>
|
||||
<bean id="tikaPoweredContainerExtractor" class="org.alfresco.repo.content.transform.TikaPoweredContainerExtractor">
|
||||
<property name="nodeService">
|
||||
<ref bean="NodeService" />
|
||||
</property>
|
||||
<property name="contentService">
|
||||
<ref bean="ContentService" />
|
||||
</property>
|
||||
</bean>
|
||||
<bean id="extractEmbeddedResources" class="org.alfresco.repo.content.transform.TikaPoweredContainerExtractor$ExtractorActionExecutor" parent="action-executer">
|
||||
<property name="tikaPoweredContainerExtractor">
|
||||
<ref bean="tikaPoweredContainerExtractor" />
|
||||
</property>
|
||||
</bean>
|
||||
<bean id="extractEmbeddedResources-action-messages" class="org.alfresco.i18n.ResourceBundleBootstrapComponent">
|
||||
<property name="resourceBundles">
|
||||
<list>
|
||||
<value>alfresco.extension.extractor-action-messages</value>
|
||||
</list>
|
||||
</property>
|
||||
</bean>
|
||||
</beans>
|
||||
*/
|
||||
/*
|
||||
extractEmbeddedResources.title=Extract embedded resources
|
||||
extractEmbeddedResources.description=Extract resources from within container files, such as .zip or .docx
|
||||
extractEmbeddedResources.param_mime-types.display-label=Mime Types
|
||||
*/
|
||||
}
|
@@ -0,0 +1,100 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.alfresco.error.AlfrescoRuntimeException;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
|
||||
/**
|
||||
* A Content Extractor for XML, HTML and Text, which makes
|
||||
* use of Apache Tika, and allows the selection of the
|
||||
* Tika parser to be sprung-in.
|
||||
* Using spring, you list the Tika parser to use, which
|
||||
* may well not be a standard Tika one. You should specify
|
||||
* either a spring created bean, or a parser class name.
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public class TikaSpringConfiguredContentTransformer extends TikaPoweredContentTransformer
|
||||
{
|
||||
private Parser tikaParser;
|
||||
private String tikaParserClassName;
|
||||
private Class<? extends Parser> tikaParserClass;
|
||||
|
||||
/**
|
||||
* Injects the name of the Tika parser to use
|
||||
* @param className
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
public void setTikaParserName(String className)
|
||||
{
|
||||
tikaParserClassName = className;
|
||||
|
||||
// Load the class
|
||||
try {
|
||||
tikaParserClass = (Class<? extends Parser>)Class.forName(tikaParserClassName);
|
||||
setTikaParser(getParser());
|
||||
} catch(ClassNotFoundException e) {
|
||||
throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' not found");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Injects the Tika parser to use
|
||||
* @param parser
|
||||
*/
|
||||
public void setTikaParser(Parser tikaParser)
|
||||
{
|
||||
this.tikaParser = tikaParser;
|
||||
|
||||
// Build the mime types, updating the copy our parent
|
||||
// holds for us as we go along
|
||||
for(MediaType mt : tikaParser.getSupportedTypes(new ParseContext()))
|
||||
{
|
||||
super.sourceMimeTypes.add( mt.toString() );
|
||||
}
|
||||
}
|
||||
|
||||
public TikaSpringConfiguredContentTransformer() {
|
||||
super(new ArrayList<String>());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the Tika parser
|
||||
*/
|
||||
protected Parser getParser()
|
||||
{
|
||||
// If we were given a whole parser, return it
|
||||
if(tikaParser != null)
|
||||
return tikaParser;
|
||||
|
||||
// Otherwise create a new one
|
||||
try {
|
||||
return tikaParserClass.newInstance();
|
||||
} catch (InstantiationException e) {
|
||||
throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
|
||||
} catch (IllegalAccessException e) {
|
||||
throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
|
||||
}
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user