diff --git a/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.java
new file mode 100644
index 0000000000..6e4447d6c9
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.java
@@ -0,0 +1,116 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.metadata;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+
+import org.alfresco.error.AlfrescoRuntimeException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+
+/**
+ * A Metadata Extractor which makes use of Apache Tika,
+ * and allows the selection of the Tika parser to be
+ * sprung-in to extract the metadata from your document.
+ * This is typically used with custom Tika Parsers.
+ *
+ * <pre>
+ *   author:    --  cm:author
+ *   title:     --  cm:title
+ *   subject:   --  cm:description
+ *   created:   --  cm:created
+ *   comments:
+ *   geo:lat:   --  cm:latitude
+ *   geo:long:  --  cm:longitude
+ * </pre>
+ *
+ * @author Nick Burch
+ */
+public class TikaSpringConfiguredMetadataExtracter extends TikaPoweredMetadataExtracter
+{
+    protected static Log logger = LogFactory.getLog(TikaSpringConfiguredMetadataExtracter.class);
+
+    private Parser tikaParser;
+    private String tikaParserClassName;
+    private Class<? extends Parser> tikaParserClass;
+
+    /**
+     * Injects the name of the Tika parser class to use. The class is loaded
+     * straight away, and the parser from {@link #getParser()} is registered.
+     * @param className the name of the {@link Parser} class, as taken by Class.forName
+     * @throws AlfrescoRuntimeException if the class cannot be found
+     */
+    @SuppressWarnings("unchecked")
+    public void setTikaParserName(String className)
+    {
+        tikaParserClassName = className;
+        try {
+            tikaParserClass = (Class<? extends Parser>)Class.forName(tikaParserClassName);
+            setTikaParser(getParser());
+        } catch(ClassNotFoundException e) {
+            throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' not found", e);
+        }
+    }
+
+    /**
+     * Injects the Tika parser to use, and hands the mimetypes it reports
+     * as supported on to the parent.
+     * @param tikaParser the parser to use
+     */
+    public void setTikaParser(Parser tikaParser)
+    {
+        this.tikaParser = tikaParser;
+        ArrayList<String> mimetypes = new ArrayList<String>();
+        for(MediaType mt : tikaParser.getSupportedTypes(new ParseContext()))
+        {
+            mimetypes.add( mt.toString() );
+        }
+        super.setSupportedMimetypes(mimetypes);
+    }
+
+    /** Passes an empty mimetype set to the parent; setTikaParser supplies the real ones. */
+    public TikaSpringConfiguredMetadataExtracter()
+    {
+        super(new HashSet<String>());
+    }
+
+    /**
+     * Returns the injected Tika parser if there is one, otherwise a new
+     * instance of the configured parser class.
+     * @throws AlfrescoRuntimeException if nothing is configured, or creation fails
+     */
+    protected Parser getParser()
+    {
+        if(tikaParser != null)
+            return tikaParser;
+        if(tikaParserClass == null)
+            throw new AlfrescoRuntimeException("No Tika Parser or Parser class name has been set");
+        try {
+            return tikaParserClass.newInstance();
+        } catch (InstantiationException e) {
+            throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
+        } catch (IllegalAccessException e) {
+            throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
+        }
+    }
+}
diff --git a/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.properties b/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.properties
new file mode 100644
index 0000000000..5a323c946f
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/metadata/TikaSpringConfiguredMetadataExtracter.properties
@@ -0,0 +1,20 @@
+#
+# TikaSpringConfiguredMetadataExtracter.properties - default mapping
+#
+# This is used to map from the Tika and standard namespaces
+# onto your content model. This is used for custom tika parsers,
+# but one file is used across all custom parsers.
+#
+# author: Nick Burch
+
+# Namespaces
+namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
+
+# Mappings
+author=cm:author
+title=cm:title
+description=cm:description
+created=cm:created
+
+geo\:lat=cm:latitude
+geo\:long=cm:longitude
diff --git a/source/java/org/alfresco/repo/content/transform/TikaPoweredContainerExtractor.java b/source/java/org/alfresco/repo/content/transform/TikaPoweredContainerExtractor.java
new file mode 100644
index 0000000000..9ad65aa033
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/transform/TikaPoweredContainerExtractor.java
@@ -0,0 +1,300 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.transform;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.StringTokenizer;
+
+import org.alfresco.error.AlfrescoRuntimeException;
+import org.alfresco.model.ContentModel;
+import org.alfresco.repo.action.ParameterDefinitionImpl;
+import org.alfresco.repo.action.executer.ActionExecuterAbstractBase;
+import org.alfresco.service.cmr.action.Action;
+import org.alfresco.service.cmr.action.ParameterDefinition;
+import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
+import org.alfresco.service.cmr.repository.ContentReader;
+import org.alfresco.service.cmr.repository.ContentService;
+import org.alfresco.service.cmr.repository.ContentWriter;
+import org.alfresco.service.cmr.repository.NodeRef;
+import org.alfresco.service.cmr.repository.NodeService;
+import org.alfresco.service.namespace.QName;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.ContainerAwareDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedResourceHandler;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+
+/**
+ * Warning - this is a prototype service, and will likely change dramatically
+ * in Alfresco 4.0!
+ *
+ * This proto-service provides a way to have Apache Tika extract out
+ * certain kinds of embedded resources from within a container file.
+ *
+ * One use might be to extract all the images in a zip file, another might
+ * be to fetch all the Word Documents embedded in an Excel Spreadsheet.
+ *
+ * Uses the Apache Tika ContainerExtractor framework, along with the
+ * Apache Tika Auto-Parser.
+ *
+ * Not sprung-in by default, you will need to manually list this in
+ * an extension context file.
+ *
+ * @author Nick Burch
+ */
+public class TikaPoweredContainerExtractor
+{
+    private static final Log logger = LogFactory.getLog(TikaPoweredContainerExtractor.class);
+
+    private NodeService nodeService;
+    private ContentService contentService;
+
+    private AutoDetectParser parser;
+    private Detector detector;
+
+    /** Sets up the container aware detector, and the auto-detecting parser that uses it. */
+    public TikaPoweredContainerExtractor()
+    {
+        TikaConfig config = TikaConfig.getDefaultConfig();
+        detector = new ContainerAwareDetector(
+                config.getMimeRepository()
+        );
+        parser = new AutoDetectParser(detector);
+    }
+
+    /**
+     * Injects the nodeService bean.
+     *
+     * @param nodeService the nodeService.
+     */
+    public void setNodeService(NodeService nodeService)
+    {
+        this.nodeService = nodeService;
+    }
+
+    /**
+     * Injects the contentService bean.
+     *
+     * @param contentService the contentService.
+     */
+    public void setContentService(ContentService contentService)
+    {
+        this.contentService = contentService;
+    }
+
+    /**
+     * Extracts out all the entries from the container
+     * that match the supplied list of mime types.
+     * If no mime types are specified, extracts all
+     * available embedded resources.
+     *
+     * @param source the node holding the container file
+     * @param mimetypes the mime types to extract; null or empty for all
+     * @return the nodes created; empty if the source has no content
+     * @throws AlfrescoRuntimeException if Tika fails to process the container
+     */
+    public List<NodeRef> extract(NodeRef source, List<String> mimetypes)
+    {
+        // The new nodes go into the primary parent folder of the source
+        NodeRef folder = nodeService.getPrimaryParent(source).getParentRef();
+
+        // Get the contents; no reader means that there is nothing to look through
+        ContentReader reader = contentService.getReader(source, ContentModel.PROP_CONTENT);
+        if(reader == null)
+        {
+            logger.info("Nothing to extract, no content for " + source.toString());
+            return new ArrayList<NodeRef>();
+        }
+        TikaInputStream stream = TikaInputStream.get(reader.getContentInputStream());
+
+        // Build the recursing parser
+        Extractor handler = new Extractor(folder, mimetypes);
+
+        // Have Tika look for things
+        ParserContainerExtractor extractor = new ParserContainerExtractor(
+                parser, detector
+        );
+        try {
+            logger.info("Beginning extraction of " + source.toString());
+            extractor.extract(stream, null, handler);
+            logger.info("Completed extraction of " + source.toString());
+        } catch(TikaException te) {
+            throw new AlfrescoRuntimeException("Extraction Failed", te);
+        } catch(IOException ie) {
+            throw new AlfrescoRuntimeException("Extraction Failed", ie);
+        } finally {
+            // Tidy up, whether or not the extraction worked
+            try {
+                stream.close();
+            } catch(IOException e) {
+                // Best effort only, a failed close must not hide the real outcome
+                logger.debug("Unable to close the content stream of " + source.toString(), e);
+            }
+        }
+        // All done
+        return handler.extracted;
+    }
+
+    /**
+     * This EmbeddedResourceHandler is called by Tika for each
+     * embedded resource. It decides if the resource is to
+     * be extracted or not, and if it is, saves it into the
+     * specified folder.
+     */
+    private class Extractor implements EmbeddedResourceHandler
+    {
+        private List<NodeRef> extracted;
+        /** The types to extract, or null to extract everything */
+        private Set<MediaType> acceptTypes;
+        private NodeRef folder;
+        private int anonymousCount = 0;
+
+        private Extractor(NodeRef folder, List<String> types)
+        {
+            this.folder = folder;
+            this.extracted = new ArrayList<NodeRef>();
+
+            if(types != null && types.size() > 0)
+            {
+                acceptTypes = new HashSet<MediaType>();
+                for(String type : types)
+                {
+                    acceptTypes.add(MediaType.parse(type));
+                }
+            }
+        }
+
+        @Override
+        public void handle(String filename, MediaType mediaType,
+                InputStream stream) {
+            // Do we want it?
+            if(acceptTypes == null || acceptTypes.contains(mediaType))
+            {
+                // Ensure we have a filename
+                if(filename == null)
+                {
+                    anonymousCount++;
+                    filename = "embedded"+anonymousCount+"."+mediaType.getSubtype();
+                }
+
+                logger.info("Extracting embedded " + mediaType + " entry " + filename);
+
+                // Save it
+                Map<QName,Serializable> properties = new HashMap<QName,Serializable>();
+                properties.put(ContentModel.PROP_NAME, filename);
+                NodeRef node = nodeService.createNode(
+                        folder,
+                        ContentModel.ASSOC_CONTAINS,
+                        QName.createQName(filename),
+                        ContentModel.TYPE_CONTENT,
+                        properties
+                ).getChildRef();
+
+                ContentWriter writer = contentService.getWriter(
+                        node, ContentModel.PROP_CONTENT, true
+                );
+                writer.setMimetype(mediaType.toString());
+                writer.putContent(stream);
+
+                // Record it, so that extract() can report what was created
+                extracted.add(node);
+            }
+            else
+            {
+                logger.info("Skipping embedded " + mediaType + " entry " + filename);
+            }
+        }
+    }
+
+    /**
+     * This action executor allows you to trigger extraction as an
+     * action, perhaps from a rule.
+     *
+     * Not sprung-in by default, you will need to manually list this in
+     * an extension context file. You will also need to add properties
+     * files entries.
+     */
+    public static class ExtractorActionExecutor extends ActionExecuterAbstractBase
+    {
+        public static final String NAME = "extractEmbeddedResources";
+        public static final String PARAM_MIME_TYPES = "mime-types";
+
+        private TikaPoweredContainerExtractor extractor;
+        public void setTikaPoweredContainerExtractor(TikaPoweredContainerExtractor extractor)
+        {
+            this.extractor = extractor;
+        }
+
+        /** Declares the text parameter that carries the mime types. */
+        @Override
+        protected void addParameterDefinitions(List<ParameterDefinition> paramList) {
+            paramList.add(new ParameterDefinitionImpl(
+                    PARAM_MIME_TYPES,
+                    DataTypeDefinition.TEXT,
+                    false,
+                    getParamDisplayLabel(PARAM_MIME_TYPES)
+            ));
+        }
+
+        /** Splits the comma separated mime types, if given, then runs the extraction. */
+        @Override
+        protected void executeImpl(Action action, NodeRef actionedUponNodeRef) {
+            List<String> mimeTypes = null;
+            String rawTypes = (String)action.getParameterValue(PARAM_MIME_TYPES);
+            if(rawTypes != null && rawTypes.length() > 0)
+            {
+                mimeTypes = new ArrayList<String>();
+                StringTokenizer st = new StringTokenizer(rawTypes, ",");
+                while(st.hasMoreTokens())
+                {
+                    mimeTypes.add( st.nextToken().trim() );
+                }
+            }
+
+            extractor.extract(actionedUponNodeRef, mimeTypes);
+        }
+    }
+/*
+ Example extension context file entries:
+  - a bean of class TikaPoweredContainerExtractor, with its nodeService
+    and contentService properties set
+  - a bean of class TikaPoweredContainerExtractor$ExtractorActionExecutor,
+    with its tikaPoweredContainerExtractor property set to the first bean
+ */
+/*
+ Example properties file entries:
+extractEmbeddedResources.title=Extract embedded resources
+extractEmbeddedResources.description=Extract resources from within container files, such as .zip or .docx
+extractEmbeddedResources.param_mime-types.display-label=Mime Types
+ */
+}
diff --git a/source/java/org/alfresco/repo/content/transform/TikaSpringConfiguredContentTransformer.java b/source/java/org/alfresco/repo/content/transform/TikaSpringConfiguredContentTransformer.java
new file mode 100644
index 0000000000..bace2fde1e
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/transform/TikaSpringConfiguredContentTransformer.java
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.transform;
+
+import java.util.ArrayList;
+
+import org.alfresco.error.AlfrescoRuntimeException;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+
+/**
+ * A Content Extractor for XML, HTML and Text, which makes
+ * use of Apache Tika, and allows the selection of the
+ * Tika parser to be sprung-in.
+ * Using spring, you list the Tika parser to use, which
+ * may well not be a standard Tika one. You should specify
+ * either a spring created bean, or a parser class name.
+ *
+ * @author Nick Burch
+ */
+public class TikaSpringConfiguredContentTransformer extends TikaPoweredContentTransformer
+{
+    private Parser tikaParser;
+    private String tikaParserClassName;
+    private Class<? extends Parser> tikaParserClass;
+
+    /**
+     * Injects the name of the Tika parser class to use. The class is loaded
+     * straight away, and the parser from {@link #getParser()} is registered.
+     * @param className the name of the {@link Parser} class, as taken by Class.forName
+     * @throws AlfrescoRuntimeException if the class cannot be found
+     */
+    @SuppressWarnings("unchecked")
+    public void setTikaParserName(String className)
+    {
+        tikaParserClassName = className;
+        try {
+            tikaParserClass = (Class<? extends Parser>)Class.forName(tikaParserClassName);
+            setTikaParser(getParser());
+        } catch(ClassNotFoundException e) {
+            throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' not found", e);
+        }
+    }
+
+    /**
+     * Injects the Tika parser to use, and adds every type it reports as
+     * supported to the source mimetype list held by the parent.
+     * @param tikaParser the parser to use
+     */
+    public void setTikaParser(Parser tikaParser)
+    {
+        this.tikaParser = tikaParser;
+        for(MediaType mt : tikaParser.getSupportedTypes(new ParseContext()))
+        {
+            super.sourceMimeTypes.add( mt.toString() );
+        }
+    }
+
+    /** Passes an empty list to the parent; setTikaParser adds the source mimetypes later. */
+    public TikaSpringConfiguredContentTransformer() {
+        super(new ArrayList<String>());
+    }
+
+    /**
+     * Returns the injected Tika parser if there is one, otherwise a new
+     * instance of the configured parser class.
+     * @throws AlfrescoRuntimeException if nothing is configured, or creation fails
+     */
+    protected Parser getParser()
+    {
+        if(tikaParser != null)
+            return tikaParser;
+        if(tikaParserClass == null)
+            throw new AlfrescoRuntimeException("No Tika Parser or Parser class name has been set");
+        try {
+            return tikaParserClass.newInstance();
+        } catch (InstantiationException e) {
+            throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
+        } catch (IllegalAccessException e) {
+            throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
+        }
+    }
+}