diff --git a/config/alfresco/content-services-context.xml b/config/alfresco/content-services-context.xml
index 02b5fe1441..2ba7559960 100644
--- a/config/alfresco/content-services-context.xml
+++ b/config/alfresco/content-services-context.xml
@@ -504,6 +504,7 @@
+
diff --git a/config/alfresco/repository.properties b/config/alfresco/repository.properties
index 8f309363ee..2b7fdb73cb 100644
--- a/config/alfresco/repository.properties
+++ b/config/alfresco/repository.properties
@@ -245,6 +245,11 @@ lucene.write.lock.timeout=10000
lucene.commit.lock.timeout=100000
lucene.lock.poll.interval=100
+# When transforming archive files (.zip etc) into text representations (such as
+# for full text indexing), should the files within the archive be processed too?
+# If enabled, transformation takes longer, but searches can also match text inside the archived files.
+transformer.Archive.includeContents=false
+
# Database configuration
db.schema.stopAfterSchemaBootstrap=false
db.schema.update=true
diff --git a/config/alfresco/swf-transform-context.xml b/config/alfresco/swf-transform-context.xml
index 436e26ca44..a6d0e796be 100644
--- a/config/alfresco/swf-transform-context.xml
+++ b/config/alfresco/swf-transform-context.xml
@@ -74,6 +74,11 @@
application/pdf
+
+
+
diff --git a/source/java/org/alfresco/repo/content/transform/ArchiveContentTransformer.java b/source/java/org/alfresco/repo/content/transform/ArchiveContentTransformer.java
index 5396939664..2ce8b76b44 100644
--- a/source/java/org/alfresco/repo/content/transform/ArchiveContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/ArchiveContentTransformer.java
@@ -18,106 +18,96 @@
*/
package org.alfresco.repo.content.transform;
-import java.io.InputStream;
import java.util.ArrayList;
-import java.util.List;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;
-import org.alfresco.repo.content.MimetypeMap;
-import org.alfresco.service.cmr.repository.ContentReader;
-import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.pkg.PackageParser;
/**
- * This class transforms archive files (currently only ZIPs) to text, which enables indexing
- * and searching of archives as well as webpreviewing.
- * The transformation simply lists the names of the entries within the zip file and does not consider their content.
+ * This class transforms archive files (zip, tar etc) to text, which enables indexing
+ * and searching of archives as well as webpreviewing.
+ * The transformation can simply list the names of the entries within the archive, or
+ * it can also include the textual content of the entries themselves.
+ * The former is suggested for web preview, the latter for indexing.
+ * This behaviour is controlled by the includeContents property, which the
+ * includeEmbedded transformation option overrides whenever it is set.
*
* @author Neil McErlean
- * @since Swift
+ * @author Nick Burch
+ * @since 3.4
*/
-public class ArchiveContentTransformer extends AbstractContentTransformer2
-{
+public class ArchiveContentTransformer extends TikaPoweredContentTransformer
+{
/**
* The logger
*/
private static Log logger = LogFactory.getLog(ArchiveContentTransformer.class);
- /**
- * Currently the only transformation performed is that of text extraction from PDF documents.
+ private boolean includeContents = false;
+    /**
+     * Sets whether the text of the files inside an archive is included in
+     * the output, in addition to the names of the entries.
+     * <p>
+     * The value is accepted as a String rather than a boolean so that an
+     * unresolved Spring placeholder (e.g. <code>${foo}</code>) simply
+     * leaves the feature switched off.
+     *
+     * @param includeContents "true", "t", "yes" or "y" (case-insensitive,
+     *        surrounding whitespace ignored) to enable; any other value,
+     *        including <code>null</code>, disables
+     */
+    public void setIncludeContents(String includeContents)
+    {
+        // Spring really ought to be able to handle
+        //  setting a boolean that might still be
+        //  ${foo} (i.e. not overridden in a property).
+        // As we can't do that with spring, we do it...
+        this.includeContents = false;
+        if(includeContents == null)
+        {
+            return;
+        }
+
+        // Values loaded from .properties files keep their trailing
+        //  whitespace, so "true " must still count as true
+        String value = includeContents.trim();
+        if(value.equalsIgnoreCase("true") ||
+           value.equalsIgnoreCase("t") ||
+           value.equalsIgnoreCase("yes") ||
+           value.equalsIgnoreCase("y"))
+        {
+            this.includeContents = true;
+        }
+    }
+
+ /**
+ * We support all the archive mimetypes that the Tika
+ * package parser can handle
*/
- public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options)
- {
- // TODO: Expand to other archive types e.g. tar.
- if (!MimetypeMap.MIMETYPE_ZIP.equals(sourceMimetype) ||
- !MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
- {
- // Currently only support ZIP -> Text
- return false;
- }
- else
- {
- return true;
- }
+    public static ArrayList<String> SUPPORTED_MIMETYPES;
+    static {
+        // Ask the Tika package parser which media types it handles, and
+        //  record each one in its string form
+        SUPPORTED_MIMETYPES = new ArrayList<String>();
+        Parser p = new PackageParser();
+        // Pass an empty context rather than null, so a parser that
+        //  consults the context cannot trip over a null reference
+        for(MediaType mt : p.getSupportedTypes(new ParseContext())) {
+            SUPPORTED_MIMETYPES.add( mt.toString() );
+        }
+    }
+
+ /**
+  * Creates the transformer, handing the list of mimetypes collected in
+  * {@link #SUPPORTED_MIMETYPES} to the superclass constructor.
+  */
+ public ArchiveContentTransformer() {
+ super(SUPPORTED_MIMETYPES);
+ }
+
+ /**
+  * Supplies the Tika parser used for the transformation.
+  *
+  * @return a new {@link PackageParser} instance on every call
+  */
+ @Override
+ protected Parser getParser() {
+ return new PackageParser();
}
- protected void transformInternal(
- ContentReader reader,
- ContentWriter writer,
- TransformationOptions options) throws Exception
- {
- InputStream is = null;
- try
- {
- is = reader.getContentInputStream();
-
- List zipEntryNames = new ArrayList();
- ZipInputStream zin = new ZipInputStream(is);
-
- // Enumerate each entry
- ZipEntry nextZipEntry = null;
- while ((nextZipEntry = zin.getNextEntry()) != null)
- {
- String entryName = nextZipEntry.getName();
- zipEntryNames.add(entryName);
-
- // Currently we do not recurse into 'zips within zips'.
- }
-
- if (logger.isDebugEnabled())
- {
- StringBuilder msg = new StringBuilder();
- msg.append("Transformed ")
- .append(zipEntryNames.size())
- .append(zipEntryNames.size() == 1 ? " zip entry" : " zip entries");
- logger.debug(msg.toString());
- }
-
- String text = createTextContentFrom(zipEntryNames);
-
- // dump it all to the writer
- writer.putContent(text);
- }
- finally
- {
- if (is != null)
- {
- try { is.close(); } catch (Throwable e) {e.printStackTrace(); }
- }
- }
- }
-
- private String createTextContentFrom(List zipEntryNames)
- {
- StringBuilder result = new StringBuilder();
- for (String entryName : zipEntryNames)
- {
- result.append(entryName)
- .append('\n');
- }
- return result.toString();
+    /**
+     * Builds the Tika parse context, deciding whether the files inside
+     * the archive should be parsed as well as listed.
+     * <p>
+     * The bean-level includeContents setting is the default; when the
+     * supplied options carry a non-null includeEmbedded value, that value
+     * takes precedence.
+     *
+     * @param metadata the Tika metadata, passed through to the superclass
+     * @param targetMimeType the target mimetype, passed through to the superclass
+     * @param options the transformation options; may be <code>null</code>,
+     *        in which case the bean-level setting is used
+     * @return the context from the superclass, with an {@link AutoDetectParser}
+     *         registered under {@link Parser} when recursion is wanted
+     */
+    @Override
+    protected ParseContext buildParseContext(Metadata metadata,
+            String targetMimeType, TransformationOptions options) {
+        ParseContext context = super.buildParseContext(metadata, targetMimeType, options);
+
+        // Start from the bean default, let a per-call option override it
+        Boolean includeEmbedded = (options == null) ? null : options.getIncludeEmbedded();
+        boolean recurse = includeContents;
+        if(includeEmbedded != null)
+        {
+            recurse = includeEmbedded;
+        }
+
+        if(logger.isDebugEnabled())
+        {
+            logger.debug(
+                    "Archive transformation: includeContents=" + includeContents +
+                    ", includeEmbedded=" + includeEmbedded +
+                    ", recurse=" + recurse);
+        }
+
+        if(recurse)
+        {
+            // Registering a parser in the context is what enables the
+            //  entries of the archive to be parsed too
+            context.set(Parser.class, new AutoDetectParser());
+        }
+
+        return context;
     }
}
diff --git a/source/java/org/alfresco/repo/content/transform/ArchiveContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/ArchiveContentTransformerTest.java
index 0bfc128a3b..765950ca0a 100644
--- a/source/java/org/alfresco/repo/content/transform/ArchiveContentTransformerTest.java
+++ b/source/java/org/alfresco/repo/content/transform/ArchiveContentTransformerTest.java
@@ -18,7 +18,14 @@
*/
package org.alfresco.repo.content.transform;
+import java.io.File;
+import java.io.IOException;
+
import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.repo.content.filestore.FileContentReader;
+import org.alfresco.repo.content.filestore.FileContentWriter;
+import org.alfresco.service.cmr.repository.ContentReader;
+import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
/**
@@ -30,7 +37,7 @@ import org.alfresco.service.cmr.repository.TransformationOptions;
*/
public class ArchiveContentTransformerTest extends AbstractContentTransformerTest
{
- private ContentTransformer transformer;
+ private ArchiveContentTransformer transformer;
@Override
public void setUp() throws Exception
@@ -48,19 +55,124 @@ public class ArchiveContentTransformerTest extends AbstractContentTransformerTes
public void testIsTransformable() throws Exception
{
+ // Zip, tar and gtar sources must all be reported as transformable to plain text
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_ZIP, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
+ assertTrue(transformer.isTransformable("application/x-tar", MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
+ assertTrue(transformer.isTransformable("application/x-gtar", MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
}
@Override
- protected boolean isQuickPhraseExpected(String targetMimetype)
- {
- // The Zip transformer produces names of the entries, not their contents.
- return false;
- }
+ protected boolean isQuickPhraseExpected(String targetMimetype)
+ {
+ // The archive transformer is expected to emit only the names of the entries
+ // here, not their contents.
+ // NOTE(review): assumes includeContents is left at its default (false) by setUp - confirm
+ return false;
+ }
@Override
- protected boolean isQuickWordsExpected(String targetMimetype)
+ protected boolean isQuickWordsExpected(String targetMimetype)
{
- // The Zip transformer produces names of the entries, not their contents.
- return false;
+ // The archive transformer is expected to emit only the names of the entries
+ // here, not their contents.
+ // NOTE(review): assumes includeContents is left at its default (false) by setUp - confirm
+ return false;
+ }
+
+    /**
+     * Checks how the bean-level includeContents setting and the per-call
+     * includeEmbedded option combine: with no options, or with the option
+     * left unset, the bean setting decides; once the option is set
+     * explicitly (on or off) it wins over the bean.
+     */
+    public void testRecursing() throws Exception
+    {
+        // No options passed at all - the bean setting decides
+        assertRecursion("FALSE", null, false);
+        assertRecursion("TRUE", null, true);
+
+        // Options passed, includeEmbedded left unset - the bean setting still decides
+        TransformationOptions options = new TransformationOptions();
+        assertRecursion("FALSE", options, false);
+        assertRecursion("TRUE", options, true);
+
+        // includeEmbedded explicitly on - options win over the bean
+        options.setIncludeEmbedded(true);
+        assertRecursion("FALSE", options, true);
+        assertRecursion("TRUE", options, true);
+
+        // includeEmbedded explicitly off - options win over the bean
+        options.setIncludeEmbedded(false);
+        assertRecursion("FALSE", options, false);
+        assertRecursion("TRUE", options, false);
+    }
+
+    /**
+     * Transforms the quick zip file once and checks the outcome.
+     *
+     * @param beanSetting value handed to setIncludeContents before transforming
+     * @param options options for the transformation, or <code>null</code>
+     *        to use the overload that takes no options
+     * @param expectRecursion whether the text of the nested files should be present
+     */
+    private void assertRecursion(String beanSetting, TransformationOptions options,
+            boolean expectRecursion) throws Exception
+    {
+        transformer.setIncludeContents(beanSetting);
+
+        ContentWriter writer = getTestWriter();
+        if(options == null)
+        {
+            transformer.transform(getTestReader(), writer);
+        }
+        else
+        {
+            transformer.transform(getTestReader(), writer, options);
+        }
+
+        String contents = writer.getReader().getContentString();
+        testHasFiles(contents);
+        testNested(contents, expectRecursion);
+    }
+    /**
+     * Opens the quick test zip file as a reader with the zip mimetype set.
+     */
+    private ContentReader getTestReader() throws IOException {
+        File quickZip = loadQuickTestFile("zip");
+        ContentReader zipReader = new FileContentReader(quickZip);
+        zipReader.setMimetype(MimetypeMap.MIMETYPE_ZIP);
+        return zipReader;
+    }
+    /**
+     * Creates a writer onto a fresh temporary text file, with the plain
+     * text mimetype set.
+     */
+    private ContentWriter getTestWriter() throws IOException {
+        File tempFile = File.createTempFile("test", ".txt");
+        // A new file is created for every transformation in the test, so
+        //  make sure they do not pile up in the temp directory
+        tempFile.deleteOnExit();
+
+        ContentWriter writer = new FileContentWriter(tempFile);
+        writer.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
+        return writer;
+    }
+    /**
+     * Asserts that the name of each file in the test archive appears in
+     * the transformed text.
+     */
+    private void testHasFiles(String contents)
+    {
+        String[] expectedEntries = { "quick.txt", "quick.doc", "subfolder/quick.jpg" };
+        for(String entryName : expectedEntries)
+        {
+            assertTrue("Files not found in " + contents, contents.contains(entryName));
+        }
+    }
+    /**
+     * Asserts that the text from inside the archived files is present in
+     * the transformed text exactly when recursion was expected.
+     */
+    private void testNested(String contents, boolean shouldHaveRecursed)
+    {
+        String failureMessage =
+                "Recursion was " + shouldHaveRecursed +
+                " but content was " + contents;
+        String[] nestedPhrases = {
+                "The quick brown fox jumps over the lazy dog",
+                "Le renard brun rapide saute par-dessus le chien paresseux"
+        };
+        for(String phrase : nestedPhrases)
+        {
+            assertEquals(failureMessage, shouldHaveRecursed, contents.contains(phrase));
+        }
     }
}
diff --git a/source/java/org/alfresco/repo/content/transform/ComplexContentTransformer.java b/source/java/org/alfresco/repo/content/transform/ComplexContentTransformer.java
index a9ed83523a..e44e33be45 100644
--- a/source/java/org/alfresco/repo/content/transform/ComplexContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/ComplexContentTransformer.java
@@ -19,8 +19,13 @@
package org.alfresco.repo.content.transform;
import java.io.File;
+import java.io.Serializable;
+import java.lang.reflect.InvocationTargetException;
import java.util.Iterator;
import java.util.List;
+import java.util.Map;
+
+import javax.faces.el.MethodNotFoundException;
import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.repo.content.filestore.FileContentWriter;
@@ -28,6 +33,10 @@ import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.alfresco.util.TempFileProvider;
+import org.apache.commons.beanutils.PropertyUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.springframework.beans.BeanUtils;
import org.springframework.beans.factory.InitializingBean;
/**
@@ -38,8 +47,14 @@ import org.springframework.beans.factory.InitializingBean;
*/
public class ComplexContentTransformer extends AbstractContentTransformer2 implements InitializingBean
{
+ /**
+ * The logger
+ */
+ private static Log logger = LogFactory.getLog(ComplexContentTransformer.class);
+
private List transformers;
private List intermediateMimetypes;
+ private Map transformationOptionOverrides;
public ComplexContentTransformer()
{
@@ -74,6 +89,20 @@ public class ComplexContentTransformer extends AbstractContentTransformer2 imple
}
/**
+ * Sets any properties to be set on the TransformationOption as passed in.
+ * This allows you to force certain properties to always be set on it,
+ * to control the transformers in a different way to their default.
+ * Note that only properties that are writeable on the passed-in
+ * {@link TransformationOptions} are changed; for any other key a
+ * warning is logged and the key is skipped.
+ * @param transformationOptionOverrides map from the property name on the
+ * options object to the value that should be forced onto it
+ */
+ public void setTransformationOptionOverrides(
+ Map transformationOptionOverrides)
+ {
+ this.transformationOptionOverrides = transformationOptionOverrides;
+ }
+
+ /**
* Ensures that required properties have been set
*/
public void afterPropertiesSet() throws Exception
@@ -103,6 +132,29 @@ public class ComplexContentTransformer extends AbstractContentTransformer2 imple
boolean result = true;
String currentSourceMimetype = sourceMimetype;
+ // Set any transformation options overrides if we can
+ if(options != null && transformationOptionOverrides != null)
+ {
+ for(String key : transformationOptionOverrides.keySet())
+ {
+ if(PropertyUtils.isWriteable(options, key))
+ {
+ try
+ {
+ PropertyUtils.setProperty(options, key, transformationOptionOverrides.get(key));
+ }
+ catch(MethodNotFoundException mnfe) {}
+ catch(NoSuchMethodException nsme) {}
+ catch(InvocationTargetException ite) {}
+ catch(IllegalAccessException iae) {}
+ }
+ else
+ {
+ logger.warn("Unable to set override Transformation Option " + key + " on " + options);
+ }
+ }
+ }
+
Iterator transformerIterator = transformers.iterator();
Iterator intermediateMimetypeIterator = intermediateMimetypes.iterator();
while (transformerIterator.hasNext())