diff --git a/config/alfresco/content-services-context.xml b/config/alfresco/content-services-context.xml index 02b5fe1441..2ba7559960 100644 --- a/config/alfresco/content-services-context.xml +++ b/config/alfresco/content-services-context.xml @@ -504,6 +504,7 @@ + diff --git a/config/alfresco/repository.properties b/config/alfresco/repository.properties index 8f309363ee..2b7fdb73cb 100644 --- a/config/alfresco/repository.properties +++ b/config/alfresco/repository.properties @@ -245,6 +245,11 @@ lucene.write.lock.timeout=10000 lucene.commit.lock.timeout=100000 lucene.lock.poll.interval=100 +# When transforming archive files (.zip etc) into text representations (such as +# for full text indexing), should the files within the archive be processed too? +# If enabled, transformation takes longer, but searches of the files find more. +transformer.Archive.includeContents=false + # Database configuration db.schema.stopAfterSchemaBootstrap=false db.schema.update=true diff --git a/config/alfresco/swf-transform-context.xml b/config/alfresco/swf-transform-context.xml index 436e26ca44..a6d0e796be 100644 --- a/config/alfresco/swf-transform-context.xml +++ b/config/alfresco/swf-transform-context.xml @@ -74,6 +74,11 @@ application/pdf + + + + + diff --git a/source/java/org/alfresco/repo/content/transform/ArchiveContentTransformer.java b/source/java/org/alfresco/repo/content/transform/ArchiveContentTransformer.java index 5396939664..2ce8b76b44 100644 --- a/source/java/org/alfresco/repo/content/transform/ArchiveContentTransformer.java +++ b/source/java/org/alfresco/repo/content/transform/ArchiveContentTransformer.java @@ -18,106 +18,96 @@ */ package org.alfresco.repo.content.transform; -import java.io.InputStream; import java.util.ArrayList; -import java.util.List; -import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; -import org.alfresco.repo.content.MimetypeMap; -import org.alfresco.service.cmr.repository.ContentReader; -import 
org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.TransformationOptions; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.pkg.PackageParser; /** - * This class transforms archive files (currently only ZIPs) to text, which enables indexing - * and searching of archives as well as webpreviewing. - * The transformation simply lists the names of the entries within the zip file and does not consider their content. + * This class transforms archive files (zip, tar etc) to text, which enables indexing + * and searching of archives as well as webpreviewing. + * The transformation can simply list the names of the entries within the archive, or + * it can also include the textual content of the entries themselves. + * The former is suggested for web preview, the latter for indexing. + * This behaviour is controlled by the includeContents property (overridden by TransformationOptions.includeEmbedded when that is set). * * @author Neil McErlean - * @since Swift + * @author Nick Burch + * @since 3.4 */ -public class ArchiveContentTransformer extends AbstractContentTransformer2 -{ +public class ArchiveContentTransformer extends TikaPoweredContentTransformer +{ /** * The logger */ private static Log logger = LogFactory.getLog(ArchiveContentTransformer.class); - /** - * Currently the only transformation performed is that of text extraction from PDF documents. + private boolean includeContents = false; + public void setIncludeContents(String includeContents) + { + // Spring really ought to be able to handle + // setting a boolean that might still be + // ${foo} (i.e. not overridden in a property). + // As we can't do that with spring, we do it... 
+ this.includeContents = false; + if(includeContents != null && includeContents.length() > 0) + { + if(includeContents.equalsIgnoreCase("true") || + includeContents.equalsIgnoreCase("t") || + includeContents.equalsIgnoreCase("yes") || + includeContents.equalsIgnoreCase("y")) + { + this.includeContents = true; + } + } + } + + /** + * We support all the archive mimetypes that the Tika + * package parser can handle */ - public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options) - { - // TODO: Expand to other archive types e.g. tar. - if (!MimetypeMap.MIMETYPE_ZIP.equals(sourceMimetype) || - !MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype)) - { - // Currently only support ZIP -> Text - return false; - } - else - { - return true; - } + public static ArrayList SUPPORTED_MIMETYPES; + static { + SUPPORTED_MIMETYPES = new ArrayList(); + Parser p = new PackageParser(); + for(MediaType mt : p.getSupportedTypes(null)) { + // Tika can probably do some useful text + SUPPORTED_MIMETYPES.add( mt.toString() ); + } + } + + public ArchiveContentTransformer() { + super(SUPPORTED_MIMETYPES); + } + + @Override + protected Parser getParser() { + return new PackageParser(); } - protected void transformInternal( - ContentReader reader, - ContentWriter writer, - TransformationOptions options) throws Exception - { - InputStream is = null; - try - { - is = reader.getContentInputStream(); - - List zipEntryNames = new ArrayList(); - ZipInputStream zin = new ZipInputStream(is); - - // Enumerate each entry - ZipEntry nextZipEntry = null; - while ((nextZipEntry = zin.getNextEntry()) != null) - { - String entryName = nextZipEntry.getName(); - zipEntryNames.add(entryName); - - // Currently we do not recurse into 'zips within zips'. - } - - if (logger.isDebugEnabled()) - { - StringBuilder msg = new StringBuilder(); - msg.append("Transformed ") - .append(zipEntryNames.size()) - .append(zipEntryNames.size() == 1 ? 
" zip entry" : " zip entries"); - logger.debug(msg.toString()); - } - - String text = createTextContentFrom(zipEntryNames); - - // dump it all to the writer - writer.putContent(text); - } - finally - { - if (is != null) - { - try { is.close(); } catch (Throwable e) {e.printStackTrace(); } - } - } - } - - private String createTextContentFrom(List zipEntryNames) - { - StringBuilder result = new StringBuilder(); - for (String entryName : zipEntryNames) - { - result.append(entryName) - .append('\n'); - } - return result.toString(); + @Override + protected ParseContext buildParseContext(Metadata metadata, + String targetMimeType, TransformationOptions options) { + ParseContext context = super.buildParseContext(metadata, targetMimeType, options); + + boolean recurse = includeContents; + if(options.getIncludeEmbedded() != null) + { + recurse = options.getIncludeEmbedded(); + } +System.err.println(includeContents + " " + recurse + " " + options.getIncludeEmbedded()); + if(recurse) + { + context.set(Parser.class, new AutoDetectParser()); + } + + return context; } } diff --git a/source/java/org/alfresco/repo/content/transform/ArchiveContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/ArchiveContentTransformerTest.java index 0bfc128a3b..765950ca0a 100644 --- a/source/java/org/alfresco/repo/content/transform/ArchiveContentTransformerTest.java +++ b/source/java/org/alfresco/repo/content/transform/ArchiveContentTransformerTest.java @@ -18,7 +18,14 @@ */ package org.alfresco.repo.content.transform; +import java.io.File; +import java.io.IOException; + import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.repo.content.filestore.FileContentReader; +import org.alfresco.repo.content.filestore.FileContentWriter; +import org.alfresco.service.cmr.repository.ContentReader; +import org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.TransformationOptions; /** @@ -30,7 +37,7 @@ import 
org.alfresco.service.cmr.repository.TransformationOptions; */ public class ArchiveContentTransformerTest extends AbstractContentTransformerTest { - private ContentTransformer transformer; + private ArchiveContentTransformer transformer; @Override public void setUp() throws Exception @@ -48,19 +55,124 @@ public class ArchiveContentTransformerTest extends AbstractContentTransformerTes public void testIsTransformable() throws Exception { assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_ZIP, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); + assertTrue(transformer.isTransformable("application/x-tar", MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); + assertTrue(transformer.isTransformable("application/x-gtar", MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); } @Override - protected boolean isQuickPhraseExpected(String targetMimetype) - { - // The Zip transformer produces names of the entries, not their contents. - return false; - } + protected boolean isQuickPhraseExpected(String targetMimetype) + { + // The Zip transformer produces names of the entries, not their contents. + return false; + } @Override - protected boolean isQuickWordsExpected(String targetMimetype) + protected boolean isQuickWordsExpected(String targetMimetype) { - // The Zip transformer produces names of the entries, not their contents. - return false; + // The Zip transformer produces names of the entries, not their contents. 
+ return false; + } + + public void testRecursing() throws Exception + { + ContentWriter writer; + String contents; + + // Bean off, no options + transformer.setIncludeContents("FALSE"); + + writer = getTestWriter(); + transformer.transform(getTestReader(), writer); + contents = writer.getReader().getContentString(); + testHasFiles(contents); + testNested(contents, false); + + + // Bean on, no options + transformer.setIncludeContents("TRUE"); + + writer = getTestWriter(); + transformer.transform(getTestReader(), writer); + contents = writer.getReader().getContentString(); + testHasFiles(contents); + testNested(contents, true); + + + // Bean off, Transformation Options off + TransformationOptions options = new TransformationOptions(); + transformer.setIncludeContents("FALSE"); + + writer = getTestWriter(); + transformer.transform(getTestReader(), writer, options); + contents = writer.getReader().getContentString(); + testHasFiles(contents); + testNested(contents, false); + + + // Bean on, Transformation Options off + transformer.setIncludeContents("TRUE"); + + writer = getTestWriter(); + transformer.transform(getTestReader(), writer, options); + contents = writer.getReader().getContentString(); + testHasFiles(contents); + testNested(contents, true); + + + // Bean off, Transformation Options on - options win + options.setIncludeEmbedded(true); + transformer.setIncludeContents("FALSE"); + + writer = getTestWriter(); + transformer.transform(getTestReader(), writer, options); + contents = writer.getReader().getContentString(); + testHasFiles(contents); + testNested(contents, true); + + + // Bean on, Transformation Options on + transformer.setIncludeContents("TRUE"); + + writer = getTestWriter(); + transformer.transform(getTestReader(), writer, options); + contents = writer.getReader().getContentString(); + testHasFiles(contents); + testNested(contents, true); + } + private ContentReader getTestReader() throws IOException { + ContentReader sourceReader = new 
FileContentReader( + loadQuickTestFile("zip") + ); + sourceReader.setMimetype(MimetypeMap.MIMETYPE_ZIP); + return sourceReader; + } + private ContentWriter getTestWriter() throws IOException { + ContentWriter writer = new FileContentWriter(File.createTempFile("test", ".txt")); + writer.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN); + return writer; + } + private void testHasFiles(String contents) + { + assertTrue("Files not found in " + contents, + contents.contains("quick.txt")); + assertTrue("Files not found in " + contents, + contents.contains("quick.doc")); + assertTrue("Files not found in " + contents, + contents.contains("subfolder/quick.jpg")); + } + private void testNested(String contents, boolean shouldHaveRecursed) + { + assertEquals( + "Recursion was " + shouldHaveRecursed + + " but content was " + contents, + shouldHaveRecursed, + contents.contains("The quick brown fox jumps over the lazy dog") + ); + assertEquals( + "Recursion was " + shouldHaveRecursed + + " but content was " + contents, + shouldHaveRecursed, + contents.contains("Le renard brun rapide saute par-dessus le chien paresseux") + ); } } diff --git a/source/java/org/alfresco/repo/content/transform/ComplexContentTransformer.java b/source/java/org/alfresco/repo/content/transform/ComplexContentTransformer.java index a9ed83523a..e44e33be45 100644 --- a/source/java/org/alfresco/repo/content/transform/ComplexContentTransformer.java +++ b/source/java/org/alfresco/repo/content/transform/ComplexContentTransformer.java @@ -19,8 +19,13 @@ package org.alfresco.repo.content.transform; import java.io.File; +import java.io.Serializable; +import java.lang.reflect.InvocationTargetException; import java.util.Iterator; import java.util.List; +import java.util.Map; + +import javax.faces.el.MethodNotFoundException; import org.alfresco.error.AlfrescoRuntimeException; import org.alfresco.repo.content.filestore.FileContentWriter; @@ -28,6 +33,10 @@ import org.alfresco.service.cmr.repository.ContentReader; import 
org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.TransformationOptions; import org.alfresco.util.TempFileProvider; +import org.apache.commons.beanutils.PropertyUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.springframework.beans.BeanUtils; import org.springframework.beans.factory.InitializingBean; /** @@ -38,8 +47,14 @@ import org.springframework.beans.factory.InitializingBean; */ public class ComplexContentTransformer extends AbstractContentTransformer2 implements InitializingBean { + /** + * The logger + */ + private static Log logger = LogFactory.getLog(ComplexContentTransformer.class); + private List transformers; private List intermediateMimetypes; + private Map transformationOptionOverrides; public ComplexContentTransformer() { @@ -74,6 +89,20 @@ public class ComplexContentTransformer extends AbstractContentTransformer2 imple } /** + * Sets any properties to be set on the TransformationOption as passed in. + * This allows you to force certain properties to always be set on it, + * to control the transformers in a different way to their default. + * Note that only properties that are supported by the passed-in + * {@link TransformationOptions} are changed, others are ignored. 
+ * @param transformationOptionOverrides + */ + public void setTransformationOptionOverrides( + Map transformationOptionOverrides) + { + this.transformationOptionOverrides = transformationOptionOverrides; + } + + /** * Ensures that required properties have been set */ public void afterPropertiesSet() throws Exception @@ -103,6 +132,29 @@ public class ComplexContentTransformer extends AbstractContentTransformer2 imple boolean result = true; String currentSourceMimetype = sourceMimetype; + // Set any transformation options overrides if we can + if(options != null && transformationOptionOverrides != null) + { + for(String key : transformationOptionOverrides.keySet()) + { + if(PropertyUtils.isWriteable(options, key)) + { + try + { + PropertyUtils.setProperty(options, key, transformationOptionOverrides.get(key)); + } + catch(MethodNotFoundException mnfe) {} + catch(NoSuchMethodException nsme) {} + catch(InvocationTargetException ite) {} + catch(IllegalAccessException iae) {} + } + else + { + logger.warn("Unable to set override Transformation Option " + key + " on " + options); + } + } + } + Iterator transformerIterator = transformers.iterator(); Iterator intermediateMimetypeIterator = intermediateMimetypes.iterator(); while (transformerIterator.hasNext())