mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-07 18:25:23 +00:00
Convert the archive transformer to use Tika, so that other archive formats (eg tar) are supported
Includes a configuration option (off by default) to recurse into embedded resources, along with unit tests for this. Also add support to the ComplexContentTransformer for tweaking TransformOptions, and use this to ensure that the SWF plain-text preview of archive files will only ever be the filenames, and not the text of their contents too. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@22860 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
parent
2c64a653e5
commit
2d690eb109
@ -504,6 +504,7 @@
|
|||||||
</bean>
|
</bean>
|
||||||
</list>
|
</list>
|
||||||
</property>
|
</property>
|
||||||
|
<property name="includeContents" value="${transformer.Archive.includeContents}" />
|
||||||
</bean>
|
</bean>
|
||||||
|
|
||||||
</beans>
|
</beans>
|
||||||
|
@ -245,6 +245,11 @@ lucene.write.lock.timeout=10000
|
|||||||
lucene.commit.lock.timeout=100000
|
lucene.commit.lock.timeout=100000
|
||||||
lucene.lock.poll.interval=100
|
lucene.lock.poll.interval=100
|
||||||
|
|
||||||
|
# When transforming archive files (.zip etc) into text representations (such as
|
||||||
|
# for full text indexing), should the files within the archive be processed too?
|
||||||
|
# If enabled, transformation takes longer, but searches of the files find more.
|
||||||
|
transformer.Archive.includeContents=false
|
||||||
|
|
||||||
# Database configuration
|
# Database configuration
|
||||||
db.schema.stopAfterSchemaBootstrap=false
|
db.schema.stopAfterSchemaBootstrap=false
|
||||||
db.schema.update=true
|
db.schema.update=true
|
||||||
|
@ -74,6 +74,11 @@
|
|||||||
<value>application/pdf</value>
|
<value>application/pdf</value>
|
||||||
</list>
|
</list>
|
||||||
</property>
|
</property>
|
||||||
|
<property name="transformationOptionOverrides">
|
||||||
|
<map>
|
||||||
|
<entry key="includeContents" value="no" />
|
||||||
|
</map>
|
||||||
|
</property>
|
||||||
</bean>
|
</bean>
|
||||||
|
|
||||||
</beans>
|
</beans>
|
||||||
|
@ -18,106 +18,96 @@
|
|||||||
*/
|
*/
|
||||||
package org.alfresco.repo.content.transform;
|
package org.alfresco.repo.content.transform;
|
||||||
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
|
||||||
import java.util.zip.ZipEntry;
|
|
||||||
import java.util.zip.ZipInputStream;
|
|
||||||
|
|
||||||
import org.alfresco.repo.content.MimetypeMap;
|
|
||||||
import org.alfresco.service.cmr.repository.ContentReader;
|
|
||||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
|
||||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.tika.metadata.Metadata;
|
||||||
|
import org.apache.tika.mime.MediaType;
|
||||||
|
import org.apache.tika.parser.AutoDetectParser;
|
||||||
|
import org.apache.tika.parser.ParseContext;
|
||||||
|
import org.apache.tika.parser.Parser;
|
||||||
|
import org.apache.tika.parser.pkg.PackageParser;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This class transforms archive files (currently only ZIPs) to text, which enables indexing
|
* This class transforms archive files (zip, tar etc) to text, which enables indexing
|
||||||
* and searching of archives as well as webpreviewing.
|
* and searching of archives as well as webpreviewing.
|
||||||
* The transformation simply lists the names of the entries within the zip file and does not consider their content.
|
* The transformation can simply list the names of the entries within the archive, or
|
||||||
|
* it can also include the textual content of the entries themselves.
|
||||||
|
* The former is suggested for web preview, the latter for indexing.
|
||||||
|
* This behaviour is controlled by the includeContents property, which the includeEmbedded setting on the TransformationOptions overrides when set.
|
||||||
*
|
*
|
||||||
* @author Neil McErlean
|
* @author Neil McErlean
|
||||||
* @since Swift
|
* @author Nick Burch
|
||||||
|
* @since 3.4
|
||||||
*/
|
*/
|
||||||
public class ArchiveContentTransformer extends AbstractContentTransformer2
|
public class ArchiveContentTransformer extends TikaPoweredContentTransformer
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* The logger
|
* The logger
|
||||||
*/
|
*/
|
||||||
private static Log logger = LogFactory.getLog(ArchiveContentTransformer.class);
|
private static Log logger = LogFactory.getLog(ArchiveContentTransformer.class);
|
||||||
|
|
||||||
|
private boolean includeContents = false;
|
||||||
|
public void setIncludeContents(String includeContents)
|
||||||
|
{
|
||||||
|
// Spring really ought to be able to handle
|
||||||
|
// setting a boolean that might still be
|
||||||
|
// ${foo} (i.e. not overridden in a property).
|
||||||
|
// As we can't do that with spring, we do it...
|
||||||
|
this.includeContents = false;
|
||||||
|
if(includeContents != null && includeContents.length() > 0)
|
||||||
|
{
|
||||||
|
if(includeContents.equalsIgnoreCase("true") ||
|
||||||
|
includeContents.equalsIgnoreCase("t") ||
|
||||||
|
includeContents.equalsIgnoreCase("yes") ||
|
||||||
|
includeContents.equalsIgnoreCase("y"))
|
||||||
|
{
|
||||||
|
this.includeContents = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Currently the only transformation performed is that of text extraction from PDF documents.
|
* We support all the archive mimetypes that the Tika
|
||||||
|
* package parser can handle
|
||||||
*/
|
*/
|
||||||
public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options)
|
public static ArrayList<String> SUPPORTED_MIMETYPES;
|
||||||
{
|
static {
|
||||||
// TODO: Expand to other archive types e.g. tar.
|
SUPPORTED_MIMETYPES = new ArrayList<String>();
|
||||||
if (!MimetypeMap.MIMETYPE_ZIP.equals(sourceMimetype) ||
|
Parser p = new PackageParser();
|
||||||
!MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
|
for(MediaType mt : p.getSupportedTypes(null)) {
|
||||||
{
|
// Tika can probably do some useful text
|
||||||
// Currently only support ZIP -> Text
|
SUPPORTED_MIMETYPES.add( mt.toString() );
|
||||||
return false;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void transformInternal(
|
public ArchiveContentTransformer() {
|
||||||
ContentReader reader,
|
super(SUPPORTED_MIMETYPES);
|
||||||
ContentWriter writer,
|
|
||||||
TransformationOptions options) throws Exception
|
|
||||||
{
|
|
||||||
InputStream is = null;
|
|
||||||
try
|
|
||||||
{
|
|
||||||
is = reader.getContentInputStream();
|
|
||||||
|
|
||||||
List<String> zipEntryNames = new ArrayList<String>();
|
|
||||||
ZipInputStream zin = new ZipInputStream(is);
|
|
||||||
|
|
||||||
// Enumerate each entry
|
|
||||||
ZipEntry nextZipEntry = null;
|
|
||||||
while ((nextZipEntry = zin.getNextEntry()) != null)
|
|
||||||
{
|
|
||||||
String entryName = nextZipEntry.getName();
|
|
||||||
zipEntryNames.add(entryName);
|
|
||||||
|
|
||||||
// Currently we do not recurse into 'zips within zips'.
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (logger.isDebugEnabled())
|
@Override
|
||||||
{
|
protected Parser getParser() {
|
||||||
StringBuilder msg = new StringBuilder();
|
return new PackageParser();
|
||||||
msg.append("Transformed ")
|
|
||||||
.append(zipEntryNames.size())
|
|
||||||
.append(zipEntryNames.size() == 1 ? " zip entry" : " zip entries");
|
|
||||||
logger.debug(msg.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
String text = createTextContentFrom(zipEntryNames);
|
@Override
|
||||||
|
protected ParseContext buildParseContext(Metadata metadata,
|
||||||
|
String targetMimeType, TransformationOptions options) {
|
||||||
|
ParseContext context = super.buildParseContext(metadata, targetMimeType, options);
|
||||||
|
|
||||||
// dump it all to the writer
|
boolean recurse = includeContents;
|
||||||
writer.putContent(text);
|
if(options.getIncludeEmbedded() != null)
|
||||||
}
|
|
||||||
finally
|
|
||||||
{
|
{
|
||||||
if (is != null)
|
recurse = options.getIncludeEmbedded();
|
||||||
|
}
|
||||||
|
System.err.println(includeContents + " " + recurse + " " + options.getIncludeEmbedded());
|
||||||
|
if(recurse)
|
||||||
{
|
{
|
||||||
try { is.close(); } catch (Throwable e) {e.printStackTrace(); }
|
context.set(Parser.class, new AutoDetectParser());
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private String createTextContentFrom(List<String> zipEntryNames)
|
return context;
|
||||||
{
|
|
||||||
StringBuilder result = new StringBuilder();
|
|
||||||
for (String entryName : zipEntryNames)
|
|
||||||
{
|
|
||||||
result.append(entryName)
|
|
||||||
.append('\n');
|
|
||||||
}
|
|
||||||
return result.toString();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -18,7 +18,14 @@
|
|||||||
*/
|
*/
|
||||||
package org.alfresco.repo.content.transform;
|
package org.alfresco.repo.content.transform;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.alfresco.repo.content.MimetypeMap;
|
import org.alfresco.repo.content.MimetypeMap;
|
||||||
|
import org.alfresco.repo.content.filestore.FileContentReader;
|
||||||
|
import org.alfresco.repo.content.filestore.FileContentWriter;
|
||||||
|
import org.alfresco.service.cmr.repository.ContentReader;
|
||||||
|
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -30,7 +37,7 @@ import org.alfresco.service.cmr.repository.TransformationOptions;
|
|||||||
*/
|
*/
|
||||||
public class ArchiveContentTransformerTest extends AbstractContentTransformerTest
|
public class ArchiveContentTransformerTest extends AbstractContentTransformerTest
|
||||||
{
|
{
|
||||||
private ContentTransformer transformer;
|
private ArchiveContentTransformer transformer;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setUp() throws Exception
|
public void setUp() throws Exception
|
||||||
@ -48,6 +55,8 @@ public class ArchiveContentTransformerTest extends AbstractContentTransformerTes
|
|||||||
public void testIsTransformable() throws Exception
|
public void testIsTransformable() throws Exception
|
||||||
{
|
{
|
||||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_ZIP, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_ZIP, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable("application/x-tar", MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable("application/x-gtar", MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -63,4 +72,107 @@ public class ArchiveContentTransformerTest extends AbstractContentTransformerTes
|
|||||||
// The Zip transformer produces names of the entries, not their contents.
|
// The Zip transformer produces names of the entries, not their contents.
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testRecursing() throws Exception
|
||||||
|
{
|
||||||
|
ContentWriter writer;
|
||||||
|
String contents;
|
||||||
|
|
||||||
|
// Bean off, no options
|
||||||
|
transformer.setIncludeContents("FALSE");
|
||||||
|
|
||||||
|
writer = getTestWriter();
|
||||||
|
transformer.transform(getTestReader(), writer);
|
||||||
|
contents = writer.getReader().getContentString();
|
||||||
|
testHasFiles(contents);
|
||||||
|
testNested(contents, false);
|
||||||
|
|
||||||
|
|
||||||
|
// Bean on, no options
|
||||||
|
transformer.setIncludeContents("TRUE");
|
||||||
|
|
||||||
|
writer = getTestWriter();
|
||||||
|
transformer.transform(getTestReader(), writer);
|
||||||
|
contents = writer.getReader().getContentString();
|
||||||
|
testHasFiles(contents);
|
||||||
|
testNested(contents, true);
|
||||||
|
|
||||||
|
|
||||||
|
// Bean off, Transformation Options off
|
||||||
|
TransformationOptions options = new TransformationOptions();
|
||||||
|
transformer.setIncludeContents("FALSE");
|
||||||
|
|
||||||
|
writer = getTestWriter();
|
||||||
|
transformer.transform(getTestReader(), writer, options);
|
||||||
|
contents = writer.getReader().getContentString();
|
||||||
|
testHasFiles(contents);
|
||||||
|
testNested(contents, false);
|
||||||
|
|
||||||
|
|
||||||
|
// Bean on, Transformation Options off
|
||||||
|
transformer.setIncludeContents("TRUE");
|
||||||
|
|
||||||
|
writer = getTestWriter();
|
||||||
|
transformer.transform(getTestReader(), writer, options);
|
||||||
|
contents = writer.getReader().getContentString();
|
||||||
|
testHasFiles(contents);
|
||||||
|
testNested(contents, true);
|
||||||
|
|
||||||
|
|
||||||
|
// Bean off, Transformation Options on - options win
|
||||||
|
options.setIncludeEmbedded(true);
|
||||||
|
transformer.setIncludeContents("FALSE");
|
||||||
|
|
||||||
|
writer = getTestWriter();
|
||||||
|
transformer.transform(getTestReader(), writer, options);
|
||||||
|
contents = writer.getReader().getContentString();
|
||||||
|
testHasFiles(contents);
|
||||||
|
testNested(contents, true);
|
||||||
|
|
||||||
|
|
||||||
|
// Bean on, Transformation Options on
|
||||||
|
transformer.setIncludeContents("TRUE");
|
||||||
|
|
||||||
|
writer = getTestWriter();
|
||||||
|
transformer.transform(getTestReader(), writer, options);
|
||||||
|
contents = writer.getReader().getContentString();
|
||||||
|
testHasFiles(contents);
|
||||||
|
testNested(contents, true);
|
||||||
|
}
|
||||||
|
private ContentReader getTestReader() throws IOException {
|
||||||
|
ContentReader sourceReader = new FileContentReader(
|
||||||
|
loadQuickTestFile("zip")
|
||||||
|
);
|
||||||
|
sourceReader.setMimetype(MimetypeMap.MIMETYPE_ZIP);
|
||||||
|
return sourceReader;
|
||||||
|
}
|
||||||
|
private ContentWriter getTestWriter() throws IOException {
|
||||||
|
ContentWriter writer = new FileContentWriter(File.createTempFile("test", ".txt"));
|
||||||
|
writer.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
|
||||||
|
return writer;
|
||||||
|
}
|
||||||
|
private void testHasFiles(String contents)
|
||||||
|
{
|
||||||
|
assertTrue("Files not found in " + contents,
|
||||||
|
contents.contains("quick.txt"));
|
||||||
|
assertTrue("Files not found in " + contents,
|
||||||
|
contents.contains("quick.doc"));
|
||||||
|
assertTrue("Files not found in " + contents,
|
||||||
|
contents.contains("subfolder/quick.jpg"));
|
||||||
|
}
|
||||||
|
private void testNested(String contents, boolean shouldHaveRecursed)
|
||||||
|
{
|
||||||
|
assertEquals(
|
||||||
|
"Recursion was " + shouldHaveRecursed +
|
||||||
|
" but content was " + contents,
|
||||||
|
shouldHaveRecursed,
|
||||||
|
contents.contains("The quick brown fox jumps over the lazy dog")
|
||||||
|
);
|
||||||
|
assertEquals(
|
||||||
|
"Recursion was " + shouldHaveRecursed +
|
||||||
|
" but content was " + contents,
|
||||||
|
shouldHaveRecursed,
|
||||||
|
contents.contains("Le renard brun rapide saute par-dessus le chien paresseux")
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -19,8 +19,13 @@
|
|||||||
package org.alfresco.repo.content.transform;
|
package org.alfresco.repo.content.transform;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import javax.faces.el.MethodNotFoundException;
|
||||||
|
|
||||||
import org.alfresco.error.AlfrescoRuntimeException;
|
import org.alfresco.error.AlfrescoRuntimeException;
|
||||||
import org.alfresco.repo.content.filestore.FileContentWriter;
|
import org.alfresco.repo.content.filestore.FileContentWriter;
|
||||||
@ -28,6 +33,10 @@ import org.alfresco.service.cmr.repository.ContentReader;
|
|||||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||||
import org.alfresco.util.TempFileProvider;
|
import org.alfresco.util.TempFileProvider;
|
||||||
|
import org.apache.commons.beanutils.PropertyUtils;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.springframework.beans.BeanUtils;
|
||||||
import org.springframework.beans.factory.InitializingBean;
|
import org.springframework.beans.factory.InitializingBean;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -38,8 +47,14 @@ import org.springframework.beans.factory.InitializingBean;
|
|||||||
*/
|
*/
|
||||||
public class ComplexContentTransformer extends AbstractContentTransformer2 implements InitializingBean
|
public class ComplexContentTransformer extends AbstractContentTransformer2 implements InitializingBean
|
||||||
{
|
{
|
||||||
|
/**
|
||||||
|
* The logger
|
||||||
|
*/
|
||||||
|
private static Log logger = LogFactory.getLog(ComplexContentTransformer.class);
|
||||||
|
|
||||||
private List<ContentTransformer> transformers;
|
private List<ContentTransformer> transformers;
|
||||||
private List<String> intermediateMimetypes;
|
private List<String> intermediateMimetypes;
|
||||||
|
private Map<String,Serializable> transformationOptionOverrides;
|
||||||
|
|
||||||
public ComplexContentTransformer()
|
public ComplexContentTransformer()
|
||||||
{
|
{
|
||||||
@ -73,6 +88,20 @@ public class ComplexContentTransformer extends AbstractContentTransformer2 imple
|
|||||||
this.intermediateMimetypes = intermediateMimetypes;
|
this.intermediateMimetypes = intermediateMimetypes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets any properties to be set on the TransformationOption as passed in.
|
||||||
|
* This allows you to force certain properties to always be set on it,
|
||||||
|
* to control the transformers in a different way to their default.
|
||||||
|
* Note that only properties that are supported by the passed-in
|
||||||
|
* {@link TransformationOptions} are changed, others are ignored.
|
||||||
|
* @param transformationOptionOverrides
|
||||||
|
*/
|
||||||
|
public void setTransformationOptionOverrides(
|
||||||
|
Map<String, Serializable> transformationOptionOverrides)
|
||||||
|
{
|
||||||
|
this.transformationOptionOverrides = transformationOptionOverrides;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Ensures that required properties have been set
|
* Ensures that required properties have been set
|
||||||
*/
|
*/
|
||||||
@ -103,6 +132,29 @@ public class ComplexContentTransformer extends AbstractContentTransformer2 imple
|
|||||||
boolean result = true;
|
boolean result = true;
|
||||||
String currentSourceMimetype = sourceMimetype;
|
String currentSourceMimetype = sourceMimetype;
|
||||||
|
|
||||||
|
// Set any transformation options overrides if we can
|
||||||
|
if(options != null && transformationOptionOverrides != null)
|
||||||
|
{
|
||||||
|
for(String key : transformationOptionOverrides.keySet())
|
||||||
|
{
|
||||||
|
if(PropertyUtils.isWriteable(options, key))
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
PropertyUtils.setProperty(options, key, transformationOptionOverrides.get(key));
|
||||||
|
}
|
||||||
|
catch(MethodNotFoundException mnfe) {}
|
||||||
|
catch(NoSuchMethodException nsme) {}
|
||||||
|
catch(InvocationTargetException ite) {}
|
||||||
|
catch(IllegalAccessException iae) {}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
logger.warn("Unable to set override Transformation Option " + key + " on " + options);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Iterator<ContentTransformer> transformerIterator = transformers.iterator();
|
Iterator<ContentTransformer> transformerIterator = transformers.iterator();
|
||||||
Iterator<String> intermediateMimetypeIterator = intermediateMimetypes.iterator();
|
Iterator<String> intermediateMimetypeIterator = intermediateMimetypes.iterator();
|
||||||
while (transformerIterator.hasNext())
|
while (transformerIterator.hasNext())
|
||||||
|
Loading…
x
Reference in New Issue
Block a user