diff --git a/config/alfresco/content-services-context.xml b/config/alfresco/content-services-context.xml
index b16ac538fc..92e04b9560 100644
--- a/config/alfresco/content-services-context.xml
+++ b/config/alfresco/content-services-context.xml
@@ -462,6 +462,17 @@
+
+
+
+
+ image/jpeg
+ image/tiff
+ image/png
+
+
+
+
org.apache.pdfbox
pdfbox
- 1.8.2-alfresco-patched
+ 1.8.4-alfresco-patched
org.apache.pdfbox
fontbox
- 1.8.2
+ 1.8.4
org.apache.pdfbox
jempbox
- 1.8.2
+ 1.8.4
org.bouncycastle
@@ -511,7 +511,7 @@
org.apache.poi
poi
- 3.9-beta1-20121129-alfresco-patched
+ ${dependency.poi.version}
org.apache.poi
diff --git a/source/java/org/alfresco/repo/content/metadata/MediaTypeDisablingDocumentSelector.java b/source/java/org/alfresco/repo/content/metadata/MediaTypeDisablingDocumentSelector.java
new file mode 100644
index 0000000000..3d9cb4d431
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/metadata/MediaTypeDisablingDocumentSelector.java
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2005-2014 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.metadata;
+
+import java.util.List;
+
+import org.apache.tika.extractor.DocumentSelector;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Tika 1.6 has the ability to parse embedded artifacts, such as images in a PDF,
+ * but this can be very resource intensive, so adding this selector
+ * to parsers and transformers that handle formats with embedded artifacts
+ * will disable parsing of embedded documents with the specified content types.
+ */
+public class MediaTypeDisablingDocumentSelector implements DocumentSelector
+{
+    /** Content types to skip; while null, every embedded document is selected. */
+    private List<String> disabledMediaTypes;
+
+    /** @param disabledMediaTypes content types whose embedded documents must not be parsed */
+    public void setDisabledMediaTypes(List<String> disabledMediaTypes)
+    {
+        this.disabledMediaTypes = disabledMediaTypes;
+    }
+
+    /** @return false only if the metadata's content type is in the disabled list */
+    @Override
+    public boolean select(Metadata metadata)
+    {
+        String contentType = metadata.get(Metadata.CONTENT_TYPE);
+        boolean unrestricted = contentType == null || contentType.isEmpty() || disabledMediaTypes == null;
+        return unrestricted || !disabledMediaTypes.contains(contentType);
+    }
+}
diff --git a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java
index 03ffd7d0e3..3bac37f58c 100644
--- a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java
@@ -39,6 +39,7 @@ import org.alfresco.service.cmr.repository.datatype.TypeConversionException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.embedder.Embedder;
+import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -94,6 +95,7 @@ public abstract class TikaPoweredMetadataExtracter
private DateTimeFormatter tikaUTCDateFormater;
private DateTimeFormatter tikaDateFormater;
+ protected DocumentSelector documentSelector;
/**
* Builds up a list of supported mime types by merging
@@ -267,6 +269,46 @@ public abstract class TikaPoweredMetadataExtracter
}
}
+ /**
+ * Sets the document selector, used for determining whether to parse embedded resources.
+ *
+ * @param documentSelector the selector to store; may be null, which is also the unset default
+ */
+ public void setDocumentSelector(DocumentSelector documentSelector)
+ {
+ this.documentSelector = documentSelector;
+ }
+    /**
+     * Gets the document selector, used for determining whether to parse embedded resources,
+     * null by default so parse all. This implementation ignores both arguments.
+     *
+     * @param metadata the Tika metadata for the content being extracted
+     * @param sourceMimeType the mimetype of the content being extracted
+     * @return the document selector, or null if none has been set
+     */
+    protected DocumentSelector getDocumentSelector(Metadata metadata, String sourceMimeType)
+    {
+        return documentSelector;
+    }
+
+    /**
+     * Builds a new ParseContext, registering the selector from getDocumentSelector when it is non-null.
+     *
+     * @param metadata the Tika metadata for the content being extracted
+     * @param sourceMimeType the mimetype of the content being extracted
+     * @return the parse context
+     */
+    protected ParseContext buildParseContext(Metadata metadata, String sourceMimeType)
+    {
+        final DocumentSelector embeddedSelector = getDocumentSelector(metadata, sourceMimeType);
+        final ParseContext parseContext = new ParseContext();
+        if (embeddedSelector != null)
+        {
+            parseContext.set(DocumentSelector.class, embeddedSelector);
+        }
+        return parseContext;
+    }
+
@SuppressWarnings("deprecation")
@Override
protected Map extractRaw(ContentReader reader) throws Throwable
@@ -278,11 +320,12 @@ public abstract class TikaPoweredMetadataExtracter
{
is = getInputStream(reader);
Parser parser = getParser();
- ParseContext context = new ParseContext();
Metadata metadata = new Metadata();
metadata.add(Metadata.CONTENT_TYPE, reader.getMimetype());
+ ParseContext context = buildParseContext(metadata, reader.getMimetype());
+
ContentHandler handler;
Map headers = null;
if(needHeaderContents())
diff --git a/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformer.java b/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformer.java
index 23e1102040..503224d791 100644
--- a/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformer.java
@@ -38,6 +38,7 @@ import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
@@ -69,6 +70,7 @@ public abstract class TikaPoweredContentTransformer extends AbstractContentTrans
MimetypeMap.MIMETYPE_XML});
protected List sourceMimeTypes;
+ protected DocumentSelector documentSelector;
/**
* Windows carriage return line feed pair.
@@ -163,13 +165,42 @@ public abstract class TikaPoweredContentTransformer extends AbstractContentTrans
}
return handler;
}
+
+ /**
+ * Sets the document selector, used for determining whether to parse embedded resources.
+ *
+ * @param documentSelector the selector to store; may be null, which is also the unset default
+ */
+ public void setDocumentSelector(DocumentSelector documentSelector)
+ {
+ this.documentSelector = documentSelector;
+ }
+ /**
+ * Gets the document selector, used for determining whether to parse embedded resources,
+ * null by default so parse all. This implementation ignores all three arguments.
+ *
+ * @param metadata the Tika metadata passed through from buildParseContext
+ * @param targetMimeType the target mimetype passed through from buildParseContext
+ * @param options the transformation options passed through from buildParseContext
+ * @return the selector given to setDocumentSelector, or null if none was set
+ */
+ protected DocumentSelector getDocumentSelector(Metadata metadata, String targetMimeType, TransformationOptions options)
+ {
+ return documentSelector;
+ }
/**
* By default returns a ParseContent that does not recurse
*/
protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options)
{
- return new ParseContext();
+ ParseContext context = new ParseContext();
+ DocumentSelector selector = getDocumentSelector(metadata, targetMimeType, options);
+ if (selector != null)
+ {
+ context.set(DocumentSelector.class, selector);
+ }
+ return context;
}
public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options)
diff --git a/source/test-java/org/alfresco/repo/content/transform/PdfBoxContentTransformerTest.java b/source/test-java/org/alfresco/repo/content/transform/PdfBoxContentTransformerTest.java
index e725326739..ee9aca3aa7 100644
--- a/source/test-java/org/alfresco/repo/content/transform/PdfBoxContentTransformerTest.java
+++ b/source/test-java/org/alfresco/repo/content/transform/PdfBoxContentTransformerTest.java
@@ -20,6 +20,7 @@ package org.alfresco.repo.content.transform;
import java.io.File;
import java.io.IOException;
+import java.util.Arrays;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
@@ -28,6 +29,7 @@ import java.util.concurrent.TimeUnit;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.filestore.FileContentReader;
import org.alfresco.repo.content.filestore.FileContentWriter;
+import org.alfresco.repo.content.metadata.MediaTypeDisablingDocumentSelector;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
@@ -51,6 +53,14 @@ public class PdfBoxContentTransformerTest extends AbstractContentTransformerTest
transformer.setMimetypeService(mimetypeService);
transformer.setTransformerDebug(transformerDebug);
transformer.setTransformerConfig(transformerConfig);
+
+ // Disable parsing of embedded images
+ MediaTypeDisablingDocumentSelector selector = new MediaTypeDisablingDocumentSelector();
+ selector.setDisabledMediaTypes(Arrays.asList(
+ MimetypeMap.MIMETYPE_IMAGE_JPEG,
+ MimetypeMap.MIMETYPE_IMAGE_TIFF,
+ MimetypeMap.MIMETYPE_IMAGE_PNG));
+ transformer.setDocumentSelector(selector);
}
/**