diff --git a/config/alfresco/content-services-context.xml b/config/alfresco/content-services-context.xml index b16ac538fc..92e04b9560 100644 --- a/config/alfresco/content-services-context.xml +++ b/config/alfresco/content-services-context.xml @@ -462,6 +462,17 @@ + + + + + image/jpeg + image/tiff + image/png + + + + org.apache.pdfbox pdfbox - 1.8.2-alfresco-patched + 1.8.4-alfresco-patched org.apache.pdfbox fontbox - 1.8.2 + 1.8.4 org.apache.pdfbox jempbox - 1.8.2 + 1.8.4 org.bouncycastle @@ -511,7 +511,7 @@ org.apache.poi poi - 3.9-beta1-20121129-alfresco-patched + ${dependency.poi.version} org.apache.poi diff --git a/source/java/org/alfresco/repo/content/metadata/MediaTypeDisablingDocumentSelector.java b/source/java/org/alfresco/repo/content/metadata/MediaTypeDisablingDocumentSelector.java new file mode 100644 index 0000000000..3d9cb4d431 --- /dev/null +++ b/source/java/org/alfresco/repo/content/metadata/MediaTypeDisablingDocumentSelector.java @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2005-2014 Alfresco Software Limited. + * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */ +package org.alfresco.repo.content.metadata; + +import java.util.List; + +import org.apache.tika.extractor.DocumentSelector; +import org.apache.tika.metadata.Metadata; + +/** + * Tika 1.6 has the ability to parse embedded artifacts, such as images in a PDF, + * but this can be very resource intensive so adding this selector + * to parsers and transformers that handle formats with embedded artifacts + * will disable parsing of the specified content types. Embedded documents that report no content type are always selected, as is everything when no list of disabled types has been configured. + */ +public class MediaTypeDisablingDocumentSelector implements DocumentSelector +{ + private List<String> disabledMediaTypes; + + public void setDisabledMediaTypes(List<String> disabledMediaTypes) + { + this.disabledMediaTypes = disabledMediaTypes; + } + + @Override + public boolean select(Metadata metadata) + { + String contentType = metadata.get(Metadata.CONTENT_TYPE); + if (contentType == null || contentType.isEmpty() || disabledMediaTypes == null) + { + return true; /* nothing to match against, so do not exclude the embedded document */ + } + return !disabledMediaTypes.contains(contentType); /* exact string match against the configured types */ + } +} diff --git a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java index 03ffd7d0e3..3bac37f58c 100644 --- a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java @@ -39,6 +39,7 @@ import org.alfresco.service.cmr.repository.datatype.TypeConversionException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.tika.embedder.Embedder; +import org.apache.tika.extractor.DocumentSelector; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -94,6 +95,7 @@ public abstract class TikaPoweredMetadataExtracter private DateTimeFormatter tikaUTCDateFormater; private DateTimeFormatter tikaDateFormater; + protected DocumentSelector documentSelector; /** * 
Builds up a list of supported mime types by merging @@ -267,6 +269,46 @@ public abstract class TikaPoweredMetadataExtracter } } + /** + * Sets the document selector, used for determining whether to parse embedded resources. + * + * @param documentSelector the selector to place on the parse context; may be null, in which case no selector is set + */ + public void setDocumentSelector(DocumentSelector documentSelector) + { + this.documentSelector = documentSelector; + } + /** + * Gets the document selector, used for determining whether to parse embedded resources, + * null by default so parse all. + * + * @param metadata the metadata of the source content; not used by this default implementation + * @param sourceMimeType the mimetype of the source content; not used by this default implementation + * @return the configured document selector, or null if none has been set + */ + protected DocumentSelector getDocumentSelector(Metadata metadata, String sourceMimeType) + { + return documentSelector; + } + + /** + * By default returns a new ParseContext, carrying the document selector when one is available + * + * @param metadata the metadata of the source content, passed on to getDocumentSelector + * @param sourceMimeType the mimetype of the source content, passed on to getDocumentSelector + * @return the parse context + */ + protected ParseContext buildParseContext(Metadata metadata, String sourceMimeType) + { + ParseContext context = new ParseContext(); + DocumentSelector selector = getDocumentSelector(metadata, sourceMimeType); + if (selector != null) /* only register a selector when one is configured */ + { + context.set(DocumentSelector.class, selector); + } + return context; + } + @SuppressWarnings("deprecation") @Override protected Map extractRaw(ContentReader reader) throws Throwable @@ -278,11 +320,12 @@ public abstract class TikaPoweredMetadataExtracter { is = getInputStream(reader); Parser parser = getParser(); - ParseContext context = new ParseContext(); Metadata metadata = new Metadata(); metadata.add(Metadata.CONTENT_TYPE, reader.getMimetype()); + ParseContext context = buildParseContext(metadata, reader.getMimetype()); + ContentHandler handler; Map headers = null; if(needHeaderContents()) diff --git a/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformer.java b/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformer.java index 23e1102040..503224d791 100644 --- 
a/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformer.java +++ b/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformer.java @@ -38,6 +38,7 @@ import org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.TransformationOptions; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.tika.extractor.DocumentSelector; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; @@ -69,6 +70,7 @@ public abstract class TikaPoweredContentTransformer extends AbstractContentTrans MimetypeMap.MIMETYPE_XML}); protected List sourceMimeTypes; + protected DocumentSelector documentSelector; /** * Windows carriage return line feed pair. @@ -163,13 +165,42 @@ public abstract class TikaPoweredContentTransformer extends AbstractContentTrans } return handler; } + + /** + * Sets the document selector, used for determining whether to parse embedded resources. + * + * @param documentSelector the selector to place on the parse context; may be null, in which case no selector is set + */ + public void setDocumentSelector(DocumentSelector documentSelector) + { + this.documentSelector = documentSelector; + } + /** + * Gets the document selector, used for determining whether to parse embedded resources, + * null by default so parse all. 
+ * + * @param metadata not used by this default implementation + * @param targetMimeType not used by this default implementation + * @param options not used by this default implementation + * @return the configured document selector, or null if none has been set + */ + protected DocumentSelector getDocumentSelector(Metadata metadata, String targetMimeType, TransformationOptions options) + { + return documentSelector; + } /** * By default returns a ParseContent that does not recurse */ protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options) { - return new ParseContext(); + ParseContext context = new ParseContext(); + DocumentSelector selector = getDocumentSelector(metadata, targetMimeType, options); + if (selector != null) /* only register a selector when one is configured */ + { + context.set(DocumentSelector.class, selector); + } + return context; } public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options) diff --git a/source/test-java/org/alfresco/repo/content/transform/PdfBoxContentTransformerTest.java b/source/test-java/org/alfresco/repo/content/transform/PdfBoxContentTransformerTest.java index e725326739..ee9aca3aa7 100644 --- a/source/test-java/org/alfresco/repo/content/transform/PdfBoxContentTransformerTest.java +++ b/source/test-java/org/alfresco/repo/content/transform/PdfBoxContentTransformerTest.java @@ -20,6 +20,7 @@ package org.alfresco.repo.content.transform; import java.io.File; import java.io.IOException; +import java.util.Arrays; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -28,6 +29,7 @@ import java.util.concurrent.TimeUnit; import org.alfresco.repo.content.MimetypeMap; import org.alfresco.repo.content.filestore.FileContentReader; import org.alfresco.repo.content.filestore.FileContentWriter; +import org.alfresco.repo.content.metadata.MediaTypeDisablingDocumentSelector; import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.TransformationOptions; @@ -51,6 +53,14 @@ 
public class PdfBoxContentTransformerTest extends AbstractContentTransformerTest transformer.setMimetypeService(mimetypeService); transformer.setTransformerDebug(transformerDebug); transformer.setTransformerConfig(transformerConfig); + + // Disable parsing of embedded images + MediaTypeDisablingDocumentSelector selector = new MediaTypeDisablingDocumentSelector(); + selector.setDisabledMediaTypes(Arrays.asList( + MimetypeMap.MIMETYPE_IMAGE_JPEG, + MimetypeMap.MIMETYPE_IMAGE_TIFF, + MimetypeMap.MIMETYPE_IMAGE_PNG)); + transformer.setDocumentSelector(selector); } /**