Merged HEAD-BUG-FIX (4.3/Cloud) to HEAD (4.3/Cloud)

68525: Merged V4.2-BUG-FIX (4.2.3) to HEAD-BUG-FIX (4.3/Cloud)
      68281: MNT-11350: Upgrade of Tika to 1.6-SNAPSHOT
         - Upgraded patched Tika, patched PDFBox, and other dependencies
         - Added a MediaTypeDisablingDocumentSelector to disable parsing of certain embedded attachments
         - Added TikaPoweredMetadataExtracter.buildParseContext method which sets the context's DocumentSelector if specified
         - Added setting of DocumentSelector if specified to TikaPoweredContentTransformer
         - Added MediaTypeDisablingDocumentSelector to the transformer.PdfBox bean config which disables parsing of embedded images
         - Updated PdfBoxContentTransformerTest to disable the parsing of embedded images


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@70407 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Alan Davis
2014-05-16 16:24:02 +00:00
parent c02f02bf69
commit b78998bd06
6 changed files with 152 additions and 6 deletions

View File

@@ -39,6 +39,7 @@ import org.alfresco.service.cmr.repository.datatype.TypeConversionException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.embedder.Embedder;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -94,6 +95,7 @@ public abstract class TikaPoweredMetadataExtracter
// Date formatters; NOTE(review): presumably used for parsing Tika date strings (UTC and
// non-UTC variants) - their initialisation and use are outside this excerpt, confirm there.
private DateTimeFormatter tikaUTCDateFormater;
private DateTimeFormatter tikaDateFormater;
// Optional selector used for determining whether to parse embedded resources.
// Has no initialiser, so it stays null until setDocumentSelector is called.
protected DocumentSelector documentSelector;
/**
* Builds up a list of supported mime types by merging
@@ -267,6 +269,46 @@ public abstract class TikaPoweredMetadataExtracter
}
}
/**
 * Configures the selector that decides whether embedded resources are parsed.
 *
 * @param documentSelector the selector to store on this extracter; when it is
 *        <code>null</code>, {@link #buildParseContext(Metadata, String)} does not
 *        register any selector on the parse context it builds
 */
public void setDocumentSelector(final DocumentSelector documentSelector)
{
    this.documentSelector = documentSelector;
}
/**
 * Returns the document selector used for determining whether to parse embedded
 * resources. The selector is <code>null</code> unless one has been set, in which
 * case no selector is registered on the parse context.
 * <p>
 * This implementation ignores both arguments and simply returns the configured
 * selector.
 *
 * @param metadata the metadata of the document being processed (unused here)
 * @param targetMimeType the mime type supplied by the caller (unused here); note that
 *        {@link #buildParseContext(Metadata, String)} passes the source mime type
 * @return the configured document selector, possibly <code>null</code>
 */
protected DocumentSelector getDocumentSelector(final Metadata metadata, final String targetMimeType)
{
    return this.documentSelector;
}
/**
 * Creates a new ParseContext for a parse run. If
 * {@link #getDocumentSelector(Metadata, String)} yields a non-null selector, it is
 * registered on the context under <code>DocumentSelector.class</code>; otherwise
 * the context is returned without a selector.
 *
 * @param metadata the metadata, handed on to the selector lookup
 * @param sourceMimeType the source mime type, handed on to the selector lookup
 * @return the newly created parse context
 */
protected ParseContext buildParseContext(Metadata metadata, String sourceMimeType)
{
    final ParseContext parseContext = new ParseContext();
    final DocumentSelector embeddedSelector = getDocumentSelector(metadata, sourceMimeType);
    if (embeddedSelector == null)
    {
        // No selector configured: leave the context without one.
        return parseContext;
    }
    parseContext.set(DocumentSelector.class, embeddedSelector);
    return parseContext;
}
@SuppressWarnings("deprecation")
@Override
protected Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
@@ -278,11 +320,12 @@ public abstract class TikaPoweredMetadataExtracter
{
is = getInputStream(reader);
Parser parser = getParser();
ParseContext context = new ParseContext();
Metadata metadata = new Metadata();
metadata.add(Metadata.CONTENT_TYPE, reader.getMimetype());
ParseContext context = buildParseContext(metadata, reader.getMimetype());
ContentHandler handler;
Map<String,String> headers = null;
if(needHeaderContents())