mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Merged HEAD-BUG-FIX (4.3/Cloud) to HEAD (4.3/Cloud)
68525: Merged V4.2-BUG-FIX (4.2.3) to HEAD-BUG-FIX (4.3/Cloud) 68281: MNT-11350: Upgrade of Tika to 1.6-SNAPSHOT - Upgraded patched Tika, patched PDFBox, and other dependencies - Added a MediaTypeDisablingDocumentSelector to disable parsing of certain embedded attachments - Added TikaPoweredMetadataExtracter.buildParseContext method which sets the context's DocumentSelector if specified - Added setting of DocumentSelector if specified to TikaPoweredContentTransformer - Added MediaTypeDisablingDocumentSelector to the transformer.PdfBox bean config which disables parsing of embedded images - Updated PdfBoxContentTransformerTest to disable the parsing of embedded images git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@70407 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -39,6 +39,7 @@ import org.alfresco.service.cmr.repository.datatype.TypeConversionException;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.tika.embedder.Embedder;
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.io.TemporaryResources;
|
||||
import org.apache.tika.io.TikaInputStream;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
@@ -94,6 +95,7 @@ public abstract class TikaPoweredMetadataExtracter
|
||||
|
||||
private DateTimeFormatter tikaUTCDateFormater;
|
||||
private DateTimeFormatter tikaDateFormater;
|
||||
protected DocumentSelector documentSelector;
|
||||
|
||||
/**
|
||||
* Builds up a list of supported mime types by merging
|
||||
@@ -267,6 +269,46 @@ public abstract class TikaPoweredMetadataExtracter
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the document selector, used for determining whether to parse embedded resources.
|
||||
*
|
||||
* @param documentSelector
|
||||
*/
|
||||
public void setDocumentSelector(DocumentSelector documentSelector)
|
||||
{
|
||||
this.documentSelector = documentSelector;
|
||||
}
|
||||
/**
|
||||
* Gets the document selector, used for determining whether to parse embedded resources,
|
||||
* null by default so parse all.
|
||||
*
|
||||
* @param metadata
|
||||
* @param sourceMimeType
|
||||
* @return the document selector
|
||||
*/
|
||||
protected DocumentSelector getDocumentSelector(Metadata metadata, String targetMimeType)
|
||||
{
|
||||
return documentSelector;
|
||||
}
|
||||
|
||||
/**
|
||||
* By default returns a new ParseContent
|
||||
*
|
||||
* @param metadata
|
||||
* @param sourceMimeType
|
||||
* @return the parse context
|
||||
*/
|
||||
protected ParseContext buildParseContext(Metadata metadata, String sourceMimeType)
|
||||
{
|
||||
ParseContext context = new ParseContext();
|
||||
DocumentSelector selector = getDocumentSelector(metadata, sourceMimeType);
|
||||
if (selector != null)
|
||||
{
|
||||
context.set(DocumentSelector.class, selector);
|
||||
}
|
||||
return context;
|
||||
}
|
||||
|
||||
@SuppressWarnings("deprecation")
|
||||
@Override
|
||||
protected Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
|
||||
@@ -278,11 +320,12 @@ public abstract class TikaPoweredMetadataExtracter
|
||||
{
|
||||
is = getInputStream(reader);
|
||||
Parser parser = getParser();
|
||||
ParseContext context = new ParseContext();
|
||||
|
||||
Metadata metadata = new Metadata();
|
||||
metadata.add(Metadata.CONTENT_TYPE, reader.getMimetype());
|
||||
|
||||
ParseContext context = buildParseContext(metadata, reader.getMimetype());
|
||||
|
||||
ContentHandler handler;
|
||||
Map<String,String> headers = null;
|
||||
if(needHeaderContents())
|
||||
|
Reference in New Issue
Block a user