Merged HEAD-BUG-FIX (4.3/Cloud) to HEAD (4.3/Cloud)

68540: Merged V4.2-BUG-FIX (4.2.3) to HEAD-BUG-FIX (4.3/Cloud)
      68479: MNT-11225: Problem with pdf-text extraction, spaces between letters in many of the words
         - Added a pdfParserConfig field (of type PDFParserConfig) which is added to the ParseContext if it has been set


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@70422 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Alan Davis
2014-05-16 16:39:52 +00:00
parent 09ac1c52ff
commit cf0089b9b7

View File

@@ -19,8 +19,12 @@
package org.alfresco.repo.content.transform;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.parser.pdf.PDFParserConfig;
/**
* Uses <a href="http://tika.apache.org/">Apache Tika</a> and
@@ -32,6 +36,8 @@ import org.apache.tika.parser.pdf.PDFParser;
*/
public class PdfBoxContentTransformer extends TikaPoweredContentTransformer
{
protected PDFParserConfig pdfParserConfig;
public PdfBoxContentTransformer() {
super(new String[] {
MimetypeMap.MIMETYPE_PDF
@@ -42,4 +48,28 @@ public class PdfBoxContentTransformer extends TikaPoweredContentTransformer
protected Parser getParser() {
    // A new PDFParser instance is created on every call; nothing is cached here.
    final Parser pdfParser = new PDFParser();
    return pdfParser;
}
/**
 * Stores the {@link PDFParserConfig} that this transformer hands to the PDF parser.
 * <p>
 * When a non-null config is stored, {@code buildParseContext} registers it in the
 * {@link ParseContext} under {@code PDFParserConfig.class}; when it is {@code null}
 * the context is left as the superclass built it. Typical use is tuning settings
 * such as {@code spacingTolerance}.
 *
 * @param pdfParserConfig the parser configuration to use, or {@code null} to add none
 */
public void setPdfParserConfig(final PDFParserConfig pdfParserConfig)
{
    this.pdfParserConfig = pdfParserConfig;
}
/**
 * Builds the Tika {@link ParseContext} for a transformation, adding the configured
 * {@link PDFParserConfig} when one has been set.
 * <p>
 * The context is obtained from the superclass first. If {@code pdfParserConfig} is
 * non-null it is then registered under {@code PDFParserConfig.class}; otherwise the
 * superclass' context is returned unmodified.
 *
 * @param metadata       passed through unchanged to the superclass implementation
 * @param targetMimeType passed through unchanged to the superclass implementation
 * @param options        passed through unchanged to the superclass implementation
 * @return the context built by the superclass, with the PDF parser config added if configured
 */
@Override
protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options)
{
    ParseContext context = super.buildParseContext(metadata, targetMimeType, options);
    if (pdfParserConfig != null)
    {
        // Deliberately silent: this runs for every PDF transformation, so it must not
        // write diagnostics to stdout.
        context.set(PDFParserConfig.class, pdfParserConfig);
    }
    // TODO: Possibly extend TransformationOptions to allow for per-transform PDFParserConfig?
    return context;
}
}