From cf0089b9b7f85c9a15e3a64a41102a5f6bb35f7a Mon Sep 17 00:00:00 2001
From: Alan Davis
Date: Fri, 16 May 2014 16:39:52 +0000
Subject: [PATCH] Merged HEAD-BUG-FIX (4.3/Cloud) to HEAD (4.3/Cloud)
 68540: Merged V4.2-BUG-FIX (4.2.3) to HEAD-BUG-FIX (4.3/Cloud)
 68479: MNT-11225: Problem with pdf-text extraction, spaces between letters in many of the words
 - Added PdfParserConfig config field which is added to the ParseContext if present

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@70422 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
---
 .../transform/PdfBoxContentTransformer.java   | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/source/java/org/alfresco/repo/content/transform/PdfBoxContentTransformer.java b/source/java/org/alfresco/repo/content/transform/PdfBoxContentTransformer.java
index 72ecef9308..08e3b4df12 100644
--- a/source/java/org/alfresco/repo/content/transform/PdfBoxContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/PdfBoxContentTransformer.java
@@ -19,8 +19,12 @@
 package org.alfresco.repo.content.transform;
 
 import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.service.cmr.repository.TransformationOptions;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.pdf.PDFParser;
+import org.apache.tika.parser.pdf.PDFParserConfig;
 
 /**
  * Uses {@link http://tika.apache.org/ Apache Tika} and
@@ -32,6 +36,8 @@ import org.apache.tika.parser.pdf.PDFParser;
  */
 public class PdfBoxContentTransformer extends TikaPoweredContentTransformer
 {
+    protected PDFParserConfig pdfParserConfig;
+
     public PdfBoxContentTransformer() {
         super(new String[] {
             MimetypeMap.MIMETYPE_PDF
@@ -42,4 +48,31 @@ public class PdfBoxContentTransformer extends TikaPoweredContentTransformer
     protected Parser getParser() {
         return new PDFParser();
     }
+
+    /**
+     * Sets the PDFParserConfig for inclusion in the ParseContext sent to the PDFBox parser,
+     * useful for setting config like spacingTolerance.
+     *
+     * @param pdfParserConfig the config to add to the ParseContext, or null to add none
+     */
+    public void setPdfParserConfig(PDFParserConfig pdfParserConfig)
+    {
+        this.pdfParserConfig = pdfParserConfig;
+    }
+
+    /**
+     * Builds the ParseContext via the superclass and, when a PDFParserConfig has been set,
+     * registers it in the context under PDFParserConfig.class.
+     */
+    @Override
+    protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options)
+    {
+        ParseContext context = super.buildParseContext(metadata, targetMimeType, options);
+        if (pdfParserConfig != null)
+        {
+            context.set(PDFParserConfig.class, pdfParserConfig);
+        }
+        // TODO: Possibly extend TransformationOptions to allow for per-transform PDFParserConfig?
+        return context;
+    }
 }