diff --git a/source/java/org/alfresco/repo/content/transform/PdfBoxContentTransformer.java b/source/java/org/alfresco/repo/content/transform/PdfBoxContentTransformer.java
index 72ecef9308..08e3b4df12 100644
--- a/source/java/org/alfresco/repo/content/transform/PdfBoxContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/PdfBoxContentTransformer.java
@@ -19,8 +19,12 @@
 package org.alfresco.repo.content.transform;
 
 import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.service.cmr.repository.TransformationOptions;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.pdf.PDFParser;
+import org.apache.tika.parser.pdf.PDFParserConfig;
 
 /**
  * Uses {@link http://tika.apache.org/ Apache Tika} and
@@ -32,6 +36,9 @@ import org.apache.tika.parser.pdf.PDFParser;
  */
 public class PdfBoxContentTransformer extends TikaPoweredContentTransformer
 {
+    /** Optional PDFBox parser configuration; when non-null it is set on the ParseContext in buildParseContext. */
+    protected PDFParserConfig pdfParserConfig;
+
     public PdfBoxContentTransformer() {
         super(new String[] {
                 MimetypeMap.MIMETYPE_PDF
@@ -42,4 +49,31 @@ public class PdfBoxContentTransformer extends TikaPoweredContentTransformer
     protected Parser getParser() {
         return new PDFParser();
     }
+
+    /**
+     * Sets the PDFParserConfig for inclusion in the ParseContext sent to the PDFBox parser,
+     * useful for setting config like spacingTolerance.
+     *
+     * @param pdfParserConfig the config to add to each ParseContext, or null to add none
+     */
+    public void setPdfParserConfig(PDFParserConfig pdfParserConfig)
+    {
+        this.pdfParserConfig = pdfParserConfig;
+    }
+
+    /**
+     * Builds the ParseContext via the superclass and, when a PDFParserConfig has been
+     * set on this transformer, registers it under PDFParserConfig.class.
+     */
+    @Override
+    protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options)
+    {
+        ParseContext context = super.buildParseContext(metadata, targetMimeType, options);
+        if (pdfParserConfig != null)
+        {
+            context.set(PDFParserConfig.class, pdfParserConfig);
+        }
+        // TODO: Possibly extend TransformationOptions to allow for per-transform PDFParserConfig?
+        return context;
+    }
 }