mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Merged HEAD-BUG-FIX (4.3/Cloud) to HEAD (4.3/Cloud)
68540: Merged V4.2-BUG-FIX (4.2.3) to HEAD-BUG-FIX (4.3/Cloud) 68479: MNT-11225: Problem with pdf-text extraction, spaces between letters in many of the words - Added PdfParserConfig config field which is added to the ParseContext if present git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@70422 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -19,8 +19,12 @@
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.pdf.PDFParser;
|
||||
import org.apache.tika.parser.pdf.PDFParserConfig;
|
||||
|
||||
/**
|
||||
* Uses <a href="http://tika.apache.org/">Apache Tika</a> and
|
||||
@@ -32,6 +36,8 @@ import org.apache.tika.parser.pdf.PDFParser;
|
||||
*/
|
||||
public class PdfBoxContentTransformer extends TikaPoweredContentTransformer
|
||||
{
|
||||
protected PDFParserConfig pdfParserConfig;
|
||||
|
||||
public PdfBoxContentTransformer() {
|
||||
super(new String[] {
|
||||
MimetypeMap.MIMETYPE_PDF
|
||||
@@ -42,4 +48,28 @@ public class PdfBoxContentTransformer extends TikaPoweredContentTransformer
|
||||
protected Parser getParser() {
|
||||
return new PDFParser();
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the PDFParserConfig for inclusion in the ParseContext sent to the PDFBox parser,
|
||||
* useful for setting config like spacingTolerance.
|
||||
*
|
||||
* @param pdfParserConfig
|
||||
*/
|
||||
public void setPdfParserConfig(PDFParserConfig pdfParserConfig)
|
||||
{
|
||||
this.pdfParserConfig = pdfParserConfig;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options)
|
||||
{
|
||||
ParseContext context = super.buildParseContext(metadata, targetMimeType, options);
|
||||
if (pdfParserConfig != null)
|
||||
{
|
||||
System.out.println("**** spacingTolerance=" + pdfParserConfig.getSpacingTolerance() + ", averageCharTolerance=" + pdfParserConfig.getAverageCharTolerance());
|
||||
context.set(PDFParserConfig.class, pdfParserConfig);
|
||||
}
|
||||
// TODO: Possibly extend TransformationOptions to allow for per-transform PDFParserConfig?
|
||||
return context;
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user