mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Merged HEAD-BUG-FIX (4.3/Cloud) to HEAD (4.3/Cloud)
68540: Merged V4.2-BUG-FIX (4.2.3) to HEAD-BUG-FIX (4.3/Cloud) 68479: MNT-11225: Problem with pdf-text extraction, spaces between letters in many of the words - Added PdfParserConfig config field which is added to the ParseContext if present git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@70422 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -19,8 +19,12 @@
|
|||||||
package org.alfresco.repo.content.transform;
|
package org.alfresco.repo.content.transform;
|
||||||
|
|
||||||
import org.alfresco.repo.content.MimetypeMap;
|
import org.alfresco.repo.content.MimetypeMap;
|
||||||
|
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||||
|
import org.apache.tika.metadata.Metadata;
|
||||||
|
import org.apache.tika.parser.ParseContext;
|
||||||
import org.apache.tika.parser.Parser;
|
import org.apache.tika.parser.Parser;
|
||||||
import org.apache.tika.parser.pdf.PDFParser;
|
import org.apache.tika.parser.pdf.PDFParser;
|
||||||
|
import org.apache.tika.parser.pdf.PDFParserConfig;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Uses {@link http://tika.apache.org/ Apache Tika} and
|
* Uses {@link http://tika.apache.org/ Apache Tika} and
|
||||||
@@ -32,6 +36,8 @@ import org.apache.tika.parser.pdf.PDFParser;
|
|||||||
*/
|
*/
|
||||||
public class PdfBoxContentTransformer extends TikaPoweredContentTransformer
|
public class PdfBoxContentTransformer extends TikaPoweredContentTransformer
|
||||||
{
|
{
|
||||||
|
protected PDFParserConfig pdfParserConfig;
|
||||||
|
|
||||||
public PdfBoxContentTransformer() {
|
public PdfBoxContentTransformer() {
|
||||||
super(new String[] {
|
super(new String[] {
|
||||||
MimetypeMap.MIMETYPE_PDF
|
MimetypeMap.MIMETYPE_PDF
|
||||||
@@ -42,4 +48,28 @@ public class PdfBoxContentTransformer extends TikaPoweredContentTransformer
|
|||||||
protected Parser getParser() {
|
protected Parser getParser() {
|
||||||
return new PDFParser();
|
return new PDFParser();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the PDFParserConfig for inclusion in the ParseContext sent to the PDFBox parser,
|
||||||
|
* useful for setting config like spacingTolerance.
|
||||||
|
*
|
||||||
|
* @param pdfParserConfig
|
||||||
|
*/
|
||||||
|
public void setPdfParserConfig(PDFParserConfig pdfParserConfig)
|
||||||
|
{
|
||||||
|
this.pdfParserConfig = pdfParserConfig;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options)
|
||||||
|
{
|
||||||
|
ParseContext context = super.buildParseContext(metadata, targetMimeType, options);
|
||||||
|
if (pdfParserConfig != null)
|
||||||
|
{
|
||||||
|
System.out.println("**** spacingTolerance=" + pdfParserConfig.getSpacingTolerance() + ", averageCharTolerance=" + pdfParserConfig.getAverageCharTolerance());
|
||||||
|
context.set(PDFParserConfig.class, pdfParserConfig);
|
||||||
|
}
|
||||||
|
// TODO: Possibly extend TransformationOptions to allow for per-transform PDFParserConfig?
|
||||||
|
return context;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user