diff --git a/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Tika.java b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Tika.java index a7091002..c97993cb 100644 --- a/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Tika.java +++ b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Tika.java @@ -22,6 +22,7 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.microsoft.OfficeParser; import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; import org.apache.tika.parser.pdf.PDFParser; +import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.parser.pkg.PackageParser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ExpandedTitleContentHandler; @@ -429,6 +430,7 @@ public class Tika public static final String TARGET_MIMETYPE = "--targetMimetype="; public static final String TARGET_ENCODING = "--targetEncoding="; public static final String INCLUDE_CONTENTS = "--includeContents"; + public static final String NOT_EXTRACT_BOOKMARKS_TEXT = "--notExtractBookmarksText"; public static final String CSV = "csv"; public static final String DOC = "doc"; @@ -449,6 +451,7 @@ public class Tika private Parser autoDetectParser; private Parser ooXmlParser = new OOXMLParser(); private Parser tikaOfficeDetectParser = new TikaOfficeDetectParser(); + private PDFParserConfig pdfParserConfig = new PDFParserConfig(); private DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector() { @@ -505,6 +508,7 @@ public class Tika String sourceFilename = null; String targetFilename = null; Boolean includeContents = null; + Boolean notExtractBookmarksText = null; for (String arg: args) { @@ -523,6 +527,11 @@ public class Tika { targetMimetype = getValue(arg, true, targetMimetype, TARGET_MIMETYPE); } + else if (arg.startsWith(NOT_EXTRACT_BOOKMARKS_TEXT)) + { + getValue(arg, false, notExtractBookmarksText, NOT_EXTRACT_BOOKMARKS_TEXT); + notExtractBookmarksText = true; + } else { throw new IllegalArgumentException("Unexpected argument "+arg); @@ -553,8 +562,9 @@ public class Tika throw new IllegalArgumentException("Missing arguments"); } includeContents = includeContents == null ? false : includeContents; + notExtractBookmarksText = notExtractBookmarksText == null ? false : notExtractBookmarksText; - transform(transform, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding); + transform(transform, includeContents, notExtractBookmarksText, sourceFilename, targetFilename, targetMimetype, targetEncoding); } private String getValue(String arg, boolean valueExpected, Object value, String optionName) @@ -577,6 +587,7 @@ public class Tika // Adds transform specific values such as parser and documentSelector. private void transform(String transform, Boolean includeContents, + Boolean notExtractBookmarksText, String sourceFilename, String targetFilename, String targetMimetype, String targetEncoding) { @@ -608,11 +619,12 @@ public class Tika break; } - transform(parser, documentSelector, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding); + transform(parser, documentSelector, includeContents, notExtractBookmarksText, sourceFilename, targetFilename, targetMimetype, targetEncoding); } private void transform(Parser parser, DocumentSelector documentSelector, Boolean includeContents, + Boolean notExtractBookmarksText, String sourceFilename, String targetFilename, String targetMimetype, String targetEncoding) { @@ -626,7 +638,7 @@ public class Tika os = new FileOutputStream(targetFilename); ow = new BufferedWriter(new OutputStreamWriter(os, targetEncoding)); Metadata metadata = new Metadata(); - ParseContext context = buildParseContext(documentSelector, includeContents); + ParseContext context = buildParseContext(documentSelector, includeContents, notExtractBookmarksText); ContentHandler handler = getContentHandler(targetMimetype, ow); parser.parse(is, handler, metadata, context); @@ -780,15 +792,21 @@ public class Tika } } - protected ParseContext buildParseContext(DocumentSelector documentSelector, Boolean includeContents) + protected ParseContext buildParseContext(DocumentSelector documentSelector, Boolean includeContents, Boolean notExtractBookmarksText) { ParseContext context = new ParseContext(); + if (documentSelector != null) { context.set(DocumentSelector.class, documentSelector); } - // pdfParserConfig is never set in the original repo code, so code removed here. + if (notExtractBookmarksText.equals(true)) + { + pdfParserConfig.setExtractBookmarksText(false); + // pdfParserConfig is set to override default settings + context.set(PDFParserConfig.class, pdfParserConfig); + } // If Archive transform if (includeContents != null) @@ -798,4 +816,5 @@ public class Tika return context; } + } diff --git a/alfresco-docker-tika/src/main/java/org/alfresco/transformer/TikaController.java b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/TikaController.java index 8c365f0c..6c23e9d6 100644 --- a/alfresco-docker-tika/src/main/java/org/alfresco/transformer/TikaController.java +++ b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/TikaController.java @@ -113,7 +113,9 @@ public class TikaController extends AbstractTransformerController @RequestParam(value = "testDelay", required = false) Long testDelay, @RequestParam(value = "transform") String transform, - @RequestParam(value="includeContents", required = false) Boolean includeContents) + @RequestParam(value="includeContents", required = false) Boolean includeContents, + @RequestParam(value="notExtractBookmarksText", required = false) Boolean notExtractBookmarksText) + { if (!TRANSFORM_NAMES.contains(transform)) { @@ -130,6 +132,7 @@ public class TikaController extends AbstractTransformerController callTransform(sourceFile, targetFile, transform, includeContents != null && includeContents ? INCLUDE_CONTENTS : null, + notExtractBookmarksText != null && notExtractBookmarksText ? NOT_EXTRACT_BOOKMARKS_TEXT: null, TARGET_MIMETYPE+targetMimetype, TARGET_ENCODING+targetEncoding); return createAttachment(targetFilename, targetFile, testDelay); diff --git a/alfresco-docker-tika/src/main/resources/templates/transformForm.html b/alfresco-docker-tika/src/main/resources/templates/transformForm.html index 5e230dbf..9171491e 100644 --- a/alfresco-docker-tika/src/main/resources/templates/transformForm.html +++ b/alfresco-docker-tika/src/main/resources/templates/transformForm.html @@ -25,8 +25,8 @@