From 4d2d4acce7d3b4c1c73db313fb9052617c1e7d64 Mon Sep 17 00:00:00 2001 From: Andreea Nechifor Date: Tue, 24 Jul 2018 10:59:03 +0300 Subject: [PATCH 1/2] REPO-3626: added a new parameter notExtractBookmarksText --- .../java/org/alfresco/transformer/Tika.java | 34 ++++++++++++++++--- .../alfresco/transformer/TikaController.java | 5 ++- .../resources/templates/transformForm.html | 2 +- .../transformer/TikaControllerTest.java | 10 ++++++ 4 files changed, 44 insertions(+), 7 deletions(-) diff --git a/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Tika.java b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Tika.java index a7091002..cd451cce 100644 --- a/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Tika.java +++ b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Tika.java @@ -22,6 +22,7 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.microsoft.OfficeParser; import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; import org.apache.tika.parser.pdf.PDFParser; +import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.parser.pkg.PackageParser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ExpandedTitleContentHandler; @@ -429,6 +430,7 @@ public class Tika public static final String TARGET_MIMETYPE = "--targetMimetype="; public static final String TARGET_ENCODING = "--targetEncoding="; public static final String INCLUDE_CONTENTS = "--includeContents"; + public static final String NOT_EXTRACT_BOOKMARKS_TEXT = "--notExtractBookmarksText"; public static final String CSV = "csv"; public static final String DOC = "doc"; @@ -449,6 +451,7 @@ public class Tika private Parser autoDetectParser; private Parser ooXmlParser = new OOXMLParser(); private Parser tikaOfficeDetectParser = new TikaOfficeDetectParser(); + private PDFParserConfig pdfParserConfig = new PDFParserConfig(); private DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector() { @@ -505,6 +508,7 @@ public class Tika String sourceFilename = null; String targetFilename = null; Boolean includeContents = null; + Boolean notExtractBookmarksText = null; for (String arg: args) { @@ -523,6 +527,11 @@ public class Tika { targetMimetype = getValue(arg, true, targetMimetype, TARGET_MIMETYPE); } + else if(arg.startsWith(NOT_EXTRACT_BOOKMARKS_TEXT)) + { + getValue(arg, false, notExtractBookmarksText, NOT_EXTRACT_BOOKMARKS_TEXT); + notExtractBookmarksText = true; + } else { throw new IllegalArgumentException("Unexpected argument "+arg); @@ -553,8 +562,9 @@ public class Tika throw new IllegalArgumentException("Missing arguments"); } includeContents = includeContents == null ? false : includeContents; + notExtractBookmarksText = notExtractBookmarksText == null ? false : notExtractBookmarksText; - transform(transform, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding); + transform(transform, includeContents, notExtractBookmarksText,sourceFilename, targetFilename, targetMimetype, targetEncoding); } private String getValue(String arg, boolean valueExpected, Object value, String optionName) @@ -577,6 +587,7 @@ public class Tika // Adds transform specific values such as parser and documentSelector. private void transform(String transform, Boolean includeContents, + Boolean notExtractBookmarksText, String sourceFilename, String targetFilename, String targetMimetype, String targetEncoding) { @@ -608,11 +619,12 @@ public class Tika break; } - transform(parser, documentSelector, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding); + transform(parser, documentSelector, includeContents, notExtractBookmarksText, sourceFilename, targetFilename, targetMimetype, targetEncoding); } private void transform(Parser parser, DocumentSelector documentSelector, Boolean includeContents, + Boolean notExtractBookmarksText, String sourceFilename, String targetFilename, String targetMimetype, String targetEncoding) { @@ -626,7 +638,7 @@ public class Tika os = new FileOutputStream(targetFilename); ow = new BufferedWriter(new OutputStreamWriter(os, targetEncoding)); Metadata metadata = new Metadata(); - ParseContext context = buildParseContext(documentSelector, includeContents); + ParseContext context = buildParseContext(documentSelector, includeContents, notExtractBookmarksText); ContentHandler handler = getContentHandler(targetMimetype, ow); parser.parse(is, handler, metadata, context); @@ -780,15 +792,26 @@ public class Tika } } - protected ParseContext buildParseContext(DocumentSelector documentSelector, Boolean includeContents) + protected ParseContext buildParseContext(DocumentSelector documentSelector, Boolean includeContents, Boolean notExtractBookmarksText) { ParseContext context = new ParseContext(); + if (documentSelector != null) { context.set(DocumentSelector.class, documentSelector); } - // pdfParserConfig is never set in the original repo code, so code removed here. + if (pdfParserConfig != null) + { + + if (notExtractBookmarksText != null) + { + pdfParserConfig.setExtractBookmarksText(!notExtractBookmarksText); + } + + // pdfParserConfig is set to override default settings + context.set(PDFParserConfig.class, pdfParserConfig); + } // If Archive transform if (includeContents != null) @@ -798,4 +821,5 @@ public class Tika return context; } + } diff --git a/alfresco-docker-tika/src/main/java/org/alfresco/transformer/TikaController.java b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/TikaController.java index 8c365f0c..6c23e9d6 100644 --- a/alfresco-docker-tika/src/main/java/org/alfresco/transformer/TikaController.java +++ b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/TikaController.java @@ -113,7 +113,9 @@ public class TikaController extends AbstractTransformerController @RequestParam(value = "testDelay", required = false) Long testDelay, @RequestParam(value = "transform") String transform, - @RequestParam(value="includeContents", required = false) Boolean includeContents) + @RequestParam(value="includeContents", required = false) Boolean includeContents, + @RequestParam(value="notExtractBookmarksText", required = false) Boolean notExtractBookmarksText) + { if (!TRANSFORM_NAMES.contains(transform)) { @@ -130,6 +132,7 @@ public class TikaController extends AbstractTransformerController callTransform(sourceFile, targetFile, transform, includeContents != null && includeContents ? INCLUDE_CONTENTS : null, + notExtractBookmarksText != null && notExtractBookmarksText ? NOT_EXTRACT_BOOKMARKS_TEXT: null, TARGET_MIMETYPE+targetMimetype, TARGET_ENCODING+targetEncoding); return createAttachment(targetFilename, targetFile, testDelay); diff --git a/alfresco-docker-tika/src/main/resources/templates/transformForm.html b/alfresco-docker-tika/src/main/resources/templates/transformForm.html index 5e230dbf..538e872a 100644 --- a/alfresco-docker-tika/src/main/resources/templates/transformForm.html +++ b/alfresco-docker-tika/src/main/resources/templates/transformForm.html @@ -25,7 +25,7 @@
includeContents (archive) *
timeout
testDelay
- +
notExtractBookmarksText
diff --git a/alfresco-docker-tika/src/test/java/org/alfresco/transformer/TikaControllerTest.java b/alfresco-docker-tika/src/test/java/org/alfresco/transformer/TikaControllerTest.java index bbf14145..5349ee89 100644 --- a/alfresco-docker-tika/src/test/java/org/alfresco/transformer/TikaControllerTest.java +++ b/alfresco-docker-tika/src/test/java/org/alfresco/transformer/TikaControllerTest.java @@ -341,4 +341,14 @@ public class TikaControllerTest extends AbstractTransformerControllerTest { transform(TEXT_MINING, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS); } + + @Test + public void pdfToTxtExtractBookmarksTest() throws Exception + { + + super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true); + mockMvc.perform(mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension).param("notExtractBookmarksText", "true")) + .andExpect(status().is(200)) + .andExpect(header().string("Content-Disposition", "attachment; filename*= UTF-8''quick." + targetExtension)); + } } From a011c2ca39def0c34a77e462a6f33ba9255a0a47 Mon Sep 17 00:00:00 2001 From: Andreea Nechifor Date: Tue, 24 Jul 2018 14:37:40 +0300 Subject: [PATCH 2/2] REPO-3626: changes after review. --- .../java/org/alfresco/transformer/Tika.java | 17 ++++++----------- .../main/resources/templates/transformForm.html | 2 +- .../transformer/TikaControllerTest.java | 1 - 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Tika.java b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Tika.java index cd451cce..c97993cb 100644 --- a/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Tika.java +++ b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Tika.java @@ -527,10 +527,10 @@ public class Tika { targetMimetype = getValue(arg, true, targetMimetype, TARGET_MIMETYPE); } - else if(arg.startsWith(NOT_EXTRACT_BOOKMARKS_TEXT)) + else if (arg.startsWith(NOT_EXTRACT_BOOKMARKS_TEXT)) { - getValue(arg, false, notExtractBookmarksText, NOT_EXTRACT_BOOKMARKS_TEXT); - notExtractBookmarksText = true; + getValue(arg, false, notExtractBookmarksText, NOT_EXTRACT_BOOKMARKS_TEXT); + notExtractBookmarksText = true; } else { @@ -564,7 +564,7 @@ public class Tika includeContents = includeContents == null ? false : includeContents; notExtractBookmarksText = notExtractBookmarksText == null ? false : notExtractBookmarksText; - transform(transform, includeContents, notExtractBookmarksText,sourceFilename, targetFilename, targetMimetype, targetEncoding); + transform(transform, includeContents, notExtractBookmarksText, sourceFilename, targetFilename, targetMimetype, targetEncoding); } private String getValue(String arg, boolean valueExpected, Object value, String optionName) @@ -801,14 +801,9 @@ public class Tika context.set(DocumentSelector.class, documentSelector); } - if (pdfParserConfig != null) + if (notExtractBookmarksText.equals(true)) { - - if (notExtractBookmarksText != null) - { - pdfParserConfig.setExtractBookmarksText(!notExtractBookmarksText); - } - + pdfParserConfig.setExtractBookmarksText(false); // pdfParserConfig is set to override default settings context.set(PDFParserConfig.class, pdfParserConfig); } diff --git a/alfresco-docker-tika/src/main/resources/templates/transformForm.html b/alfresco-docker-tika/src/main/resources/templates/transformForm.html index 538e872a..9171491e 100644 --- a/alfresco-docker-tika/src/main/resources/templates/transformForm.html +++ b/alfresco-docker-tika/src/main/resources/templates/transformForm.html @@ -26,7 +26,7 @@
timeout
testDelay
notExtractBookmarksText
- + diff --git a/alfresco-docker-tika/src/test/java/org/alfresco/transformer/TikaControllerTest.java b/alfresco-docker-tika/src/test/java/org/alfresco/transformer/TikaControllerTest.java index 5349ee89..dd711d73 100644 --- a/alfresco-docker-tika/src/test/java/org/alfresco/transformer/TikaControllerTest.java +++ b/alfresco-docker-tika/src/test/java/org/alfresco/transformer/TikaControllerTest.java @@ -345,7 +345,6 @@ public class TikaControllerTest extends AbstractTransformerControllerTest @Test public void pdfToTxtExtractBookmarksTest() throws Exception { - super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true); mockMvc.perform(mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension).param("notExtractBookmarksText", "true")) .andExpect(status().is(200))