Merge branch 'fix/REPO-3626' into 'master'

REPO-3626: Added a new parameter, notExtractBookmarksText, to the Tika transformer.

See merge request Repository/alfresco-docker-transformers!8
This commit is contained in:
Andreea Nechifor 2018-07-24 12:50:00 +01:00
commit d34147fa9a
4 changed files with 39 additions and 8 deletions

View File

@ -22,6 +22,7 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser; import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.apache.tika.parser.pdf.PDFParser; import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.parser.pkg.PackageParser; import org.apache.tika.parser.pkg.PackageParser;
import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler; import org.apache.tika.sax.ExpandedTitleContentHandler;
@ -429,6 +430,7 @@ public class Tika
public static final String TARGET_MIMETYPE = "--targetMimetype="; public static final String TARGET_MIMETYPE = "--targetMimetype=";
public static final String TARGET_ENCODING = "--targetEncoding="; public static final String TARGET_ENCODING = "--targetEncoding=";
public static final String INCLUDE_CONTENTS = "--includeContents"; public static final String INCLUDE_CONTENTS = "--includeContents";
public static final String NOT_EXTRACT_BOOKMARKS_TEXT = "--notExtractBookmarksText";
public static final String CSV = "csv"; public static final String CSV = "csv";
public static final String DOC = "doc"; public static final String DOC = "doc";
@ -449,6 +451,7 @@ public class Tika
private Parser autoDetectParser; private Parser autoDetectParser;
private Parser ooXmlParser = new OOXMLParser(); private Parser ooXmlParser = new OOXMLParser();
private Parser tikaOfficeDetectParser = new TikaOfficeDetectParser(); private Parser tikaOfficeDetectParser = new TikaOfficeDetectParser();
private PDFParserConfig pdfParserConfig = new PDFParserConfig();
private DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector() private DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector()
{ {
@ -505,6 +508,7 @@ public class Tika
String sourceFilename = null; String sourceFilename = null;
String targetFilename = null; String targetFilename = null;
Boolean includeContents = null; Boolean includeContents = null;
Boolean notExtractBookmarksText = null;
for (String arg: args) for (String arg: args)
{ {
@ -523,6 +527,11 @@ public class Tika
{ {
targetMimetype = getValue(arg, true, targetMimetype, TARGET_MIMETYPE); targetMimetype = getValue(arg, true, targetMimetype, TARGET_MIMETYPE);
} }
else if (arg.startsWith(NOT_EXTRACT_BOOKMARKS_TEXT))
{
getValue(arg, false, notExtractBookmarksText, NOT_EXTRACT_BOOKMARKS_TEXT);
notExtractBookmarksText = true;
}
else else
{ {
throw new IllegalArgumentException("Unexpected argument "+arg); throw new IllegalArgumentException("Unexpected argument "+arg);
@ -553,8 +562,9 @@ public class Tika
throw new IllegalArgumentException("Missing arguments"); throw new IllegalArgumentException("Missing arguments");
} }
includeContents = includeContents == null ? false : includeContents; includeContents = includeContents == null ? false : includeContents;
notExtractBookmarksText = notExtractBookmarksText == null ? false : notExtractBookmarksText;
transform(transform, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding); transform(transform, includeContents, notExtractBookmarksText, sourceFilename, targetFilename, targetMimetype, targetEncoding);
} }
private String getValue(String arg, boolean valueExpected, Object value, String optionName) private String getValue(String arg, boolean valueExpected, Object value, String optionName)
@ -577,6 +587,7 @@ public class Tika
// Adds transform specific values such as parser and documentSelector. // Adds transform specific values such as parser and documentSelector.
private void transform(String transform, Boolean includeContents, private void transform(String transform, Boolean includeContents,
Boolean notExtractBookmarksText,
String sourceFilename, String sourceFilename,
String targetFilename, String targetMimetype, String targetEncoding) String targetFilename, String targetMimetype, String targetEncoding)
{ {
@ -608,11 +619,12 @@ public class Tika
break; break;
} }
transform(parser, documentSelector, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding); transform(parser, documentSelector, includeContents, notExtractBookmarksText, sourceFilename, targetFilename, targetMimetype, targetEncoding);
} }
private void transform(Parser parser, DocumentSelector documentSelector, Boolean includeContents, private void transform(Parser parser, DocumentSelector documentSelector, Boolean includeContents,
Boolean notExtractBookmarksText,
String sourceFilename, String sourceFilename,
String targetFilename, String targetMimetype, String targetEncoding) String targetFilename, String targetMimetype, String targetEncoding)
{ {
@ -626,7 +638,7 @@ public class Tika
os = new FileOutputStream(targetFilename); os = new FileOutputStream(targetFilename);
ow = new BufferedWriter(new OutputStreamWriter(os, targetEncoding)); ow = new BufferedWriter(new OutputStreamWriter(os, targetEncoding));
Metadata metadata = new Metadata(); Metadata metadata = new Metadata();
ParseContext context = buildParseContext(documentSelector, includeContents); ParseContext context = buildParseContext(documentSelector, includeContents, notExtractBookmarksText);
ContentHandler handler = getContentHandler(targetMimetype, ow); ContentHandler handler = getContentHandler(targetMimetype, ow);
parser.parse(is, handler, metadata, context); parser.parse(is, handler, metadata, context);
@ -780,15 +792,21 @@ public class Tika
} }
} }
protected ParseContext buildParseContext(DocumentSelector documentSelector, Boolean includeContents) protected ParseContext buildParseContext(DocumentSelector documentSelector, Boolean includeContents, Boolean notExtractBookmarksText)
{ {
ParseContext context = new ParseContext(); ParseContext context = new ParseContext();
if (documentSelector != null) if (documentSelector != null)
{ {
context.set(DocumentSelector.class, documentSelector); context.set(DocumentSelector.class, documentSelector);
} }
// pdfParserConfig is never set in the original repo code, so code removed here. if (notExtractBookmarksText.equals(true))
{
pdfParserConfig.setExtractBookmarksText(false);
// pdfParserConfig is set to override default settings
context.set(PDFParserConfig.class, pdfParserConfig);
}
// If Archive transform // If Archive transform
if (includeContents != null) if (includeContents != null)
@ -798,4 +816,5 @@ public class Tika
return context; return context;
} }
} }

View File

@ -113,7 +113,9 @@ public class TikaController extends AbstractTransformerController
@RequestParam(value = "testDelay", required = false) Long testDelay, @RequestParam(value = "testDelay", required = false) Long testDelay,
@RequestParam(value = "transform") String transform, @RequestParam(value = "transform") String transform,
@RequestParam(value="includeContents", required = false) Boolean includeContents) @RequestParam(value="includeContents", required = false) Boolean includeContents,
@RequestParam(value="notExtractBookmarksText", required = false) Boolean notExtractBookmarksText)
{ {
if (!TRANSFORM_NAMES.contains(transform)) if (!TRANSFORM_NAMES.contains(transform))
{ {
@ -130,6 +132,7 @@ public class TikaController extends AbstractTransformerController
callTransform(sourceFile, targetFile, transform, callTransform(sourceFile, targetFile, transform,
includeContents != null && includeContents ? INCLUDE_CONTENTS : null, includeContents != null && includeContents ? INCLUDE_CONTENTS : null,
notExtractBookmarksText != null && notExtractBookmarksText ? NOT_EXTRACT_BOOKMARKS_TEXT: null,
TARGET_MIMETYPE+targetMimetype, TARGET_ENCODING+targetEncoding); TARGET_MIMETYPE+targetMimetype, TARGET_ENCODING+targetEncoding);
return createAttachment(targetFilename, targetFile, testDelay); return createAttachment(targetFilename, targetFile, testDelay);

View File

@ -25,8 +25,8 @@
<tr><td><div style="text-align:right">includeContents (archive) *</div></td><td><input type="checkbox" name="includeContents" value="true" /></td></tr> <tr><td><div style="text-align:right">includeContents (archive) *</div></td><td><input type="checkbox" name="includeContents" value="true" /></td></tr>
<tr><td><div style="text-align:right">timeout</div></td><td><input type="text" name="timeout" value="" /></td></tr> <tr><td><div style="text-align:right">timeout</div></td><td><input type="text" name="timeout" value="" /></td></tr>
<tr><td><div style="text-align:right">testDelay</div></td><td><input type="text" name="testDelay" value="" /></td></tr> <tr><td><div style="text-align:right">testDelay</div></td><td><input type="text" name="testDelay" value="" /></td></tr>
<tr><td><div style="text-align:right">notExtractBookmarksText</div></td><td><input type="checkbox" name="notExtractBookmarksText" value="true" /></td></tr>
<tr><td></td><td><input type="submit" value="Transform" /></td></tr> <tr><td></td><td><input type="submit" value="Transform" /></td></tr>
</table> </table>
</form> </form>
</div> </div>

View File

@ -341,4 +341,13 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
{ {
transform(TEXT_MINING, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS); transform(TEXT_MINING, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
} }
/**
 * Checks that a PDF to TXT request to {@code /transform} that carries
 * {@code notExtractBookmarksText=true} is accepted: the response must have
 * status 200 and a {@code Content-Disposition} attachment header naming
 * {@code quick.<targetExtension>}.
 *
 * NOTE(review): only the status and the header are asserted here; nothing in
 * this test verifies that the flag actually reaches the transform command or
 * changes the extracted text.
 *
 * @throws Exception if the mocked request cannot be performed
 */
@Test
public void pdfToTxtExtractBookmarksTest() throws Exception
{
// Set up the mocked transform for PDF -> TXT before issuing the request.
// Presumably this also populates sourceFile and targetExtension used below; confirm in the base class.
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
// Same request shape as a plain transform, plus the notExtractBookmarksText=true parameter.
mockMvc.perform(mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension).param("notExtractBookmarksText", "true"))
.andExpect(status().is(200))
.andExpect(header().string("Content-Disposition", "attachment; filename*= UTF-8''quick." + targetExtension));
}
} }