Merge branch 'fix/REPO-3626' into 'master'

REPO-3626: Added a new parameter, notExtractBookmarksText, for the Tika transformer.

See merge request Repository/alfresco-docker-transformers!8
This commit is contained in:
Andreea Nechifor 2018-07-24 12:50:00 +01:00
commit d34147fa9a
4 changed files with 39 additions and 8 deletions

View File

@ -22,6 +22,7 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.parser.pkg.PackageParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
@ -429,6 +430,7 @@ public class Tika
public static final String TARGET_MIMETYPE = "--targetMimetype=";
public static final String TARGET_ENCODING = "--targetEncoding=";
public static final String INCLUDE_CONTENTS = "--includeContents";
public static final String NOT_EXTRACT_BOOKMARKS_TEXT = "--notExtractBookmarksText";
public static final String CSV = "csv";
public static final String DOC = "doc";
@ -449,6 +451,7 @@ public class Tika
private Parser autoDetectParser;
private Parser ooXmlParser = new OOXMLParser();
private Parser tikaOfficeDetectParser = new TikaOfficeDetectParser();
private PDFParserConfig pdfParserConfig = new PDFParserConfig();
private DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector()
{
@ -505,6 +508,7 @@ public class Tika
String sourceFilename = null;
String targetFilename = null;
Boolean includeContents = null;
Boolean notExtractBookmarksText = null;
for (String arg: args)
{
@ -523,6 +527,11 @@ public class Tika
{
targetMimetype = getValue(arg, true, targetMimetype, TARGET_MIMETYPE);
}
else if (arg.startsWith(NOT_EXTRACT_BOOKMARKS_TEXT))
{
getValue(arg, false, notExtractBookmarksText, NOT_EXTRACT_BOOKMARKS_TEXT);
notExtractBookmarksText = true;
}
else
{
throw new IllegalArgumentException("Unexpected argument "+arg);
@ -553,8 +562,9 @@ public class Tika
throw new IllegalArgumentException("Missing arguments");
}
includeContents = includeContents == null ? false : includeContents;
notExtractBookmarksText = notExtractBookmarksText == null ? false : notExtractBookmarksText;
transform(transform, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding);
transform(transform, includeContents, notExtractBookmarksText, sourceFilename, targetFilename, targetMimetype, targetEncoding);
}
private String getValue(String arg, boolean valueExpected, Object value, String optionName)
@ -577,6 +587,7 @@ public class Tika
// Adds transform specific values such as parser and documentSelector.
private void transform(String transform, Boolean includeContents,
Boolean notExtractBookmarksText,
String sourceFilename,
String targetFilename, String targetMimetype, String targetEncoding)
{
@ -608,11 +619,12 @@ public class Tika
break;
}
transform(parser, documentSelector, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding);
transform(parser, documentSelector, includeContents, notExtractBookmarksText, sourceFilename, targetFilename, targetMimetype, targetEncoding);
}
private void transform(Parser parser, DocumentSelector documentSelector, Boolean includeContents,
Boolean notExtractBookmarksText,
String sourceFilename,
String targetFilename, String targetMimetype, String targetEncoding)
{
@ -626,7 +638,7 @@ public class Tika
os = new FileOutputStream(targetFilename);
ow = new BufferedWriter(new OutputStreamWriter(os, targetEncoding));
Metadata metadata = new Metadata();
ParseContext context = buildParseContext(documentSelector, includeContents);
ParseContext context = buildParseContext(documentSelector, includeContents, notExtractBookmarksText);
ContentHandler handler = getContentHandler(targetMimetype, ow);
parser.parse(is, handler, metadata, context);
@ -780,15 +792,21 @@ public class Tika
}
}
protected ParseContext buildParseContext(DocumentSelector documentSelector, Boolean includeContents)
protected ParseContext buildParseContext(DocumentSelector documentSelector, Boolean includeContents, Boolean notExtractBookmarksText)
{
ParseContext context = new ParseContext();
if (documentSelector != null)
{
context.set(DocumentSelector.class, documentSelector);
}
// The original repo code never set pdfParserConfig, so that code has been removed here.
if (notExtractBookmarksText.equals(true))
{
pdfParserConfig.setExtractBookmarksText(false);
// pdfParserConfig is set to override default settings
context.set(PDFParserConfig.class, pdfParserConfig);
}
// If Archive transform
if (includeContents != null)
@ -798,4 +816,5 @@ public class Tika
return context;
}
}

View File

@ -113,7 +113,9 @@ public class TikaController extends AbstractTransformerController
@RequestParam(value = "testDelay", required = false) Long testDelay,
@RequestParam(value = "transform") String transform,
@RequestParam(value="includeContents", required = false) Boolean includeContents)
@RequestParam(value="includeContents", required = false) Boolean includeContents,
@RequestParam(value="notExtractBookmarksText", required = false) Boolean notExtractBookmarksText)
{
if (!TRANSFORM_NAMES.contains(transform))
{
@ -130,6 +132,7 @@ public class TikaController extends AbstractTransformerController
callTransform(sourceFile, targetFile, transform,
includeContents != null && includeContents ? INCLUDE_CONTENTS : null,
notExtractBookmarksText != null && notExtractBookmarksText ? NOT_EXTRACT_BOOKMARKS_TEXT: null,
TARGET_MIMETYPE+targetMimetype, TARGET_ENCODING+targetEncoding);
return createAttachment(targetFilename, targetFile, testDelay);

View File

@ -25,8 +25,8 @@
<tr><td><div style="text-align:right">includeContents (archive) *</div></td><td><input type="checkbox" name="includeContents" value="true" /></td></tr>
<tr><td><div style="text-align:right">timeout</div></td><td><input type="text" name="timeout" value="" /></td></tr>
<tr><td><div style="text-align:right">testDelay</div></td><td><input type="text" name="testDelay" value="" /></td></tr>
<tr><td></td><td><input type="submit" value="Transform" /></td></tr>
<tr><td><div style="text-align:right">notExtractBookmarksText</div></td><td><input type="checkbox" name="notExtractBookmarksText" value="true" /></td></tr>
<tr><td></td><td><input type="submit" value="Transform" /></td></tr>
</table>
</form>
</div>

View File

@ -341,4 +341,13 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
{
transform(TEXT_MINING, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
}
/**
 * Posts a transform request with the notExtractBookmarksText parameter set to "true"
 * (after mocking the transform command with the PDF/TXT constants) and expects an
 * HTTP 200 response whose Content-Disposition header names the "quick" target file.
 */
@Test
public void pdfToTxtExtractBookmarksTest() throws Exception
{
    super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);

    // Header value the controller is expected to send back for the transformed attachment.
    String expectedDisposition = "attachment; filename*= UTF-8''quick." + targetExtension;

    mockMvc.perform(
                mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension)
                    .param("notExtractBookmarksText", "true"))
           .andExpect(status().is(200))
           .andExpect(header().string("Content-Disposition", expectedDisposition));
}
}