mirror of
https://github.com/Alfresco/alfresco-transform-core.git
synced 2025-05-12 17:04:48 +00:00
Merge branch 'fix/REPO-3626' into 'master'
REPO-3626: added a new parameter notExtractBookmarksText for tika transformer. See merge request Repository/alfresco-docker-transformers!8
This commit is contained in:
commit
d34147fa9a
@ -22,6 +22,7 @@ import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||
import org.apache.tika.parser.pdf.PDFParser;
|
||||
import org.apache.tika.parser.pdf.PDFParserConfig;
|
||||
import org.apache.tika.parser.pkg.PackageParser;
|
||||
import org.apache.tika.sax.BodyContentHandler;
|
||||
import org.apache.tika.sax.ExpandedTitleContentHandler;
|
||||
@ -429,6 +430,7 @@ public class Tika
|
||||
public static final String TARGET_MIMETYPE = "--targetMimetype=";
|
||||
public static final String TARGET_ENCODING = "--targetEncoding=";
|
||||
public static final String INCLUDE_CONTENTS = "--includeContents";
|
||||
public static final String NOT_EXTRACT_BOOKMARKS_TEXT = "--notExtractBookmarksText";
|
||||
|
||||
public static final String CSV = "csv";
|
||||
public static final String DOC = "doc";
|
||||
@ -449,6 +451,7 @@ public class Tika
|
||||
private Parser autoDetectParser;
|
||||
private Parser ooXmlParser = new OOXMLParser();
|
||||
private Parser tikaOfficeDetectParser = new TikaOfficeDetectParser();
|
||||
private PDFParserConfig pdfParserConfig = new PDFParserConfig();
|
||||
|
||||
private DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector()
|
||||
{
|
||||
@ -505,6 +508,7 @@ public class Tika
|
||||
String sourceFilename = null;
|
||||
String targetFilename = null;
|
||||
Boolean includeContents = null;
|
||||
Boolean notExtractBookmarksText = null;
|
||||
|
||||
for (String arg: args)
|
||||
{
|
||||
@ -523,6 +527,11 @@ public class Tika
|
||||
{
|
||||
targetMimetype = getValue(arg, true, targetMimetype, TARGET_MIMETYPE);
|
||||
}
|
||||
else if (arg.startsWith(NOT_EXTRACT_BOOKMARKS_TEXT))
|
||||
{
|
||||
getValue(arg, false, notExtractBookmarksText, NOT_EXTRACT_BOOKMARKS_TEXT);
|
||||
notExtractBookmarksText = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new IllegalArgumentException("Unexpected argument "+arg);
|
||||
@ -553,8 +562,9 @@ public class Tika
|
||||
throw new IllegalArgumentException("Missing arguments");
|
||||
}
|
||||
includeContents = includeContents == null ? false : includeContents;
|
||||
notExtractBookmarksText = notExtractBookmarksText == null ? false : notExtractBookmarksText;
|
||||
|
||||
transform(transform, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding);
|
||||
transform(transform, includeContents, notExtractBookmarksText, sourceFilename, targetFilename, targetMimetype, targetEncoding);
|
||||
}
|
||||
|
||||
private String getValue(String arg, boolean valueExpected, Object value, String optionName)
|
||||
@ -577,6 +587,7 @@ public class Tika
|
||||
|
||||
// Adds transform specific values such as parser and documentSelector.
|
||||
private void transform(String transform, Boolean includeContents,
|
||||
Boolean notExtractBookmarksText,
|
||||
String sourceFilename,
|
||||
String targetFilename, String targetMimetype, String targetEncoding)
|
||||
{
|
||||
@ -608,11 +619,12 @@ public class Tika
|
||||
break;
|
||||
}
|
||||
|
||||
transform(parser, documentSelector, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding);
|
||||
transform(parser, documentSelector, includeContents, notExtractBookmarksText, sourceFilename, targetFilename, targetMimetype, targetEncoding);
|
||||
}
|
||||
|
||||
|
||||
private void transform(Parser parser, DocumentSelector documentSelector, Boolean includeContents,
|
||||
Boolean notExtractBookmarksText,
|
||||
String sourceFilename,
|
||||
String targetFilename, String targetMimetype, String targetEncoding)
|
||||
{
|
||||
@ -626,7 +638,7 @@ public class Tika
|
||||
os = new FileOutputStream(targetFilename);
|
||||
ow = new BufferedWriter(new OutputStreamWriter(os, targetEncoding));
|
||||
Metadata metadata = new Metadata();
|
||||
ParseContext context = buildParseContext(documentSelector, includeContents);
|
||||
ParseContext context = buildParseContext(documentSelector, includeContents, notExtractBookmarksText);
|
||||
ContentHandler handler = getContentHandler(targetMimetype, ow);
|
||||
|
||||
parser.parse(is, handler, metadata, context);
|
||||
@ -780,15 +792,21 @@ public class Tika
|
||||
}
|
||||
}
|
||||
|
||||
protected ParseContext buildParseContext(DocumentSelector documentSelector, Boolean includeContents)
|
||||
protected ParseContext buildParseContext(DocumentSelector documentSelector, Boolean includeContents, Boolean notExtractBookmarksText)
|
||||
{
|
||||
ParseContext context = new ParseContext();
|
||||
|
||||
if (documentSelector != null)
|
||||
{
|
||||
context.set(DocumentSelector.class, documentSelector);
|
||||
}
|
||||
|
||||
// pdfParserConfig is never set in the original repo code, so code removed here.
|
||||
if (notExtractBookmarksText.equals(true))
|
||||
{
|
||||
pdfParserConfig.setExtractBookmarksText(false);
|
||||
// pdfParserConfig is set to override default settings
|
||||
context.set(PDFParserConfig.class, pdfParserConfig);
|
||||
}
|
||||
|
||||
// If Archive transform
|
||||
if (includeContents != null)
|
||||
@ -798,4 +816,5 @@ public class Tika
|
||||
|
||||
return context;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -113,7 +113,9 @@ public class TikaController extends AbstractTransformerController
|
||||
@RequestParam(value = "testDelay", required = false) Long testDelay,
|
||||
|
||||
@RequestParam(value = "transform") String transform,
|
||||
@RequestParam(value="includeContents", required = false) Boolean includeContents)
|
||||
@RequestParam(value="includeContents", required = false) Boolean includeContents,
|
||||
@RequestParam(value="notExtractBookmarksText", required = false) Boolean notExtractBookmarksText)
|
||||
|
||||
{
|
||||
if (!TRANSFORM_NAMES.contains(transform))
|
||||
{
|
||||
@ -130,6 +132,7 @@ public class TikaController extends AbstractTransformerController
|
||||
|
||||
callTransform(sourceFile, targetFile, transform,
|
||||
includeContents != null && includeContents ? INCLUDE_CONTENTS : null,
|
||||
notExtractBookmarksText != null && notExtractBookmarksText ? NOT_EXTRACT_BOOKMARKS_TEXT: null,
|
||||
TARGET_MIMETYPE+targetMimetype, TARGET_ENCODING+targetEncoding);
|
||||
|
||||
return createAttachment(targetFilename, targetFile, testDelay);
|
||||
|
@ -25,7 +25,7 @@
|
||||
<tr><td><div style="text-align:right">includeContents (archive) *</div></td><td><input type="checkbox" name="includeContents" value="true" /></td></tr>
|
||||
<tr><td><div style="text-align:right">timeout</div></td><td><input type="text" name="timeout" value="" /></td></tr>
|
||||
<tr><td><div style="text-align:right">testDelay</div></td><td><input type="text" name="testDelay" value="" /></td></tr>
|
||||
|
||||
<tr><td><div style="text-align:right">notExtractBookmarksText</div></td><td><input type="checkbox" name="notExtractBookmarksText" value="true" /></td></tr>
|
||||
<tr><td></td><td><input type="submit" value="Transform" /></td></tr>
|
||||
</table>
|
||||
</form>
|
||||
|
@ -341,4 +341,13 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
|
||||
{
|
||||
transform(TEXT_MINING, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
// Exercises the /transform endpoint with the notExtractBookmarksText request parameter set to true.
// Only the HTTP status and the Content-Disposition header are asserted; the content of the
// transformed output is not inspected by this test.
// NOTE(review): the method name says ExtractBookmarks while the parameter sent is
// notExtractBookmarksText=true - confirm the name reflects the intended scenario.
@Test
|
||||
public void pdfToTxtExtractBookmarksTest() throws Exception
|
||||
{
|
||||
// Presumably stubs the underlying transform command for a PDF source and TXT target, so no real
// Tika run happens; the meaning of the trailing boolean is not visible here - TODO confirm.
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
|
||||
// Build the request via the shared mockMvcRequest helper, add the new parameter, and perform it.
mockMvc.perform(mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension).param("notExtractBookmarksText", "true"))
|
||||
// The request must be accepted with HTTP 200.
.andExpect(status().is(200))
|
||||
// The response must be served as an attachment whose file name is quick.<targetExtension>.
.andExpect(header().string("Content-Disposition", "attachment; filename*= UTF-8''quick." + targetExtension));
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user