mirror of
https://github.com/Alfresco/alfresco-transform-core.git
synced 2025-05-12 17:04:48 +00:00
Merge branch 'fix/REPO-3626' into 'master'
REPO-3626: added a new parameter notExtractBookmarksText for tika transformer. See merge request Repository/alfresco-docker-transformers!8
This commit is contained in:
commit
d34147fa9a
@ -22,6 +22,7 @@ import org.apache.tika.parser.Parser;
|
|||||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||||
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||||
import org.apache.tika.parser.pdf.PDFParser;
|
import org.apache.tika.parser.pdf.PDFParser;
|
||||||
|
import org.apache.tika.parser.pdf.PDFParserConfig;
|
||||||
import org.apache.tika.parser.pkg.PackageParser;
|
import org.apache.tika.parser.pkg.PackageParser;
|
||||||
import org.apache.tika.sax.BodyContentHandler;
|
import org.apache.tika.sax.BodyContentHandler;
|
||||||
import org.apache.tika.sax.ExpandedTitleContentHandler;
|
import org.apache.tika.sax.ExpandedTitleContentHandler;
|
||||||
@ -429,6 +430,7 @@ public class Tika
|
|||||||
public static final String TARGET_MIMETYPE = "--targetMimetype=";
|
public static final String TARGET_MIMETYPE = "--targetMimetype=";
|
||||||
public static final String TARGET_ENCODING = "--targetEncoding=";
|
public static final String TARGET_ENCODING = "--targetEncoding=";
|
||||||
public static final String INCLUDE_CONTENTS = "--includeContents";
|
public static final String INCLUDE_CONTENTS = "--includeContents";
|
||||||
|
public static final String NOT_EXTRACT_BOOKMARKS_TEXT = "--notExtractBookmarksText";
|
||||||
|
|
||||||
public static final String CSV = "csv";
|
public static final String CSV = "csv";
|
||||||
public static final String DOC = "doc";
|
public static final String DOC = "doc";
|
||||||
@ -449,6 +451,7 @@ public class Tika
|
|||||||
private Parser autoDetectParser;
|
private Parser autoDetectParser;
|
||||||
private Parser ooXmlParser = new OOXMLParser();
|
private Parser ooXmlParser = new OOXMLParser();
|
||||||
private Parser tikaOfficeDetectParser = new TikaOfficeDetectParser();
|
private Parser tikaOfficeDetectParser = new TikaOfficeDetectParser();
|
||||||
|
private PDFParserConfig pdfParserConfig = new PDFParserConfig();
|
||||||
|
|
||||||
private DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector()
|
private DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector()
|
||||||
{
|
{
|
||||||
@ -505,6 +508,7 @@ public class Tika
|
|||||||
String sourceFilename = null;
|
String sourceFilename = null;
|
||||||
String targetFilename = null;
|
String targetFilename = null;
|
||||||
Boolean includeContents = null;
|
Boolean includeContents = null;
|
||||||
|
Boolean notExtractBookmarksText = null;
|
||||||
|
|
||||||
for (String arg: args)
|
for (String arg: args)
|
||||||
{
|
{
|
||||||
@ -523,6 +527,11 @@ public class Tika
|
|||||||
{
|
{
|
||||||
targetMimetype = getValue(arg, true, targetMimetype, TARGET_MIMETYPE);
|
targetMimetype = getValue(arg, true, targetMimetype, TARGET_MIMETYPE);
|
||||||
}
|
}
|
||||||
|
else if (arg.startsWith(NOT_EXTRACT_BOOKMARKS_TEXT))
|
||||||
|
{
|
||||||
|
getValue(arg, false, notExtractBookmarksText, NOT_EXTRACT_BOOKMARKS_TEXT);
|
||||||
|
notExtractBookmarksText = true;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
throw new IllegalArgumentException("Unexpected argument "+arg);
|
throw new IllegalArgumentException("Unexpected argument "+arg);
|
||||||
@ -553,8 +562,9 @@ public class Tika
|
|||||||
throw new IllegalArgumentException("Missing arguments");
|
throw new IllegalArgumentException("Missing arguments");
|
||||||
}
|
}
|
||||||
includeContents = includeContents == null ? false : includeContents;
|
includeContents = includeContents == null ? false : includeContents;
|
||||||
|
notExtractBookmarksText = notExtractBookmarksText == null ? false : notExtractBookmarksText;
|
||||||
|
|
||||||
transform(transform, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding);
|
transform(transform, includeContents, notExtractBookmarksText, sourceFilename, targetFilename, targetMimetype, targetEncoding);
|
||||||
}
|
}
|
||||||
|
|
||||||
private String getValue(String arg, boolean valueExpected, Object value, String optionName)
|
private String getValue(String arg, boolean valueExpected, Object value, String optionName)
|
||||||
@ -577,6 +587,7 @@ public class Tika
|
|||||||
|
|
||||||
// Adds transform specific values such as parser and documentSelector.
|
// Adds transform specific values such as parser and documentSelector.
|
||||||
private void transform(String transform, Boolean includeContents,
|
private void transform(String transform, Boolean includeContents,
|
||||||
|
Boolean notExtractBookmarksText,
|
||||||
String sourceFilename,
|
String sourceFilename,
|
||||||
String targetFilename, String targetMimetype, String targetEncoding)
|
String targetFilename, String targetMimetype, String targetEncoding)
|
||||||
{
|
{
|
||||||
@ -608,11 +619,12 @@ public class Tika
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(parser, documentSelector, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding);
|
transform(parser, documentSelector, includeContents, notExtractBookmarksText, sourceFilename, targetFilename, targetMimetype, targetEncoding);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void transform(Parser parser, DocumentSelector documentSelector, Boolean includeContents,
|
private void transform(Parser parser, DocumentSelector documentSelector, Boolean includeContents,
|
||||||
|
Boolean notExtractBookmarksText,
|
||||||
String sourceFilename,
|
String sourceFilename,
|
||||||
String targetFilename, String targetMimetype, String targetEncoding)
|
String targetFilename, String targetMimetype, String targetEncoding)
|
||||||
{
|
{
|
||||||
@ -626,7 +638,7 @@ public class Tika
|
|||||||
os = new FileOutputStream(targetFilename);
|
os = new FileOutputStream(targetFilename);
|
||||||
ow = new BufferedWriter(new OutputStreamWriter(os, targetEncoding));
|
ow = new BufferedWriter(new OutputStreamWriter(os, targetEncoding));
|
||||||
Metadata metadata = new Metadata();
|
Metadata metadata = new Metadata();
|
||||||
ParseContext context = buildParseContext(documentSelector, includeContents);
|
ParseContext context = buildParseContext(documentSelector, includeContents, notExtractBookmarksText);
|
||||||
ContentHandler handler = getContentHandler(targetMimetype, ow);
|
ContentHandler handler = getContentHandler(targetMimetype, ow);
|
||||||
|
|
||||||
parser.parse(is, handler, metadata, context);
|
parser.parse(is, handler, metadata, context);
|
||||||
@ -780,15 +792,21 @@ public class Tika
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected ParseContext buildParseContext(DocumentSelector documentSelector, Boolean includeContents)
|
protected ParseContext buildParseContext(DocumentSelector documentSelector, Boolean includeContents, Boolean notExtractBookmarksText)
|
||||||
{
|
{
|
||||||
ParseContext context = new ParseContext();
|
ParseContext context = new ParseContext();
|
||||||
|
|
||||||
if (documentSelector != null)
|
if (documentSelector != null)
|
||||||
{
|
{
|
||||||
context.set(DocumentSelector.class, documentSelector);
|
context.set(DocumentSelector.class, documentSelector);
|
||||||
}
|
}
|
||||||
|
|
||||||
// pdfParserConfig is never set in the original repo code, so code removed here.
|
if (notExtractBookmarksText.equals(true))
|
||||||
|
{
|
||||||
|
pdfParserConfig.setExtractBookmarksText(false);
|
||||||
|
// pdfParserConfig is set to override default settings
|
||||||
|
context.set(PDFParserConfig.class, pdfParserConfig);
|
||||||
|
}
|
||||||
|
|
||||||
// If Archive transform
|
// If Archive transform
|
||||||
if (includeContents != null)
|
if (includeContents != null)
|
||||||
@ -798,4 +816,5 @@ public class Tika
|
|||||||
|
|
||||||
return context;
|
return context;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -113,7 +113,9 @@ public class TikaController extends AbstractTransformerController
|
|||||||
@RequestParam(value = "testDelay", required = false) Long testDelay,
|
@RequestParam(value = "testDelay", required = false) Long testDelay,
|
||||||
|
|
||||||
@RequestParam(value = "transform") String transform,
|
@RequestParam(value = "transform") String transform,
|
||||||
@RequestParam(value="includeContents", required = false) Boolean includeContents)
|
@RequestParam(value="includeContents", required = false) Boolean includeContents,
|
||||||
|
@RequestParam(value="notExtractBookmarksText", required = false) Boolean notExtractBookmarksText)
|
||||||
|
|
||||||
{
|
{
|
||||||
if (!TRANSFORM_NAMES.contains(transform))
|
if (!TRANSFORM_NAMES.contains(transform))
|
||||||
{
|
{
|
||||||
@ -130,6 +132,7 @@ public class TikaController extends AbstractTransformerController
|
|||||||
|
|
||||||
callTransform(sourceFile, targetFile, transform,
|
callTransform(sourceFile, targetFile, transform,
|
||||||
includeContents != null && includeContents ? INCLUDE_CONTENTS : null,
|
includeContents != null && includeContents ? INCLUDE_CONTENTS : null,
|
||||||
|
notExtractBookmarksText != null && notExtractBookmarksText ? NOT_EXTRACT_BOOKMARKS_TEXT: null,
|
||||||
TARGET_MIMETYPE+targetMimetype, TARGET_ENCODING+targetEncoding);
|
TARGET_MIMETYPE+targetMimetype, TARGET_ENCODING+targetEncoding);
|
||||||
|
|
||||||
return createAttachment(targetFilename, targetFile, testDelay);
|
return createAttachment(targetFilename, targetFile, testDelay);
|
||||||
|
@ -25,8 +25,8 @@
|
|||||||
<tr><td><div style="text-align:right">includeContents (archive) *</div></td><td><input type="checkbox" name="includeContents" value="true" /></td></tr>
|
<tr><td><div style="text-align:right">includeContents (archive) *</div></td><td><input type="checkbox" name="includeContents" value="true" /></td></tr>
|
||||||
<tr><td><div style="text-align:right">timeout</div></td><td><input type="text" name="timeout" value="" /></td></tr>
|
<tr><td><div style="text-align:right">timeout</div></td><td><input type="text" name="timeout" value="" /></td></tr>
|
||||||
<tr><td><div style="text-align:right">testDelay</div></td><td><input type="text" name="testDelay" value="" /></td></tr>
|
<tr><td><div style="text-align:right">testDelay</div></td><td><input type="text" name="testDelay" value="" /></td></tr>
|
||||||
|
<tr><td><div style="text-align:right">notExtractBookmarksText</div></td><td><input type="checkbox" name="notExtractBookmarksText" value="true" /></td></tr>
|
||||||
<tr><td></td><td><input type="submit" value="Transform" /></td></tr>
|
<tr><td></td><td><input type="submit" value="Transform" /></td></tr>
|
||||||
</table>
|
</table>
|
||||||
</form>
|
</form>
|
||||||
</div>
|
</div>
|
||||||
|
@ -341,4 +341,13 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
|
|||||||
{
|
{
|
||||||
transform(TEXT_MINING, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
|
transform(TEXT_MINING, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Sends a PDF to TXT request to the /transform endpoint with the optional
 * notExtractBookmarksText request parameter set to "true" and expects an HTTP 200
 * response carrying an attachment Content-Disposition header for the target file.
 *
 * NOTE(review): the transform command is mocked, so presumably this only verifies that
 * the controller accepts the new parameter, not that bookmark text is actually omitted
 * from the output - confirm.
 */
@Test
|
||||||
|
public void pdfToTxtExtractBookmarksTest() throws Exception
|
||||||
|
{
|
||||||
|
// Set up the mocked transform command for a PDF source and a TXT target.
// NOTE(review): the meaning of the trailing 'true' is defined in the superclass - confirm.
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
|
||||||
|
// Issue the request with the new flag supplied as a request parameter.
mockMvc.perform(mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension).param("notExtractBookmarksText", "true"))
|
||||||
|
// The request must succeed...
.andExpect(status().is(200))
|
||||||
|
// ...and the response must be returned as an attachment named quick.<targetExtension>.
.andExpect(header().string("Content-Disposition", "attachment; filename*= UTF-8''quick." + targetExtension));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user