REPO-3425 Transformers: Tika based transformers

This commit is contained in:
Alan Davis
2018-06-28 13:25:01 +01:00
parent c9ced17097
commit 82c5e3e96a
31 changed files with 1997 additions and 55 deletions

View File

@@ -0,0 +1,27 @@
/*
* #%L
* Alfresco Enterprise Repository
* %%
* Copyright (C) 2005 - 2018 Alfresco Software Limited
* %%
* License rights for this program may be obtained from Alfresco Software, Ltd.
* pursuant to a written agreement and any use of this program without such an
* agreement is prohibited.
* #L%
*/
package org.alfresco.transformer;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
/**
 * Spring Boot entry point for the transformer application.
 * <p>
 * {@code @SpringBootApplication} already enables auto-configuration, so the exclusion of
 * {@link DataSourceAutoConfiguration} is declared on it directly instead of repeating
 * {@code @EnableAutoConfiguration}.
 */
@SpringBootApplication(exclude = {DataSourceAutoConfiguration.class})
public class Application
{
    /**
     * Starts the Spring application context.
     *
     * @param args command line arguments handed to {@link SpringApplication#run}
     */
    public static void main(String[] args)
    {
        SpringApplication.run(Application.class, args);
    }
}

View File

@@ -0,0 +1,801 @@
/*
* #%L
* Alfresco Enterprise Repository
* %%
* Copyright (C) 2005 - 2018 Alfresco Software Limited
* %%
* License rights for this program may be obtained from Alfresco Software, Ltd.
* pursuant to a written agreement and any use of this program without such an
* agreement is prohibited.
* #L%
*/
package org.alfresco.transformer;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.parser.pkg.PackageParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.net.URL;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import static org.alfresco.repo.content.MimetypeMap.*;
/**
* Stripped down command line Tika transformers. Not actually run as a separate process, but the code fits the pattern
* used by transformers that are.
* <pre>
*
* Archive 0 ms
* 1) cpio html [100] unlimited
* 2) cpio txt [50] unlimited
* 3) cpio xhtml [100] unlimited
* 4) cpio xml [100] unlimited
* 5) jar html [100] unlimited
* 6) jar txt [50] unlimited
* 7) jar xhtml [100] unlimited
* 8) jar xml [100] unlimited
* 9) tar html [100] unlimited
* 10) tar txt [50] unlimited
* 11) tar xhtml [100] unlimited
* 12) tar xml [100] unlimited
* 13) zip html [100] unlimited
* 14) zip txt [50] unlimited
* 15) zip xhtml [100] unlimited
* 16) zip xml [100] unlimited
* PdfBox 0 ms
* 1) pdf html [110] unlimited
* 2) pdf txt [50] 25 MB
* 3) pdf xhtml [110] unlimited
* 4) pdf xml [110] unlimited
* OutlookMsg 0 ms
* 1) msg html [125] unlimited
* 2) msg txt [125] unlimited
* 3) msg xhtml [125] unlimited
* 4) msg xml [125] unlimited
* PdfBox 0 ms
* 1) pdf html [110] unlimited
* 2) pdf txt [50] 25 MB
* 3) pdf xhtml [110] unlimited
* 4) pdf xml [110] unlimited
* Office 0 ms
* 1) doc html [130] unlimited
* 2) doc txt [130] unlimited
* 3) doc xhtml [130] unlimited
* 4) doc xml [130] unlimited
* 5) mpp html [130] unlimited
* 6) mpp txt [130] unlimited
* 7) mpp xhtml [130] unlimited
* 8) mpp xml [130] unlimited
* 9) msg html [130] unlimited
* 10) msg txt [130] unlimited
* 11) msg xhtml [130] unlimited
* 12) msg xml [130] unlimited
* 13) ppt html [130] unlimited
* 14) ppt txt [130] unlimited
* 15) ppt xhtml [130] unlimited
* 16) ppt xml [130] unlimited
* 17) vsd html [130] unlimited
* 18) vsd txt [130] unlimited
* 19) vsd xhtml [130] unlimited
* 20) vsd xml [130] unlimited
* Poi 0 ms
* 1) xls csv [130] unlimited
* 2) xls html [130] unlimited
* 3) xls txt [130] unlimited
* 4) xls xhtml [130] unlimited
* 5) xls xml [130] unlimited
* 6) xlsx csv [130] unlimited
* 7) xlsx html [130] unlimited
* 8) xlsx txt [130] unlimited
* 9) xlsx xhtml [130] unlimited
* 10) xlsx xml [130] unlimited
* OOXML 0 ms
* 1) docm html [130] unlimited
* 2) docm txt [130] unlimited
* 3) docm xhtml [130] unlimited
* 4) docm xml [130] unlimited
* 5) docx html [130] unlimited
* 6) docx txt [130] unlimited
* 7) docx xhtml [130] unlimited
* 8) docx xml [130] unlimited
* 9) dotm html [130] unlimited
* 10) dotm txt [130] unlimited
* 11) dotm xhtml [130] unlimited
* 12) dotm xml [130] unlimited
* 13) dotx html [130] unlimited
* 14) dotx txt [130] unlimited
* 15) dotx xhtml [130] unlimited
* 16) dotx xml [130] unlimited
* 17) potm html [130] unlimited
* 18) potm txt [130] unlimited
* 19) potm xhtml [130] unlimited
* 20) potm xml [130] unlimited
* 21) potx html [130] unlimited
* 22) potx txt [130] unlimited
* 23) potx xhtml [130] unlimited
* 24) potx xml [130] unlimited
* 25) ppam html [130] unlimited
* 26) ppam txt [130] unlimited
* 27) ppam xhtml [130] unlimited
* 28) ppam xml [130] unlimited
* 29) ppsm html [130] unlimited
* 30) ppsm txt [130] unlimited
* 31) ppsm xhtml [130] unlimited
* 32) ppsm xml [130] unlimited
* 33) ppsx html [130] unlimited
* 34) ppsx txt [130] unlimited
* 35) ppsx xhtml [130] unlimited
* 36) ppsx xml [130] unlimited
* 37) pptm html [130] unlimited
* 38) pptm txt [130] unlimited
* 39) pptm xhtml [130] unlimited
* 40) pptm xml [130] unlimited
* 41) pptx html [130] unlimited
* 42) pptx txt [130] unlimited
* 43) pptx xhtml [130] unlimited
* 44) pptx xml [130] unlimited
* 45) sldm html [130] unlimited
* 46) sldm txt [130] unlimited
* 47) sldm xhtml [130] unlimited
* 48) sldm xml [130] unlimited
* 49) sldx html [130] unlimited
* 50) sldx txt [130] unlimited
* 51) sldx xhtml [130] unlimited
* 52) sldx xml [130] unlimited
* 53) xlam html [130] unlimited
* 54) xlam txt [130] unlimited
* 55) xlam xhtml [130] unlimited
* 56) xlam xml [130] unlimited
* 57) xlsb html [130] unlimited
* 58) xlsb txt [130] unlimited
* 59) xlsb xhtml [130] unlimited
* 60) xlsb xml [130] unlimited
* 61) xlsm html [130] unlimited
* 62) xlsm txt [130] unlimited
* 63) xlsm xhtml [130] unlimited
* 64) xlsm xml [130] unlimited
* 65) xlsx html [130] unlimited
* 66) xlsx txt [130] unlimited
* 67) xlsx xhtml [130] unlimited
* 68) xlsx xml [130] unlimited
* 69) xltm html [130] unlimited
* 70) xltm txt [130] unlimited
* 71) xltm xhtml [130] unlimited
* 72) xltm xml [130] unlimited
* 73) xltx html [130] unlimited
* 74) xltx txt [130] unlimited
* 75) xltx xhtml [130] unlimited
* 76) xltx xml [130] unlimited
* TikaAuto 0 ms
* 1) cdf html [120] unlimited
* 2) cdf txt [120] unlimited
* 3) cdf xhtml [120] unlimited
* 4) cdf xml [120] unlimited
* 5) cpio html [120] unlimited
* 6) cpio txt [120] unlimited
* 7) cpio xhtml [120] unlimited
* 8) cpio xml [120] unlimited
* 9) doc html [120] unlimited
* 10) doc txt [120] unlimited
* 11) doc xhtml [120] unlimited
* 12) doc xml [120] unlimited
* 13) docm html [120] unlimited
* 14) docm txt [120] unlimited
* 15) docm xhtml [120] unlimited
* 16) docm xml [120] unlimited
* 17) docx html [120] unlimited
* 18) docx txt [120] unlimited
* 19) docx xhtml [120] unlimited
* 20) docx xml [120] unlimited
* 21) dotm html [120] unlimited
* 22) dotm txt [120] unlimited
* 23) dotm xhtml [120] unlimited
* 24) dotm xml [120] unlimited
* 25) dotx html [120] unlimited
* 26) dotx txt [120] unlimited
* 27) dotx xhtml [120] unlimited
* 28) dotx xml [120] unlimited
* 29) gzip html [120] unlimited
* 30) gzip txt [120] unlimited
* 31) gzip xhtml [120] unlimited
* 32) gzip xml [120] unlimited
* 33) hdf html [120] unlimited
* 34) hdf txt [120] unlimited
* 35) hdf xhtml [120] unlimited
* 36) hdf xml [120] unlimited
* 37) html html [120] unlimited
* 38) html txt [120] unlimited
* 39) html xhtml [120] unlimited
* 40) html xml [120] unlimited
* 41) jar html [120] unlimited
* 42) jar txt [120] unlimited
* 43) jar xhtml [120] unlimited
* 44) jar xml [120] unlimited
* 45) java html [120] unlimited
* 46) java txt [120] unlimited
* 47) java xhtml [120] unlimited
* 48) java xml [120] unlimited
* 49) key html [120] unlimited
* 50) key txt [120] unlimited
* 51) key xhtml [120] unlimited
* 52) key xml [120] unlimited
* 53) mpp html [120] unlimited
* 54) mpp txt [120] unlimited
* 55) mpp xhtml [120] unlimited
* 56) mpp xml [120] unlimited
* 57) numbers html [120] unlimited
* 58) numbers txt [120] unlimited
* 59) numbers xhtml [120] unlimited
* 60) numbers xml [120] unlimited
* 61) odc html [120] unlimited
* 62) odc txt [120] unlimited
* 63) odc xhtml [120] unlimited
* 64) odc xml [120] unlimited
* 65) odi html [120] unlimited
* 66) odi txt [120] unlimited
* 67) odi xhtml [120] unlimited
* 68) odi xml [120] unlimited
* 69) odm html [120] unlimited
* 70) odm txt [120] unlimited
* 71) odm xhtml [120] unlimited
* 72) odm xml [120] unlimited
* 73) odp html [120] unlimited
* 74) odp txt [120] unlimited
* 75) odp xhtml [120] unlimited
* 76) odp xml [120] unlimited
* 77) ods html [120] unlimited
* 78) ods txt [120] unlimited
* 79) ods xhtml [120] unlimited
* 80) ods xml [120] unlimited
* 81) odt html [120] unlimited
* 82) odt txt [120] unlimited
* 83) odt xhtml [120] unlimited
* 84) odt xml [120] unlimited
* 85) ogx html [120] unlimited
* 86) ogx txt [120] unlimited
* 87) ogx xhtml [120] unlimited
* 88) ogx xml [120] unlimited
* 89) oth html [120] unlimited
* 90) oth txt [120] unlimited
* 91) oth xhtml [120] unlimited
* 92) oth xml [120] unlimited
* 93) otp html [120] unlimited
* 94) otp txt [120] unlimited
* 95) otp xhtml [120] unlimited
* 96) otp xml [120] unlimited
* 97) ots html [120] unlimited
* 98) ots txt [120] unlimited
* 99) ots xhtml [120] unlimited
* 100) ots xml [120] unlimited
* 101) ott html [120] unlimited
* 102) ott txt [120] unlimited
* 103) ott xhtml [120] unlimited
* 104) ott xml [120] unlimited
* 105) pages html [120] unlimited
* 106) pages txt [120] unlimited
* 107) pages xhtml [120] unlimited
* 108) pages xml [120] unlimited
* 109) pdf html [120] unlimited
* 110) pdf txt [120] 25 MB
* 111) pdf xhtml [120] unlimited
* 112) pdf xml [120] unlimited
* 113) potm html [120] unlimited
* 114) potm txt [120] unlimited
* 115) potm xhtml [120] unlimited
* 116) potm xml [120] unlimited
* 117) potx html [120] unlimited
* 118) potx txt [120] unlimited
* 119) potx xhtml [120] unlimited
* 120) potx xml [120] unlimited
* 121) ppam html [120] unlimited
* 122) ppam txt [120] unlimited
* 123) ppam xhtml [120] unlimited
* 124) ppam xml [120] unlimited
* 125) ppsm html [120] unlimited
* 126) ppsm txt [120] unlimited
* 127) ppsm xhtml [120] unlimited
* 128) ppsm xml [120] unlimited
* 129) ppsx html [120] unlimited
* 130) ppsx txt [120] unlimited
* 131) ppsx xhtml [120] unlimited
* 132) ppsx xml [120] unlimited
* 133) ppt html [120] unlimited
* 134) ppt txt [120] unlimited
* 135) ppt xhtml [120] unlimited
* 136) ppt xml [120] unlimited
* 137) pptm html [120] unlimited
* 138) pptm txt [120] unlimited
* 139) pptm xhtml [120] unlimited
* 140) pptm xml [120] unlimited
* 141) pptx html [120] unlimited
* 142) pptx txt [120] unlimited
* 143) pptx xhtml [120] unlimited
* 144) pptx xml [120] unlimited
* 145) rar html [120] unlimited
* 146) rar txt [120] unlimited
* 147) rar xhtml [120] unlimited
* 148) rar xml [120] unlimited
* 149) rss html [120] unlimited
* 150) rss txt [120] unlimited
* 151) rss xhtml [120] unlimited
* 152) rss xml [120] unlimited
* 153) rtf html [120] unlimited
* 154) rtf txt [120] unlimited
* 155) rtf xhtml [120] unlimited
* 156) rtf xml [120] unlimited
* 157) sldm html [120] unlimited
* 158) sldm txt [120] unlimited
* 159) sldm xhtml [120] unlimited
* 160) sldm xml [120] unlimited
* 161) sldx html [120] unlimited
* 162) sldx txt [120] unlimited
* 163) sldx xhtml [120] unlimited
* 164) sldx xml [120] unlimited
* 165) sxw html [120] unlimited
* 166) sxw txt [120] unlimited
* 167) sxw xhtml [120] unlimited
* 168) sxw xml [120] unlimited
* 169) txt html [120] unlimited
* 170) txt txt [120] unlimited
* 171) txt xhtml [120] unlimited
* 172) txt xml [120] unlimited
* 173) vsd html [120] unlimited
* 174) vsd txt [120] unlimited
* 175) vsd xhtml [120] unlimited
* 176) vsd xml [120] unlimited
* 177) xhtml html [120] unlimited
* 178) xhtml txt [120] unlimited
* 179) xhtml xhtml [120] unlimited
* 180) xhtml xml [120] unlimited
* 181) xlam html [120] unlimited
* 182) xlam txt [120] unlimited
* 183) xlam xhtml [120] unlimited
* 184) xlam xml [120] unlimited
* 185) xls html [120] unlimited
* 186) xls txt [120] unlimited
* 187) xls xhtml [120] unlimited
* 188) xls xml [120] unlimited
* 189) xlsb html [120] unlimited
* 190) xlsb txt [120] unlimited
* 191) xlsb xhtml [120] unlimited
* 192) xlsb xml [120] unlimited
* 193) xlsm html [120] unlimited
* 194) xlsm txt [120] unlimited
* 195) xlsm xhtml [120] unlimited
* 196) xlsm xml [120] unlimited
* 197) xlsx html [120] unlimited
* 198) xlsx txt [120] unlimited
* 199) xlsx xhtml [120] unlimited
* 200) xlsx xml [120] unlimited
* 201) xltm html [120] unlimited
* 202) xltm txt [120] unlimited
* 203) xltm xhtml [120] unlimited
* 204) xltm xml [120] unlimited
* 205) xltx html [120] unlimited
* 206) xltx txt [120] unlimited
* 207) xltx xhtml [120] unlimited
* 208) xltx xml [120] unlimited
* 209) xml html [120] unlimited
* 210) xml txt [120] unlimited
* 211) xml xhtml [120] unlimited
* 212) xml xml [120] unlimited
* 213) z html [120] unlimited
* 214) z txt [120] unlimited
* 215) z xhtml [120] unlimited
* 216) z xml [120] unlimited
* TextMining 0 ms
* 1) doc html [130] unlimited
* 2) doc txt [50] unlimited
* 3) doc xhtml [130] unlimited
* 4) doc xml [130] unlimited
* </pre>
*/
public class Tika
{
// Names of the supported transforms. The value passed as the first positional argument
// is matched against these in the switch that selects the parser.
public static final String ARCHIVE = "Archive";
public static final String OUTLOOK_MSG = "OutlookMsg";
public static final String PDF_BOX = "PdfBox";
public static final String POI_OFFICE = "Office";
public static final String POI = "Poi";
public static final String POI_OO_XML = "OOXML";
public static final String TIKA_AUTO = "TikaAuto";
public static final String TEXT_MINING = "TextMining";
// All of the transform names above, as a fixed-size list.
public static final List<String> TRANSFORM_NAMES = Arrays.asList(
ARCHIVE, OUTLOOK_MSG, PDF_BOX, POI_OFFICE, POI, POI_OO_XML, TIKA_AUTO, TEXT_MINING);
// Option prefixes recognised by transform(String[]). The first two are followed by a value,
// INCLUDE_CONTENTS is a flag without a value.
public static final String TARGET_MIMETYPE = "--targetMimetype=";
public static final String TARGET_ENCODING = "--targetEncoding=";
public static final String INCLUDE_CONTENTS = "--includeContents";
// Short type names. HTML and XML are also passed as the OutputKeys.METHOD value in getContentHandler.
public static final String CSV = "csv";
public static final String DOC = "doc";
public static final String DOCX = "docx";
public static final String HTML = "html";
public static final String MSG = "msg";
public static final String PDF = "pdf";
public static final String PPTX = "pptx";
public static final String TXT = "txt";
public static final String XHTML = "xhtml";
// NOTE(review): the value is "xslx" rather than "xlsx" - looks like a typo, confirm against users of this constant.
public static final String XSLX = "xslx";
public static final String XML = "xml";
public static final String ZIP = "zip";
// Parsers selected per transform name: packageParser for Archive, pdfParser for PdfBox,
// officeParser for OutlookMsg, Office and TextMining, ooXmlParser for OOXML,
// tikaOfficeDetectParser for Poi and autoDetectParser for TikaAuto.
private Parser packageParser = new PackageParser();
private Parser pdfParser = new PDFParser();
private Parser officeParser = new OfficeParser();
// Assigned in the constructor, once tika-config.xml has been loaded.
private Parser autoDetectParser;
private Parser ooXmlParser = new OOXMLParser();
private Parser tikaOfficeDetectParser = new TikaOfficeDetectParser();
// Document selector used with the PdfBox transform: embedded JPEG, TIFF and PNG documents are
// rejected, anything else (including documents without a content type) is selected.
private DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector()
{
    private List<String> skippedImageTypes =
            Arrays.asList(MIMETYPE_IMAGE_JPEG, MIMETYPE_IMAGE_TIFF, MIMETYPE_IMAGE_PNG);

    @Override
    public boolean select(Metadata metadata)
    {
        String mediaType = metadata.get(Metadata.CONTENT_TYPE);
        boolean typeUnknown = mediaType == null || mediaType.isEmpty();
        // An unknown type cannot be on the skip list, so it is always selected.
        return typeUnknown || !skippedImageTypes.contains(mediaType);
    }
};
/**
 * Creates the transformer and builds the auto detect parser from the {@code tika-config.xml}
 * resource found on the classpath.
 *
 * @throws FileNotFoundException if {@code tika-config.xml} is not on the classpath
 * @throws TikaException if the Tika configuration cannot be created
 * @throws IOException if the configuration cannot be read
 * @throws SAXException if the configuration cannot be parsed
 */
public Tika() throws TikaException, IOException, SAXException
{
    URL tikaConfigXml = getClass().getClassLoader().getResource("tika-config.xml");
    if (tikaConfigXml == null)
    {
        // getResource returns null for a missing resource; fail with a clear message rather than
        // letting the null URL cause a NullPointerException further down.
        throw new FileNotFoundException("tika-config.xml was not found on the classpath");
    }
    autoDetectParser = new AutoDetectParser(new TikaConfig(tikaConfigXml));
}
/**
 * Entry point included for developer testing. Runs a single transform described by {@code args},
 * then prints the elapsed time. Exits with -1 on invalid arguments and -2 on any other failure.
 */
public static void main(String[] args)
{
    final long startedAt = System.currentTimeMillis();
    try
    {
        Tika tika = new Tika();
        tika.transform(args);
    }
    catch (IllegalArgumentException badArguments)
    {
        System.err.println("ERROR "+badArguments.getMessage());
        System.exit(-1);
    }
    catch (IllegalStateException | TikaException | IOException | SAXException failure)
    {
        System.err.println("ERROR "+failure.getMessage());
        failure.printStackTrace();
        System.exit(-2);
    }
    long elapsed = System.currentTimeMillis() - startedAt;
    System.out.println("Finished in "+elapsed+"ms");
}
/**
 * Extracts the parameters from command line style arguments and runs the requested transform.
 * <p>
 * Positional arguments are, in order: the transform name, the source filename and the target
 * filename. Arguments starting with {@code --} are options: {@link #TARGET_MIMETYPE} and
 * {@link #TARGET_ENCODING} are followed by a value, {@link #INCLUDE_CONTENTS} is a flag that
 * defaults to {@code false}. {@code null} entries are skipped.
 *
 * @param args the arguments described above
 * @throws IllegalArgumentException if an argument is unknown, duplicated, has a missing or
 *         unexpected value, or if a positional argument is missing
 * @throws IllegalStateException if the transform itself fails
 */
public void transform(String[] args)
{
    String transform = null;
    String targetMimetype = null;
    String targetEncoding = null;
    String sourceFilename = null;
    String targetFilename = null;
    Boolean includeContents = null;

    for (String arg: args)
    {
        if (arg == null)
        {
            // A null entry is treated as an omitted optional argument.
            continue;
        }

        if (arg.startsWith("--"))
        {
            // The argument must start with the option name (not the other way round), otherwise a
            // shortened option such as "--inc" is accepted here and then breaks getValue's substring.
            if (arg.startsWith(INCLUDE_CONTENTS))
            {
                getValue(arg, false, includeContents, INCLUDE_CONTENTS);
                includeContents = true;
            }
            else if (arg.startsWith(TARGET_ENCODING))
            {
                targetEncoding = getValue(arg, true, targetEncoding, TARGET_ENCODING);
            }
            else if (arg.startsWith(TARGET_MIMETYPE))
            {
                targetMimetype = getValue(arg, true, targetMimetype, TARGET_MIMETYPE);
            }
            else
            {
                throw new IllegalArgumentException("Unexpected argument "+arg);
            }
        }
        else
        {
            if (transform == null)
            {
                transform = arg;
            }
            else if (sourceFilename == null)
            {
                sourceFilename = arg;
            }
            else if (targetFilename == null)
            {
                targetFilename = arg;
            }
            else
            {
                throw new IllegalArgumentException("Unexpected argument "+arg);
            }
        }
    }

    // The target filename is the last positional argument, so if it is set the others are too.
    if (targetFilename == null)
    {
        throw new IllegalArgumentException("Missing arguments");
    }
    includeContents = includeContents == null ? false : includeContents;
    transform(transform, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding);
}
/**
 * Returns the trimmed text that follows {@code optionName} in {@code arg}, checking that the
 * option has not been seen before and that a value is present exactly when one is expected.
 *
 * @param arg the complete argument, which starts with {@code optionName}
 * @param valueExpected {@code true} if the option must be followed by a value,
 *        {@code false} if it must not be
 * @param value the value found so far for this option, {@code null} if there is none yet
 * @param optionName the option prefix, used to locate the value and in error messages
 * @return the trimmed value, which is empty when no value is expected
 * @throws IllegalArgumentException on a duplicate option, or a missing or unexpected value
 */
private String getValue(String arg, boolean valueExpected, Object value, String optionName)
{
    if (value != null)
    {
        throw new IllegalArgumentException("Duplicate "+optionName);
    }

    String remainder = arg.substring(optionName.length()).trim();
    boolean valuePresent = remainder.length() > 0;
    if (valuePresent && !valueExpected)
    {
        throw new IllegalArgumentException("Unexpected value with "+optionName);
    }
    if (!valuePresent && valueExpected)
    {
        throw new IllegalArgumentException("Expected value with "+optionName);
    }
    return remainder;
}
/**
 * Adds the transform specific values, such as the parser and the document selector, and then
 * performs the transform.
 *
 * @param transform one of the transform names, for example {@link #PDF_BOX}
 * @param includeContents passed through to {@code buildParseContext}
 * @param sourceFilename the file to read
 * @param targetFilename the file to write
 * @param targetMimetype the mimetype used to select the content handler
 * @param targetEncoding the character encoding of the target file
 * @throws IllegalArgumentException if {@code transform} is not a known transform name
 */
private void transform(String transform, Boolean includeContents,
                       String sourceFilename,
                       String targetFilename, String targetMimetype, String targetEncoding)
{
    Parser parser;
    DocumentSelector documentSelector = null;
    switch(transform)
    {
        case ARCHIVE:
            parser = packageParser;
            break;
        case OUTLOOK_MSG:
        case POI_OFFICE:
        case TEXT_MINING:
            parser = officeParser;
            break;
        case PDF_BOX:
            parser = pdfParser;
            documentSelector = pdfBoxEmbededDocumentSelector;
            break;
        case POI:
            parser = tikaOfficeDetectParser;
            break;
        case POI_OO_XML:
            parser = ooXmlParser;
            break;
        case TIKA_AUTO:
            parser = autoDetectParser;
            break;
        default:
            // Reject unknown names here, before any file is opened, rather than failing later
            // with a NullPointerException on a null parser.
            throw new IllegalArgumentException("Invalid transform value "+transform);
    }
    transform(parser, documentSelector, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding);
}
/**
 * Parses the source file with the given parser and writes the result to the target file.
 * <p>
 * The streams are managed by try-with-resources, so the writer is flushed and closed before the
 * underlying output stream, and a failure to flush or close is reported instead of being ignored.
 *
 * @param parser the parser to use
 * @param documentSelector optional selector for embedded documents, may be {@code null}
 * @param includeContents passed through to {@code buildParseContext}
 * @param sourceFilename the file to read
 * @param targetFilename the file to write
 * @param targetMimetype the mimetype used to select the content handler
 * @param targetEncoding the character encoding of the target file
 * @throws IllegalStateException if reading, parsing or writing fails; the cause is preserved
 */
private void transform(Parser parser, DocumentSelector documentSelector, Boolean includeContents,
                       String sourceFilename,
                       String targetFilename, String targetMimetype, String targetEncoding)
{
    // Resources are closed in reverse order of declaration: writer, output stream, input stream.
    try (InputStream is = new BufferedInputStream(new FileInputStream(sourceFilename));
         OutputStream os = new FileOutputStream(targetFilename);
         Writer ow = new BufferedWriter(new OutputStreamWriter(os, targetEncoding)))
    {
        Metadata metadata = new Metadata();
        ParseContext context = buildParseContext(documentSelector, includeContents);
        ContentHandler handler = getContentHandler(targetMimetype, ow);

        parser.parse(is, handler, metadata, context);
    }
    catch (SAXException | TikaException | IOException e)
    {
        throw new IllegalStateException(e.getMessage(), e);
    }
}
/**
 * Creates the content handler that writes parser events to {@code output} in the form required
 * by the target mimetype: plain text, HTML, XHTML/XML or CSV.
 *
 * @param targetMimetype the mimetype of the content to produce
 * @param output the writer that receives the content
 * @return the handler to pass to the parser
 * @throws IllegalArgumentException if the target mimetype is not one of the supported ones
 * @throws IllegalStateException if the XML transformer handler cannot be created
 */
protected ContentHandler getContentHandler(String targetMimetype, Writer output)
{
    try
    {
        if (MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
        {
            return new BodyContentHandler(output);
        }

        SAXTransformerFactory saxFactory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
        TransformerHandler xmlHandler = saxFactory.newTransformerHandler();
        xmlHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        xmlHandler.setResult(new StreamResult(output));

        if (MIMETYPE_HTML.equals(targetMimetype))
        {
            xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, HTML);
            return new ExpandedTitleContentHandler(xmlHandler);
        }
        if (MIMETYPE_XHTML.equals(targetMimetype) || MIMETYPE_XML.equals(targetMimetype))
        {
            xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, XML);
            return xmlHandler;
        }
        if (MIMETYPE_TEXT_CSV.equals(targetMimetype))
        {
            return new CsvContentHandler(output);
        }
        throw new IllegalArgumentException("Invalid target mimetype " + targetMimetype);
    }
    catch (TransformerConfigurationException e)
    {
        throw new IllegalStateException(e.getMessage(), e);
    }
}
/**
 * A wrapper around the normal Tika BodyContentHandler that produces CSV encoding rather than
 * tab separated output.
 * <p>
 * Text inside {@code td} elements is written as a CSV field: text made up only of digits,
 * {@code .}, {@code -} and {@code +} is written as is, anything else is wrapped in double quotes
 * with embedded double quotes doubled. A comma is written between the cells of a row.
 */
protected static class CsvContentHandler extends BodyContentHandler
{
    private static final char[] COMMA = new char[]{ ',' };
    private static final Pattern ALL_NUMS = Pattern.compile("[\\d\\.\\-\\+]+");

    // True while between the start and end of a td element.
    private boolean inCell = false;
    // True once a cell has been closed in the current row, so the next cell needs a separator.
    private boolean needsComma = false;

    protected CsvContentHandler(Writer output)
    {
        super(output);
    }

    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException
    {
        // The whitespace begins at ch[start], not ch[0].
        if (length == 1 && ch[start] == '\t')
        {
            // Ignore tabs, as they mess up the CSV output
        }
        else
        {
            super.ignorableWhitespace(ch, start, length);
        }
    }

    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException
    {
        if (!inCell)
        {
            super.characters(ch, start, length);
            return;
        }

        String text = new String(ch, start, length);
        if (ALL_NUMS.matcher(text).matches())
        {
            // Numbers are not quoted
            super.characters(ch, start, length);
        }
        else
        {
            // Quote the value, doubling up any double quotes it contains
            String quoted = "\"" + text.replace("\"", "\"\"") + "\"";
            char[] c = quoted.toCharArray();
            super.characters(c, 0, c.length);
        }
    }

    @Override
    public void startElement(String uri, String localName, String name,
                             Attributes atts) throws SAXException
    {
        if (localName.equals("td"))
        {
            inCell = true;
            if (needsComma)
            {
                super.characters(COMMA, 0, 1);
            }
        }
        else
        {
            super.startElement(uri, localName, name, atts);
        }
    }

    @Override
    public void endElement(String uri, String localName, String name)
            throws SAXException
    {
        if (localName.equals("td"))
        {
            needsComma = true;
            inCell = false;
        }
        else
        {
            if (localName.equals("tr"))
            {
                // The first cell of the next row must not be preceded by a comma
                needsComma = false;
            }
            super.endElement(uri, localName, name);
        }
    }
}
/**
 * Builds the parse context handed to the parser.
 *
 * @param documentSelector registered under {@code DocumentSelector.class} when not {@code null}
 * @param includeContents when not {@code null}, registers the parser under {@code Parser.class}:
 *        the auto detect parser if {@code true}, an {@code EmptyParser} if {@code false}
 * @return the new parse context
 */
protected ParseContext buildParseContext(DocumentSelector documentSelector, Boolean includeContents)
{
    ParseContext parseContext = new ParseContext();

    if (documentSelector != null)
    {
        parseContext.set(DocumentSelector.class, documentSelector);
    }

    // pdfParserConfig is never set in the original repo code, so code removed here.

    // If Archive transform
    if (includeContents == null)
    {
        return parseContext;
    }
    Parser embeddedParser = includeContents ? autoDetectParser : new EmptyParser();
    parseContext.set(Parser.class, embeddedParser);
    return parseContext;
}
}

View File

@@ -0,0 +1,137 @@
/*
* #%L
* Alfresco Enterprise Repository
* %%
* Copyright (C) 2005 - 2018 Alfresco Software Limited
* %%
* License rights for this program may be obtained from Alfresco Software, Ltd.
* pursuant to a written agreement and any use of this program without such an
* agreement is prohibited.
* #L%
*/
package org.alfresco.transformer;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.exception.TikaException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.Resource;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import org.xml.sax.SAXException;
import javax.servlet.http.HttpServletRequest;
import java.io.File;
import java.io.IOException;
import static org.alfresco.repo.content.MimetypeMap.MIMETYPE_TEXT_PLAIN;
import static org.alfresco.transformer.Tika.*;
/**
 * Controller for the Docker based Tika transformers.
 *
 * Status Codes:
 *
 * 200 Success
 * 400 Bad Request: Invalid target mimetype &lt;mimetype>
 * 400 Bad Request: Request parameter &lt;name> is missing (missing mandatory parameter)
 * 400 Bad Request: Request parameter &lt;name> is of the wrong type
 * 400 Bad Request: Transformer exit code was not 0 (possible problem with the source file)
 * 400 Bad Request: The source filename was not supplied
 * 500 Internal Server Error: (no message with low level IO problems)
 * 500 Internal Server Error: The target filename was not supplied (should not happen as targetExtension is checked)
 * 500 Internal Server Error: Transformer version check exit code was not 0
 * 500 Internal Server Error: Transformer version check failed to create any output
 * 500 Internal Server Error: Could not read the target file
 * 500 Internal Server Error: The target filename was malformed (should not happen because of other checks)
 * 500 Internal Server Error: Transformer failed to create an output file (the exit code was 0, so there should be some content)
 * 500 Internal Server Error: Filename encoding error
 * 507 Insufficient Storage: Failed to store the source file
 */
@Controller
public class TikaController extends AbstractTransformerController
{
    private Tika tika;

    /**
     * Sets up the logger, logs the license banner and creates the {@link Tika} instance that
     * performs the transforms.
     */
    @Autowired
    public TikaController() throws TikaException, IOException, SAXException
    {
        logger = LogFactory.getLog(TikaController.class);

        final String banner = "--------------------------------------------------------------------------------------------------------------------------------------------------------------";
        logger.info(banner);
        logEnterpriseLicenseMessage();
        logger.info("Tika is from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0. or in /Apache\\ 2.0.txt");
        logger.info(banner);

        this.tika = new Tika();
    }

    @Override
    protected String getTransformerName()
    {
        return "Tika";
    }

    /**
     * Hands the command line style arguments straight to {@link Tika#transform(String[])}.
     */
    @Override
    public void callTransform(String... args)
    {
        this.tika.transform(args);
    }

    @Override
    protected String version()
    {
        return "Tika available";
    }

    /**
     * Creates the probe transform, which converts quick.pdf to plain text with the PdfBox transform.
     */
    @Override
    protected ProbeTestTransform getProbeTestTransform()
    {
        // See the Javadoc on this method and Probes.md for the choice of these values.
        // the livenessPercentage is a little large as Tika does tend to suffer from slow transforms that clash with a gc.
        return new ProbeTestTransform(this, "quick.pdf", "quick.txt",
                60, 16, 400, 10240, 60*30+1, 60*15+20)
        {
            @Override
            protected void executeTransformCommand(File sourceFile, File targetFile)
            {
                String mimetypeOption = TARGET_MIMETYPE+MIMETYPE_TEXT_PLAIN;
                String encodingOption = TARGET_ENCODING+"UTF-8";
                TikaController.this.callTransform(sourceFile, targetFile, PDF_BOX,
                        mimetypeOption, encodingOption);
            }
        };
    }

    /**
     * Handles a transform request: checks the transform name, stores the uploaded file, runs the
     * transform and returns the target file as an attachment.
     */
    @PostMapping("/transform")
    public ResponseEntity<Resource> transform(HttpServletRequest request,
                                              @RequestParam("file") MultipartFile sourceMultipartFile,
                                              @RequestParam("targetExtension") String targetExtension,
                                              @RequestParam("targetMimetype") String targetMimetype,
                                              @RequestParam("targetEncoding") String targetEncoding,
                                              @RequestParam(value = "timeout", required = false) Long timeout,
                                              @RequestParam(value = "testDelay", required = false) Long testDelay,
                                              @RequestParam(value = "transform") String transform,
                                              @RequestParam(value="includeContents", required = false) Boolean includeContents)
    {
        boolean knownTransform = TRANSFORM_NAMES.contains(transform);
        if (!knownTransform)
        {
            throw new TransformException(400, "Invalid transform value");
        }

        String targetFilename = createTargetFileName(sourceMultipartFile, targetExtension);
        File sourceFile = createSourceFile(request, sourceMultipartFile);
        File targetFile = createTargetFile(request, targetFilename);
        // Both files are deleted by TransformInterceptor.afterCompletion

        // TODO Consider streaming the request and response rather than using temporary files
        // https://www.logicbig.com/tutorials/spring-framework/spring-web-mvc/streaming-response-body.html

        // A null option stands for "flag not requested".
        String includeContentsOption = Boolean.TRUE.equals(includeContents) ? INCLUDE_CONTENTS : null;
        String mimetypeOption = TARGET_MIMETYPE+targetMimetype;
        String encodingOption = TARGET_ENCODING+targetEncoding;
        callTransform(sourceFile, targetFile, transform,
                includeContentsOption, mimetypeOption, encodingOption);

        return createAttachment(targetFilename, targetFile, testDelay);
    }
}

View File

@@ -0,0 +1,117 @@
/*
* #%L
* Alfresco Repository
* %%
* Copyright (C) 2005 - 2016 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.util.HashSet;
import java.util.Set;
import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.util.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
///////// THIS FILE IS A COPY OF THE CODE IN alfresco-repository /////////////
/**
 * <a href="http://tika.apache.org/">Apache Tika</a> assumes that
 *  you either know exactly what your content is, or that
 *  you'll leave it to auto-detection.
 * Within Alfresco, we usually do know. However, from time
 *  to time, we don't know if we have one of the old or one
 *  of the new office files (eg .xls and .xlsx).
 * This class automatically selects the appropriate
 *  old (OLE2) or new (OOXML) Tika parser as required, by
 *  looking at the first four bytes of the stream.
 *
 * @author Nick Burch
 */
public class TikaOfficeDetectParser implements Parser {
    private final Parser ole2Parser = new OfficeParser();
    private final Parser ooxmlParser = new OOXMLParser();

    /**
     * Returns the union of the types supported by the OLE2 and the OOXML parsers.
     */
    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        Set<MediaType> types = new HashSet<MediaType>();
        types.addAll(ole2Parser.getSupportedTypes(parseContext));
        types.addAll(ooxmlParser.getSupportedTypes(parseContext));
        return types;
    }

    /**
     * Peeks at the first four bytes of the stream and delegates to the OOXML parser if they are
     * the OOXML file header, otherwise to the OLE2 parser. The parser is handed a stream that is
     * positioned back at the start of the content.
     */
    public void parse(InputStream stream,
                      ContentHandler handler, Metadata metadata,
                      ParseContext parseContext) throws IOException, SAXException,
            TikaException
    {
        byte[] initial4 = new byte[4];
        InputStream wrapped;
        int bytesRead;

        // Preserve TikaInputStreams as TikaInputStreams as they require less memory to process
        if (stream.markSupported())
        {
            stream.mark(initial4.length);
            bytesRead = IOUtils.readFully(stream, initial4);
            stream.reset();
            wrapped = stream;
        }
        else
        {
            PushbackInputStream inp = new PushbackInputStream(stream, initial4.length);
            bytesRead = IOUtils.readFully(inp, initial4);
            // Only push back what was actually read. Pushing back the whole buffer for a stream
            // shorter than four bytes would add bytes that were never in the stream.
            if (bytesRead > 0)
            {
                inp.unread(initial4, 0, bytesRead);
            }
            wrapped = inp;
        }

        // Which is it?
        if (bytesRead == initial4.length && startsWithOoxmlHeader(initial4))
        {
            ooxmlParser.parse(wrapped, handler, metadata, parseContext);
        }
        else
        {
            ole2Parser.parse(wrapped, handler, metadata, parseContext);
        }
    }

    /**
     * @deprecated This method will be removed in Apache Tika 1.0.
     */
    @Deprecated
    public void parse(InputStream stream,
                      ContentHandler handler, Metadata metadata)
            throws IOException, SAXException, TikaException
    {
        parse(stream, handler, metadata, new ParseContext());
    }

    // Returns true if the given bytes begin with the OOXML file header.
    private static boolean startsWithOoxmlHeader(byte[] initial)
    {
        byte[] header = POIFSConstants.OOXML_FILE_HEADER;
        if (initial.length < header.length)
        {
            return false;
        }
        for (int i = 0; i < header.length; i++)
        {
            if (initial[i] != header[i])
            {
                return false;
            }
        }
        return true;
    }
}

Binary file not shown.

View File

@@ -0,0 +1,39 @@
<!--
Manual test page for the Tika transformer: posts the chosen file and options as
multipart/form-data to /transform and links to /log.
NOTE(review): the UNSET, BADVALUE and "MIXED CASE TikaAuto" options look like deliberate
invalid values for exercising error handling by hand - confirm.
-->
<html xmlns:th="http://www.thymeleaf.org">
<body>
<div>
<h2>Tika Test Transformations</h2>
<form method="POST" enctype="multipart/form-data" action="/transform">
<table>
<tr><td><div style="text-align:right">transform *</div></td><td><select name="transform">
<option value="Archive">Archive</option>
<option value="OutlookMsg">OutlookMsg</option>
<option selected="selected" value="PdfBox">PdfBox</option>
<option value="Office">Office</option>
<option value="Poi">Poi</option>
<option value="OOXML">OOXML</option>
<option value="TikaAuto">TikaAuto</option>
<option value="TextMining">TextMining</option>
<option value="UNSET"></option>
<option value="BADVALUE">BADVALUE</option>
<option value="MIXED CASE TikaAuto">TikaAuto</option>
</select></td></tr>
<tr><td><div style="text-align:right">file *</div></td><td><input type="file" name="file" /></td></tr>
<tr><td><div style="text-align:right">targetExtension *</div></td><td><input type="text" name="targetExtension" value="txt" /></td></tr>
<tr><td><div style="text-align:right">targetMimetype *</div></td><td><input type="text" name="targetMimetype" value="text/plain" /></td></tr>
<tr><td><div style="text-align:right">targetEncoding *</div></td><td><input type="text" name="targetEncoding" value="UTF-8" /></td></tr>
<tr><td><div style="text-align:right">includeContents (archive) *</div></td><td><input type="checkbox" name="includeContents" value="true" /></td></tr>
<tr><td><div style="text-align:right">timeout</div></td><td><input type="text" name="timeout" value="" /></td></tr>
<tr><td><div style="text-align:right">testDelay</div></td><td><input type="text" name="testDelay" value="" /></td></tr>
<tr><td></td><td><input type="submit" value="Transform" /></td></tr>
</table>
</form>
</div>
<div>
<a href="/log">Log entries</a>
</div>
</body>
</html>

View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<properties>
<!-- This property, when set, hides Tika's start-up warnings about missing libraries. -->
<!-- See https://issues.apache.org/jira/browse/TIKA-2490 -->
<service-loader initializableProblemHandler="ignore"/>
</properties>

View File

@@ -0,0 +1,344 @@
/*
* #%L
* Alfresco Repository
* %%
* Copyright (C) 2005 - 2018 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.boot.test.autoconfigure.web.servlet.WebMvcTest;
import org.springframework.boot.test.mock.mockito.SpyBean;
import org.springframework.mock.web.MockMultipartFile;
import org.springframework.test.context.junit4.SpringRunner;
import org.springframework.test.web.servlet.MvcResult;
import org.springframework.test.web.servlet.request.MockHttpServletRequestBuilder;
import static org.alfresco.repo.content.MimetypeMap.*;
import static org.alfresco.transformer.Tika.*;
import static org.springframework.test.util.AssertionErrors.assertTrue;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.header;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
/**
* Test the TikaController without a server.
* Super class includes tests for the AbstractTransformerController.
*/
@RunWith(SpringRunner.class)
@WebMvcTest(TikaController.class)
public class TikaControllerTest extends AbstractTransformerControllerTest
{
public static final String EXPECTED_XHTML_CONTENT_CONTAINS = "<p>The quick brown fox jumps over the lazy dog</p>";
public static final String EXPECTED_TEXT_CONTENT_CONTAINS = "The quick brown fox jumps over the lazy dog";
public static final String EXPECTED_MSG_CONTENT_CONTAINS = "Recipients\n" +
"\tmark.rogers@alfresco.com; speedy@quick.com; mrquick@nowhere.com\n" +
"\n" +
"The quick brown fox jumps over the lazy dogs";
public static final String EXPECTED_CSV_CONTENT_CONTAINS = "\"The\",\"quick\",\"brown\",\"fox\"";
@SpyBean
private TikaController controller;
String transform = PDF_BOX;
String targetEncoding = "UTF-8";
String targetMimetype = MIMETYPE_TEXT_PLAIN;
private void transform(String transform, String sourceExtension, String targetExtension,
String sourceMimetype, String targetMimetype,
Boolean includeContents, String expectedContentContains) throws Exception
{
// We don't use targetFileBytes as some of the transforms contain different date text based on the os being used.
super.mockTransformCommand(controller, sourceExtension, targetExtension, sourceMimetype, false);
this.transform = transform;
this.targetMimetype = targetMimetype;
System.out.println("Test "+transform+" "+ sourceExtension +" to "+targetExtension);
MockHttpServletRequestBuilder requestBuilder = includeContents == null
? mockMvcRequest("/transform", sourceFile, "targetExtension", this.targetExtension)
: mockMvcRequest("/transform", sourceFile, "targetExtension", this.targetExtension, "includeContents", includeContents.toString());
MvcResult result = mockMvc.perform(requestBuilder)
.andExpect(status().is(200))
.andExpect(header().string("Content-Disposition", "attachment; filename*= UTF-8''quick." + this.targetExtension)).
andReturn();
String content = result.getResponse().getContentAsString();
assertTrue("The content did not include \""+expectedContentContains, content.contains(expectedContentContains));
}
@Override
// Add extra required parameters to the request.
protected MockHttpServletRequestBuilder mockMvcRequest(String url, MockMultipartFile sourceFile, String... params)
{
return super.mockMvcRequest(url, sourceFile, params)
.param("transform", transform)
.param("targetEncoding", targetEncoding)
.param("targetMimetype", targetMimetype);
}
@Test
@Override
public void simpleTransformTest() throws Exception
{
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
super.simpleTransformTest();
}
@Test
@Override
public void testDelayTest() throws Exception
{
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
super.testDelayTest();
}
@Test
@Override
public void badExitCodeTest() throws Exception
{
// Ignore the test in super class as the Tika transforms are real rather than mocked up.
// It is the mock that returns a non zero exit code.
}
@Test
@Override
public void noTargetFileTest() throws Exception
{
// Ignore the test in super class as the Tika transforms are real rather than mocked up.
// It is the mock that returns a zero length file for other transformers, when we supply an invalid targetExtension.
}
// --- Super class tests (need modified setup) ---
@Test
@Override
public void dotDotSourceFilenameTest() throws Exception
{
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
super.dotDotSourceFilenameTest();
}
@Test
@Override
public void noExtensionSourceFilenameTest() throws Exception
{
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
super.noExtensionSourceFilenameTest();
}
@Test
@Override
public void badSourceFilenameTest() throws Exception
{
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
super.badSourceFilenameTest();
}
@Test
@Override
public void blankSourceFilenameTest() throws Exception
{
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
super.blankSourceFilenameTest();
}
@Test
@Override
public void noTargetExtensionTest() throws Exception
{
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
super.noTargetExtensionTest();
}
@Test
@Override
public void calculateMaxTime() throws Exception
{
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
super.calculateMaxTime();
}
// --- General Tika tests ---
@Test
public void badEncodingTest() throws Exception
{
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
targetEncoding = "rubbish";
mockMvc.perform(mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension))
.andExpect(status().is(500));
}
// --- Archive ---
@Test
public void zipToTextArchiveTest() throws Exception
{
transform(ARCHIVE, ZIP, TXT, MIMETYPE_ZIP, MIMETYPE_TEXT_PLAIN,false,
"quick.html\n" +
"\n" +
"\n" +
"quick.pdf\n" +
"\n" +
"\n");
}
@Test
public void zipToTextIncludeArchiveTest() throws Exception
{
transform(ARCHIVE, ZIP, TXT, MIMETYPE_ZIP, MIMETYPE_TEXT_PLAIN,true,
"quick.html\n" +
"\n" +
"\n" +
"The quick brown fox jumps over the lazy dog\n" +
"\n" +
"\n" +
"\n" +
"quick.pdf\n" +
"\n" +
"\n" +
"The quick brown fox jumps over the lazy dog" +
"\n" +
"\n");
}
@Test
public void zipToTextExcludeArchiveTest() throws Exception
{
transform(ARCHIVE, ZIP, TXT, MIMETYPE_ZIP, MIMETYPE_TEXT_PLAIN,
false, "\n" +
"folder/subfolder/quick.jpg\n" +
"\n" +
"\n" +
"quick.doc\n" +
"\n" +
"\n" +
"quick.html\n" +
"\n" +
"\n" +
"quick.pdf\n" +
"\n" +
"\n" +
"quick.txt\n" +
"\n" +
"\n" +
"quick.xml\n" +
"\n");
}
// --- OutlookMsg ---
@Test
public void msgToTxtOutlookMsgTest() throws Exception
{
transform(OUTLOOK_MSG, MSG, TXT, MIMETYPE_OUTLOOK_MSG, MIMETYPE_TEXT_PLAIN, null, EXPECTED_MSG_CONTENT_CONTAINS);
}
// --- PdfBox ---
@Test
public void pdfToTxtPdfBoxTest() throws Exception
{
transform(PDF_BOX, PDF, TXT, MIMETYPE_PDF, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
}
@Test
public void pdfToCsvPdfBoxTest() throws Exception
{
transform(PDF_BOX, PDF, CSV, MIMETYPE_PDF, MIMETYPE_TEXT_CSV, null, EXPECTED_TEXT_CONTENT_CONTAINS); // Yes it is just text
}
@Test
public void pdfToXmlPdfBoxTest() throws Exception
{
transform(PDF_BOX, PDF, XML, MIMETYPE_PDF, MIMETYPE_XML, null, EXPECTED_XHTML_CONTENT_CONTAINS); // Yes it is just XHTML
}
@Test
public void pdfToXhtmlPdfBoxTest() throws Exception
{
transform(PDF_BOX, PDF, XHTML, MIMETYPE_PDF, MIMETYPE_XHTML, null, EXPECTED_XHTML_CONTENT_CONTAINS);
}
@Test
public void pdfToHtmlPdfBoxTest() throws Exception
{
transform(PDF_BOX, PDF, HTML, MIMETYPE_PDF, MIMETYPE_HTML, null, EXPECTED_XHTML_CONTENT_CONTAINS); // Yes it is just XHTML
}
// --- Office ---
@Test
public void msgToTxtOfficeTest() throws Exception
{
transform(POI_OFFICE, MSG, TXT, MIMETYPE_OUTLOOK_MSG, MIMETYPE_TEXT_PLAIN, null, EXPECTED_MSG_CONTENT_CONTAINS);
}
@Test
public void docToTxtOfficeTest() throws Exception
{
transform(POI_OFFICE, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
}
// --- Poi ---
@Test
public void xslxToCsvPoiTest() throws Exception
{
transform(POI, XSLX, CSV, MIMETYPE_OPENXML_SPREADSHEET, MIMETYPE_TEXT_CSV, null, EXPECTED_CSV_CONTENT_CONTAINS);
}
// --- OOXML ---
@Test
public void docxToTxtOoXmlTest() throws Exception
{
transform(POI_OO_XML, DOCX, TXT, MIMETYPE_OPENXML_WORDPROCESSING, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
}
@Test
public void pptxToTxtOoXmlTest() throws Exception
{
transform(POI_OO_XML, PPTX, TXT, MIMETYPE_OPENXML_PRESENTATION, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
}
// --- TikaAuto ---
@Test
public void ppxtToTxtTikaAutoTest() throws Exception
{
transform(TIKA_AUTO, PPTX, TXT, MIMETYPE_OPENXML_PRESENTATION, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
}
@Test
public void doctToTxtTikaAutoTest() throws Exception
{
transform(TIKA_AUTO, DOCX, TXT, MIMETYPE_OPENXML_WORDPROCESSING, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
}
// --- TextMining ---
@Test
public void docToTxtTextMiningTest() throws Exception
{
transform(TEXT_MINING, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
}
}

View File

@@ -0,0 +1,51 @@
/*
* #%L
* Alfresco Repository
* %%
* Copyright (C) 2005 - 2018 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer;
import org.junit.runner.RunWith;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.context.SpringBootTest.WebEnvironment;
import org.springframework.test.context.junit4.SpringRunner;
/**
 * Tests TikaController with a server test harness.
 */
@RunWith(SpringRunner.class)
@SpringBootTest(webEnvironment = WebEnvironment.RANDOM_PORT)
public class TikaHttpRequestTest extends AbstractHttpRequestTest
{
    /**
     * @return the transformer name supplied to the super class tests
     */
    @Override
    protected String getTransformerName()
    {
        return "Tika";
    }

    /**
     * @return the source file extension supplied to the super class tests
     */
    @Override
    protected String getSourceExtension()
    {
        return "pdf";
    }
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,6 @@
The quick brown fox jumps over the lazy dog
Blank Page

Binary file not shown.

Binary file not shown.