mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-08-07 17:49:17 +00:00
Initial Tika support for Text content transforms
The POI HSSF transformer has been updated to use Tika. A Tika auto-detect transformer has also been added, which caters for a large number of previously un-handled cases. Unit tests check this. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20769 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -363,6 +363,10 @@
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<bean id="transformer.TikaAuto"
|
||||
class="org.alfresco.repo.content.transform.TikaAutoContentTransformer"
|
||||
parent="baseContentTransformer" />
|
||||
|
||||
<bean id="transformer.Poi"
|
||||
class="org.alfresco.repo.content.transform.PoiHssfContentTransformer"
|
||||
parent="baseContentTransformer" />
|
||||
|
@@ -28,6 +28,7 @@ import org.alfresco.repo.content.metadata.OpenOfficeMetadataExtracterTest;
|
||||
import org.alfresco.repo.content.metadata.PdfBoxMetadataExtracterTest;
|
||||
import org.alfresco.repo.content.metadata.PoiMetadataExtracterTest;
|
||||
import org.alfresco.repo.content.metadata.RFC822MetadataExtracterTest;
|
||||
import org.alfresco.repo.content.metadata.TikaAutoMetadataExtracterTest;
|
||||
import org.alfresco.repo.content.transform.BinaryPassThroughContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.ComplexContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.ContentTransformerRegistryTest;
|
||||
@@ -41,6 +42,7 @@ import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTe
|
||||
import org.alfresco.repo.content.transform.StringExtractingContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.TextMiningContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.TextToPdfContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.TikaAutoContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.magick.ImageMagickContentTransformerTest;
|
||||
import org.alfresco.util.ApplicationContextHelper;
|
||||
import org.springframework.context.ApplicationContext;
|
||||
@@ -91,6 +93,7 @@ public class ContentMinimalContextTestSuite extends TestSuite
|
||||
suite.addTestSuite( PdfBoxMetadataExtracterTest.class );
|
||||
suite.addTestSuite( PoiMetadataExtracterTest.class );
|
||||
suite.addTestSuite( RFC822MetadataExtracterTest.class );
|
||||
suite.addTestSuite( TikaAutoMetadataExtracterTest.class );
|
||||
|
||||
// Transform tests
|
||||
suite.addTestSuite(BinaryPassThroughContentTransformerTest.class);
|
||||
@@ -106,6 +109,7 @@ public class ContentMinimalContextTestSuite extends TestSuite
|
||||
suite.addTestSuite(StringExtractingContentTransformerTest.class);
|
||||
suite.addTestSuite(TextMiningContentTransformerTest.class);
|
||||
suite.addTestSuite(TextToPdfContentTransformerTest.class);
|
||||
suite.addTestSuite(TikaAutoContentTransformerTest.class);
|
||||
suite.addTestSuite(ImageMagickContentTransformerTest.class);
|
||||
|
||||
return suite;
|
||||
|
@@ -52,6 +52,7 @@ public class MimetypeMap implements MimetypeService
|
||||
public static final String MIMETYPE_TEXT_PLAIN = "text/plain";
|
||||
public static final String MIMETYPE_TEXT_MEDIAWIKI = "text/mediawiki";
|
||||
public static final String MIMETYPE_TEXT_CSS = "text/css";
|
||||
public static final String MIMETYPE_TEXT_CSV = "text/csv";
|
||||
public static final String MIMETYPE_TEXT_JAVASCRIPT = "text/javascript";
|
||||
public static final String MIMETYPE_XML = "text/xml";
|
||||
public static final String MIMETYPE_HTML = "text/html";
|
||||
|
@@ -238,6 +238,9 @@ public abstract class AbstractContentTransformerTest extends TestCase
|
||||
" source: " + sourceReader + "\n" +
|
||||
" target: " + targetWriter,
|
||||
checkContent.contains(QUICK_CONTENT));
|
||||
|
||||
// Let subclasses do extra checks if they want
|
||||
additionalContentCheck(sourceMimetype, targetMimetype, checkContent);
|
||||
}
|
||||
else if (isQuickWordsExpected(targetMimetype))
|
||||
{
|
||||
@@ -281,6 +284,13 @@ public abstract class AbstractContentTransformerTest extends TestCase
|
||||
outputWriter.putContent(sb.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Allows implementations to do some extra checks on the
|
||||
* results of the content as found by
|
||||
* {@link #testAllConversions()}
|
||||
*/
|
||||
protected void additionalContentCheck(String sourceMimetype, String targetMimetype, String contents) {} // intentionally a no-op here; subclasses override it to add their own checks
|
||||
|
||||
/**
|
||||
* This method is an extension point for enabling/disabling an assertion that the "quick brown fox"
|
||||
* phrase is present in the transformed content.
|
||||
|
@@ -18,267 +18,164 @@
|
||||
*/
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.Writer;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import javax.xml.transform.TransformerConfigurationException;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.poi.hssf.usermodel.HSSFCell;
|
||||
import org.apache.poi.hssf.usermodel.HSSFRow;
|
||||
import org.apache.poi.hssf.usermodel.HSSFSheet;
|
||||
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||
import org.apache.poi.util.RecordFormatException;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||
import org.apache.tika.sax.BodyContentHandler;
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
/**
|
||||
* Makes use of the {@link http://jakarta.apache.org/poi/ POI} library to
|
||||
* perform conversions from Excel spreadsheets to text (comma separated).
|
||||
* <p>
|
||||
* While most text extraction from spreadsheets only extract the first sheet of
|
||||
* the workbook, the method used here extracts the text from <b>all the sheets</b>.
|
||||
* This is more useful, especially when it comes to indexing spreadsheets.
|
||||
* <p>
|
||||
* In the case where there is only one sheet in the document, the results will be
|
||||
* exactly the same as most extractors. Where there are multiple sheets, the results
|
||||
* will differ, but meaningful reimporting of the text document is not possible
|
||||
* anyway.
|
||||
* Uses <a href="http://tika.apache.org/">Apache Tika</a> and
|
||||
* <a href="http://poi.apache.org/">Apache POI</a> to perform
|
||||
* conversions from Excel spreadsheets.
|
||||
* <p>Will transform from Excel spreadsheets into Html,
|
||||
* Xml or Text (space or comma separated)
|
||||
* <p>Handles all sheets in the file.
|
||||
*
|
||||
* TODO CSV Support
|
||||
*
|
||||
* @author Nick Burch
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
public class PoiHssfContentTransformer extends AbstractContentTransformer2
|
||||
public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
|
||||
{
|
||||
/**
|
||||
* Error message to delegate to NodeInfoBean
|
||||
*/
|
||||
public static final String WRONG_FORMAT_MESSAGE_ID = "transform.err.format_or_password";
|
||||
|
||||
/**
|
||||
* Windows carriage return line feed pair.
|
||||
*/
|
||||
private static final String LINE_BREAK = "\r\n";
|
||||
private static Log logger = LogFactory.getLog(PoiHssfContentTransformer.class);
|
||||
|
||||
public PoiHssfContentTransformer()
|
||||
{
|
||||
super(new String[] {
|
||||
MimetypeMap.MIMETYPE_EXCEL
|
||||
});
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return new OfficeParser();
|
||||
}
|
||||
|
||||
/**
|
||||
* Currently the only transformation performed is that of text extraction from XLS documents.
|
||||
* Can we do the requested transformation via Tika?
|
||||
* We support transforming to HTML, XML, Text or CSV
|
||||
*/
|
||||
@Override
|
||||
public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options)
|
||||
{
|
||||
if (!MimetypeMap.MIMETYPE_EXCEL.equals(sourceMimetype) ||
|
||||
!MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
|
||||
{
|
||||
// only support XLS -> Text
|
||||
return false;
|
||||
}
|
||||
else
|
||||
if(sourceMimeTypes.contains(sourceMimetype) &&
|
||||
MimetypeMap.MIMETYPE_TEXT_CSV.equals(targetMimetype))
|
||||
{
|
||||
// Special case for CSV
|
||||
return true;
|
||||
}
|
||||
|
||||
// Otherwise fall back on the default Tika rules
|
||||
return super.isTransformable(sourceMimetype, targetMimetype, options);
|
||||
}
|
||||
|
||||
public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options)
|
||||
throws Exception
|
||||
@Override
|
||||
protected ContentHandler getContentHandler(String targetMimeType, Writer output)
|
||||
throws TransformerConfigurationException
|
||||
{
|
||||
InputStream is = reader.getContentInputStream();
|
||||
OutputStream os = writer.getContentOutputStream();
|
||||
String encoding = writer.getEncoding();
|
||||
try
|
||||
if(MimetypeMap.MIMETYPE_TEXT_CSV.equals(targetMimeType))
|
||||
{
|
||||
// open the workbook
|
||||
HSSFWorkbook workbook = new HSSFWorkbook(is);
|
||||
// how many sheets are there?
|
||||
int sheetCount = workbook.getNumberOfSheets();
|
||||
// transform each sheet
|
||||
for (int i = 0; i < sheetCount; i++)
|
||||
{
|
||||
HSSFSheet sheet = workbook.getSheetAt(i);
|
||||
String sheetName = workbook.getSheetName(i);
|
||||
writeSheet(os, sheet, encoding);
|
||||
// write the sheet name
|
||||
PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
|
||||
PoiHssfContentTransformer.writeString(os, encoding, "End of sheet: " + sheetName, true);
|
||||
PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
|
||||
PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
|
||||
return new CsvContentHandler(output);
|
||||
}
|
||||
}
|
||||
catch (RecordFormatException ex)
|
||||
{
|
||||
// Catching specific exception to propagate it to NodeInfoBean
|
||||
// to fix issue https://issues.alfresco.com/jira/browse/ETWOTWO-440
|
||||
|
||||
logger.error(ex);
|
||||
throw new TransformerInfoException(WRONG_FORMAT_MESSAGE_ID, ex);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (is != null)
|
||||
{
|
||||
try { is.close(); } catch (Throwable e) {}
|
||||
}
|
||||
if (os != null)
|
||||
{
|
||||
try { os.close(); } catch (Throwable e) {}
|
||||
}
|
||||
}
|
||||
// Otherwise use the normal Tika rules
|
||||
return super.getContentHandler(targetMimeType, output);
|
||||
}
|
||||
|
||||
/**
|
||||
* Dumps the text from the sheet to the stream in CSV format
|
||||
* A wrapper around the normal Tika BodyContentHandler,
|
||||
* which causes things to be CSV encoded rather than
|
||||
* tab separated
|
||||
* TODO Get rid of the extra tabs that crop up
|
||||
*/
|
||||
private void writeSheet(OutputStream os, HSSFSheet sheet, String encoding) throws Exception
|
||||
{
|
||||
int rows = sheet.getLastRowNum();
|
||||
// transform each row
|
||||
for (int i = 0; i <= rows; i++)
|
||||
{
|
||||
HSSFRow row = sheet.getRow(i);
|
||||
if (row != null)
|
||||
{
|
||||
writeRow(os, row, encoding);
|
||||
}
|
||||
// break between rows
|
||||
if (i < rows)
|
||||
{
|
||||
PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
|
||||
}
|
||||
}
|
||||
protected static class CsvContentHandler extends BodyContentHandler {
|
||||
private static final char[] comma = new char[]{ ',' };
|
||||
private static final Pattern all_nums = Pattern.compile("[\\d\\.\\-\\+]+");
|
||||
|
||||
private boolean inCell = false;
|
||||
private boolean needsComma = false;
|
||||
|
||||
protected CsvContentHandler(Writer output) {
|
||||
super(output);
|
||||
}
|
||||
|
||||
private void writeRow(OutputStream os, HSSFRow row, String encoding) throws Exception
|
||||
{
|
||||
short firstCellNum = row.getFirstCellNum();
|
||||
short lastCellNum = row.getLastCellNum();
|
||||
// pad out to first cell
|
||||
for (int i = 0; i < firstCellNum; i++)
|
||||
{
|
||||
PoiHssfContentTransformer.writeString(os, encoding, ",", false); // CSV up to first cell
|
||||
}
|
||||
// write each cell
|
||||
for (int i = 0; i <= lastCellNum; i++)
|
||||
{
|
||||
HSSFCell cell = row.getCell(i);
|
||||
if (cell != null)
|
||||
{
|
||||
int cellType = cell.getCellType();
|
||||
@Override
|
||||
public void characters(char[] ch, int start, int length)
|
||||
throws SAXException {
|
||||
if(inCell) {
|
||||
StringBuffer t = new StringBuffer(new String(ch,start,length));
|
||||
|
||||
StringBuilder sb = new StringBuilder(10);
|
||||
switch (cellType)
|
||||
// Quote if not all numbers
|
||||
if(all_nums.matcher(t).matches())
|
||||
{
|
||||
case HSSFCell.CELL_TYPE_BLANK:
|
||||
// ignore
|
||||
break;
|
||||
case HSSFCell.CELL_TYPE_BOOLEAN:
|
||||
sb.append(cell.getBooleanCellValue());
|
||||
break;
|
||||
case HSSFCell.CELL_TYPE_ERROR:
|
||||
sb.append("ERROR");
|
||||
break;
|
||||
case HSSFCell.CELL_TYPE_NUMERIC:
|
||||
sb.append(cell.getNumericCellValue());
|
||||
break;
|
||||
case HSSFCell.CELL_TYPE_STRING:
|
||||
sb.append(cell.getStringCellValue());
|
||||
break;
|
||||
case HSSFCell.CELL_TYPE_FORMULA:
|
||||
final int formulaResultType = cell.getCachedFormulaResultType();
|
||||
if (HSSFCell.CELL_TYPE_NUMERIC == formulaResultType)
|
||||
{
|
||||
sb.append(cell.getNumericCellValue());
|
||||
}
|
||||
else if (HSSFCell.CELL_TYPE_STRING == formulaResultType)
|
||||
{
|
||||
sb.append(cell.getStringCellValue());
|
||||
}
|
||||
else if (HSSFCell.CELL_TYPE_BOOLEAN == formulaResultType)
|
||||
{
|
||||
sb.append(cell.getBooleanCellValue());
|
||||
}
|
||||
else if (HSSFCell.CELL_TYPE_ERROR == formulaResultType)
|
||||
{
|
||||
sb.append(cell.getErrorCellValue());
|
||||
super.characters(ch, start, length);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new RuntimeException("Unknown formula result type: " + formulaResultType);
|
||||
for(int i=t.length()-1; i>=0; i--) {
|
||||
if(t.charAt(i) == '\"') {
|
||||
// Double up double quotes
|
||||
t.insert(i, '\"');
|
||||
i--;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new RuntimeException("Unknown HSSF cell type: " + cell);
|
||||
}
|
||||
String data = sb.toString();
|
||||
PoiHssfContentTransformer.writeString(os, encoding, data, true);
|
||||
}
|
||||
// comma separate if required
|
||||
if (i < lastCellNum)
|
||||
{
|
||||
PoiHssfContentTransformer.writeString(os, encoding, ",", false);
|
||||
t.insert(0, '\"');
|
||||
t.append('\"');
|
||||
char[] c = t.toString().toCharArray();
|
||||
super.characters(c, 0, c.length);
|
||||
}
|
||||
} else {
|
||||
super.characters(ch, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes the given data to the stream using the encoding specified. If the encoding
|
||||
* is not given, the default <tt>String</tt> to <tt>byte[]</tt> conversion will be
|
||||
* used.
|
||||
* <p>
|
||||
* The given data string will be escaped appropriately.
|
||||
*
|
||||
* @param os the stream to write to
|
||||
* @param encoding the encoding to use, or null if the default encoding is acceptable
|
||||
* @param value the string to write
|
||||
* @param isData true if the value represents a human-readable string, false if the
|
||||
* value represents formatting characters, separating characters, etc.
|
||||
* @throws Exception
|
||||
*/
|
||||
public static void writeString(OutputStream os, String encoding, String value, boolean isData) throws Exception
|
||||
{
|
||||
if (value == null)
|
||||
{
|
||||
// nothing to do
|
||||
return;
|
||||
@Override
|
||||
public void startElement(String uri, String localName, String name,
|
||||
Attributes atts) throws SAXException {
|
||||
if(localName.equals("td")) {
|
||||
localName = "span";
|
||||
name = "span";
|
||||
|
||||
inCell = true;
|
||||
if(needsComma) {
|
||||
super.characters(comma, 0, 1);
|
||||
needsComma = true;
|
||||
}
|
||||
int dataLength = value.length();
|
||||
if (dataLength == 0)
|
||||
{
|
||||
// nothing to do
|
||||
return;
|
||||
}
|
||||
super.startElement(uri, localName, name, atts);
|
||||
}
|
||||
|
||||
// escape the string
|
||||
StringBuilder sb = new StringBuilder(dataLength + 5); // slightly longer than the data
|
||||
for (int i = 0; i < dataLength; i++)
|
||||
{
|
||||
char currentChar = value.charAt(i);
|
||||
if (currentChar == '\"') // inverted commas
|
||||
{
|
||||
sb.append("\""); // CSV escaping of inverted commas
|
||||
}
|
||||
// append the char
|
||||
sb.append(currentChar);
|
||||
}
|
||||
// enclose in inverted commas for safety
|
||||
if (isData)
|
||||
{
|
||||
sb.insert(0, "\"");
|
||||
sb.append("\"");
|
||||
}
|
||||
// escaping complete
|
||||
value = sb.toString();
|
||||
@Override
|
||||
public void endElement(String uri, String localName, String name)
|
||||
throws SAXException {
|
||||
if(localName.equals("td")) {
|
||||
localName = "span";
|
||||
name = "span";
|
||||
|
||||
byte[] bytes = null;
|
||||
if (encoding == null)
|
||||
{
|
||||
// use default encoding
|
||||
bytes = value.getBytes();
|
||||
needsComma = true;
|
||||
inCell = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
bytes = value.getBytes(encoding);
|
||||
if(localName.equals("tr")) {
|
||||
needsComma = false;
|
||||
}
|
||||
super.endElement(uri, localName, name);
|
||||
}
|
||||
// write to the stream
|
||||
os.write(bytes);
|
||||
// done
|
||||
}
|
||||
}
|
||||
|
@@ -22,8 +22,10 @@ import java.io.File;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.repo.content.filestore.FileContentReader;
|
||||
import org.alfresco.repo.content.filestore.FileContentWriter;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
import org.alfresco.util.TempFileProvider;
|
||||
|
||||
@@ -32,7 +34,7 @@ import org.alfresco.util.TempFileProvider;
|
||||
*
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
public class PoiHssfContentTransformerTest extends AbstractContentTransformerTest
|
||||
public class PoiHssfContentTransformerTest extends TikaPoweredContentTransformerTest
|
||||
{
|
||||
private ContentTransformer transformer;
|
||||
|
||||
@@ -56,12 +58,52 @@ public class PoiHssfContentTransformerTest extends AbstractContentTransformerTes
|
||||
{
|
||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_EXCEL, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_TEXT_CSV, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||
}
|
||||
|
||||
public void testCsvOutput() throws Exception
|
||||
{
|
||||
File sourceFile = AbstractContentTransformerTest.loadQuickTestFile("xls");
|
||||
ContentReader sourceReader = new FileContentReader(sourceFile);
|
||||
|
||||
File targetFile = TempFileProvider.createTempFile(
|
||||
getClass().getSimpleName() + "_" + getName() + "_xls_",
|
||||
".csv");
|
||||
ContentWriter targetWriter = new FileContentWriter(targetFile);
|
||||
|
||||
sourceReader.setMimetype(MimetypeMap.MIMETYPE_EXCEL);
|
||||
targetWriter.setMimetype(MimetypeMap.MIMETYPE_TEXT_CSV);
|
||||
transformer.transform(sourceReader, targetWriter);
|
||||
|
||||
ContentReader targetReader = targetWriter.getReader();
|
||||
String checkContent = targetReader.getContentString();
|
||||
System.err.println(checkContent);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void additionalContentCheck(String sourceMimetype,
|
||||
String targetMimetype, String contents) {
|
||||
if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_CSV)) {
|
||||
System.err.println(contents);
|
||||
} else {
|
||||
super.additionalContentCheck(sourceMimetype, targetMimetype, contents);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean isQuickPhraseExpected(String targetMimetype) {
|
||||
if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_CSV)) {
|
||||
return true;
|
||||
}
|
||||
return super.isQuickPhraseExpected(targetMimetype);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests a specific failure in the library
|
||||
*/
|
||||
public void xtestBugFixAR114() throws Exception
|
||||
public void xxtestBugFixAR114() throws Exception
|
||||
{
|
||||
File tempFile = TempFileProvider.createTempFile(
|
||||
getClass().getSimpleName() + "_" + getName() + "_",
|
||||
|
@@ -0,0 +1,95 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.Parser;
|
||||
|
||||
/**
|
||||
* A Content Extractor for XML, HTML and Text,
|
||||
* which makes use of the Apache Tika
|
||||
* auto-detection to select the best parser
|
||||
* to process your document.
|
||||
* This will be used for all files which Tika can
|
||||
* handle, but where no other more explicit
|
||||
* extractor is defined.
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public class TikaAutoContentTransformer extends TikaPoweredContentTransformer
|
||||
{
|
||||
/**
|
||||
* We support all the mimetypes that the Tika
|
||||
* auto-detect parser can handle, except for
|
||||
* Image, Audio and Video ones which don't
|
||||
* make much sense
|
||||
*/
|
||||
public static ArrayList<String> SUPPORTED_MIMETYPES;
|
||||
static {
|
||||
SUPPORTED_MIMETYPES = new ArrayList<String>();
|
||||
AutoDetectParser p = new AutoDetectParser();
|
||||
for(MediaType mt : p.getParsers().keySet()) {
|
||||
if(mt.toString().startsWith("application/vnd.oasis.opendocument.formula")) {
|
||||
// TODO Tika support for quick.odf, mimetype=application/vnd.oasis.opendocument.formula
|
||||
// TODO Tika support for quick.otf, mimetype=application/vnd.oasis.opendocument.formula-template
|
||||
continue;
|
||||
}
|
||||
if(mt.toString().startsWith("application/vnd.oasis.opendocument.graphics")) {
|
||||
// TODO Tika support for quick.odg, mimetype=application/vnd.oasis.opendocument.graphics
|
||||
// TODO Tika support for quick.otg, mimetype=application/vnd.oasis.opendocument.graphics-template
|
||||
continue;
|
||||
}
|
||||
|
||||
if(mt.getType().equals("image") ||
|
||||
mt.getType().equals("audio") ||
|
||||
mt.getType().equals("video") ||
|
||||
mt.toString().equals("application/zip") ||
|
||||
mt.toString().equals("application/tar"))
|
||||
{
|
||||
// Skip these, as Tika mostly just does
|
||||
// metadata rather than content
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
// Tika can probably do some useful text
|
||||
SUPPORTED_MIMETYPES.add( mt.toString() );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public TikaAutoContentTransformer()
|
||||
{
|
||||
super(SUPPORTED_MIMETYPES);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the Tika Auto-Detection
|
||||
* parser, which will try to
|
||||
* process all documents that Tika
|
||||
* knows about
|
||||
*/
|
||||
protected Parser getParser()
|
||||
{
|
||||
return new AutoDetectParser();
|
||||
}
|
||||
}
|
@@ -0,0 +1,91 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
|
||||
/**
|
||||
* Most of the work for testing the Tika Auto-Detect transformer
|
||||
* is automatically done by {@link AbstractContentTransformerTest}
|
||||
*
|
||||
* @see org.alfresco.repo.content.transform.TikaAutoContentTransformer
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public class TikaAutoContentTransformerTest extends TikaPoweredContentTransformerTest
|
||||
{
|
||||
private ContentTransformer transformer;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception
|
||||
{
|
||||
super.setUp();
|
||||
|
||||
transformer = new TikaAutoContentTransformer();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the same transformer regardless - it is allowed
|
||||
*/
|
||||
protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
|
||||
{
|
||||
return transformer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure we picked up a mixture of content
|
||||
* types from Tika
|
||||
*/
|
||||
public void testIsTransformable() throws Exception
|
||||
{
|
||||
// Excel (but this isn't normally used)
|
||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_EXCEL, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||
|
||||
// Word
|
||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_WORD, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||
|
||||
// PDF
|
||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_PDF, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||
|
||||
// Open Office
|
||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OPENDOCUMENT_PRESENTATION, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENDOCUMENT_PRESENTATION, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENDOCUMENT_PRESENTATION, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENDOCUMENT_PRESENTATION, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||
|
||||
// We don't do images
|
||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_IMAGE_JPEG, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_IMAGE_JPEG, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_IMAGE_JPEG, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||
// Ditto music
|
||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_MP3, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_MP3, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_MP3, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||
}
|
||||
}
|
@@ -0,0 +1,192 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Writer;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.TransformerConfigurationException;
|
||||
import javax.xml.transform.sax.SAXTransformerFactory;
|
||||
import javax.xml.transform.sax.TransformerHandler;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.sax.BodyContentHandler;
|
||||
import org.xml.sax.ContentHandler;
|
||||
|
||||
/**
|
||||
* Provides helpful services for {@link org.alfresco.repo.content.transform.ContentTransformer}
|
||||
* implementations which are powered by Apache Tika.
|
||||
*
|
||||
* To use Tika to transform some content into Text, HTML or XML, either create an
|
||||
* implementation of this class or use the Auto Detect transformer.
|
||||
*
|
||||
* For now, all transformers are registered as regular, rather than explicit
|
||||
* transformations. This should allow you to register your own explicit
|
||||
* transformers and have them nicely take priority.
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public abstract class TikaPoweredContentTransformer extends AbstractContentTransformer2
|
||||
{
|
||||
private static final Log logger = LogFactory.getLog(TikaPoweredContentTransformer.class);
|
||||
protected List<String> sourceMimeTypes;
|
||||
|
||||
/**
|
||||
* Windows carriage return line feed pair.
|
||||
*/
|
||||
protected static final String LINE_BREAK = "\r\n";
|
||||
public static final String WRONG_FORMAT_MESSAGE_ID = "transform.err.format_or_password";
|
||||
|
||||
protected TikaPoweredContentTransformer(List<String> sourceMimeTypes) {
|
||||
this.sourceMimeTypes = sourceMimeTypes;
|
||||
}
|
||||
protected TikaPoweredContentTransformer(String[] sourceMimeTypes) {
|
||||
this(Arrays.asList(sourceMimeTypes));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the correct Tika Parser to process
|
||||
* the document.
|
||||
* If you don't know which you want, use
|
||||
* {@link TikaAutoContentTransformer} which
|
||||
* makes use of the Tika auto-detection.
|
||||
*/
|
||||
protected abstract Parser getParser();
|
||||
|
||||
/**
|
||||
* Can we do the requested transformation via Tika?
|
||||
* We support transforming to HTML, XML or Text
|
||||
*/
|
||||
public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options)
|
||||
{
|
||||
if(! sourceMimeTypes.contains(sourceMimetype))
|
||||
{
|
||||
// The source isn't one of ours
|
||||
return false;
|
||||
}
|
||||
|
||||
if(MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype) ||
|
||||
MimetypeMap.MIMETYPE_HTML.equals(targetMimetype) ||
|
||||
MimetypeMap.MIMETYPE_XML.equals(targetMimetype))
|
||||
{
|
||||
// We can output to this
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
// We support the source, but not the target
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an appropriate Tika ContentHandler for the
|
||||
* requested content type. Normally you'll let this
|
||||
* work as default, but if you need fine-grained
|
||||
* control of how the Tika events become text then
|
||||
* override and supply your own.
|
||||
*/
|
||||
protected ContentHandler getContentHandler(String targetMimeType, Writer output)
|
||||
throws TransformerConfigurationException
|
||||
{
|
||||
if(MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimeType))
|
||||
{
|
||||
return new BodyContentHandler(output);
|
||||
}
|
||||
|
||||
SAXTransformerFactory factory = (SAXTransformerFactory)
|
||||
SAXTransformerFactory.newInstance();
|
||||
TransformerHandler handler = factory.newTransformerHandler();
|
||||
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
|
||||
handler.setResult(new StreamResult(output));
|
||||
|
||||
if(MimetypeMap.MIMETYPE_HTML.equals(targetMimeType))
|
||||
{
|
||||
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
|
||||
}
|
||||
else if(MimetypeMap.MIMETYPE_XML.equals(targetMimeType))
|
||||
{
|
||||
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new TransformerInfoException(
|
||||
WRONG_FORMAT_MESSAGE_ID,
|
||||
new IllegalArgumentException("Requested target type " + targetMimeType + " not supported")
|
||||
);
|
||||
}
|
||||
return handler;
|
||||
}
|
||||
|
||||
public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options)
|
||||
throws Exception
|
||||
{
|
||||
InputStream is = reader.getContentInputStream();
|
||||
OutputStream os = writer.getContentOutputStream();
|
||||
String encoding = writer.getEncoding();
|
||||
String targetMimeType = writer.getMimetype();
|
||||
|
||||
Writer ow = new OutputStreamWriter(os, encoding);
|
||||
|
||||
Parser parser = getParser();
|
||||
Metadata metadata = new Metadata();
|
||||
ParseContext context = new ParseContext();
|
||||
|
||||
ContentHandler handler = getContentHandler(targetMimeType, ow);
|
||||
if(handler == null)
|
||||
{
|
||||
throw new TransformerConfigurationException(
|
||||
"Unable to create Tika Handler for configured output " + targetMimeType
|
||||
);
|
||||
}
|
||||
|
||||
try {
|
||||
parser.parse(is, handler, metadata, context);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (is != null)
|
||||
{
|
||||
try { is.close(); } catch (Throwable e) {}
|
||||
}
|
||||
if (ow != null)
|
||||
{
|
||||
try { ow.close(); } catch (Throwable e) {}
|
||||
}
|
||||
if (os != null)
|
||||
{
|
||||
try { os.close(); } catch (Throwable e) {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,101 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
|
||||
/**
|
||||
* Parent test for Tika powered transformer tests
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public abstract class TikaPoweredContentTransformerTest extends AbstractContentTransformerTest
|
||||
{
|
||||
protected boolean isQuickPhraseExpected(String targetMimetype)
|
||||
{
|
||||
return (
|
||||
targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_PLAIN) ||
|
||||
targetMimetype.equals(MimetypeMap.MIMETYPE_HTML) ||
|
||||
targetMimetype.equals(MimetypeMap.MIMETYPE_XML)
|
||||
);
|
||||
}
|
||||
protected boolean isQuickWordsExpected(String targetMimetype)
|
||||
{
|
||||
return (
|
||||
targetMimetype.startsWith(StringExtractingContentTransformer.PREFIX_TEXT) ||
|
||||
targetMimetype.equals(MimetypeMap.MIMETYPE_HTML) ||
|
||||
targetMimetype.equals(MimetypeMap.MIMETYPE_XML)
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests for html vs xml vs plain text
|
||||
*/
|
||||
protected void additionalContentCheck(String sourceMimetype, String targetMimetype, String contents)
|
||||
{
|
||||
if(targetMimetype.equals(MimetypeMap.MIMETYPE_XML))
|
||||
{
|
||||
// Look for header and footer to confirm it was translated
|
||||
assertTrue(
|
||||
"XML header not found",
|
||||
contents.contains("<?xml version=")
|
||||
);
|
||||
assertTrue(
|
||||
"XHTML header not found",
|
||||
contents.contains("<html")
|
||||
);
|
||||
assertTrue(
|
||||
"XHTML footer not found",
|
||||
contents.contains("</html>")
|
||||
);
|
||||
}
|
||||
else if(targetMimetype.equals(MimetypeMap.MIMETYPE_HTML))
|
||||
{
|
||||
// Look for header and footer to confirm it was translated
|
||||
assertFalse(
|
||||
"XML header found but shouldn't be there for HTML",
|
||||
contents.contains("<?xml version=")
|
||||
);
|
||||
assertTrue(
|
||||
"HTML header not found",
|
||||
contents.contains("<html")
|
||||
);
|
||||
assertTrue(
|
||||
"HTML footer not found",
|
||||
contents.contains("</html>")
|
||||
);
|
||||
}
|
||||
else if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_PLAIN))
|
||||
{
|
||||
// Ensure it really is plain text not xml/html
|
||||
assertFalse(
|
||||
"XML header found but shouldn't be there for Plain Text",
|
||||
contents.contains("<?xml version=")
|
||||
);
|
||||
assertFalse(
|
||||
"XHTML header found but shouldn't be there for Plain Text",
|
||||
contents.contains("<html")
|
||||
);
|
||||
assertFalse(
|
||||
"XHTML footer found but shouldn't be there for Plain Text",
|
||||
contents.contains("</html>")
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user