Initial Tika support for Text content transforms

The POI HSSF transformer has been updated to use Tika. A Tika auto-detect
 transformer has also been added, which handles a large number of
 previously unhandled cases. Unit tests verify this.


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20769 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-06-23 11:40:17 +00:00
parent 4ccc015f5f
commit f3a7a0aa7c
10 changed files with 670 additions and 233 deletions

View File

@@ -363,6 +363,10 @@
</property>
</bean>
<bean id="transformer.TikaAuto"
class="org.alfresco.repo.content.transform.TikaAutoContentTransformer"
parent="baseContentTransformer" />
<bean id="transformer.Poi"
class="org.alfresco.repo.content.transform.PoiHssfContentTransformer"
parent="baseContentTransformer" />

View File

@@ -28,6 +28,7 @@ import org.alfresco.repo.content.metadata.OpenOfficeMetadataExtracterTest;
import org.alfresco.repo.content.metadata.PdfBoxMetadataExtracterTest;
import org.alfresco.repo.content.metadata.PoiMetadataExtracterTest;
import org.alfresco.repo.content.metadata.RFC822MetadataExtracterTest;
import org.alfresco.repo.content.metadata.TikaAutoMetadataExtracterTest;
import org.alfresco.repo.content.transform.BinaryPassThroughContentTransformerTest;
import org.alfresco.repo.content.transform.ComplexContentTransformerTest;
import org.alfresco.repo.content.transform.ContentTransformerRegistryTest;
@@ -41,6 +42,7 @@ import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTe
import org.alfresco.repo.content.transform.StringExtractingContentTransformerTest;
import org.alfresco.repo.content.transform.TextMiningContentTransformerTest;
import org.alfresco.repo.content.transform.TextToPdfContentTransformerTest;
import org.alfresco.repo.content.transform.TikaAutoContentTransformerTest;
import org.alfresco.repo.content.transform.magick.ImageMagickContentTransformerTest;
import org.alfresco.util.ApplicationContextHelper;
import org.springframework.context.ApplicationContext;
@@ -91,6 +93,7 @@ public class ContentMinimalContextTestSuite extends TestSuite
suite.addTestSuite( PdfBoxMetadataExtracterTest.class );
suite.addTestSuite( PoiMetadataExtracterTest.class );
suite.addTestSuite( RFC822MetadataExtracterTest.class );
suite.addTestSuite( TikaAutoMetadataExtracterTest.class );
// Transform tests
suite.addTestSuite(BinaryPassThroughContentTransformerTest.class);
@@ -106,6 +109,7 @@ public class ContentMinimalContextTestSuite extends TestSuite
suite.addTestSuite(StringExtractingContentTransformerTest.class);
suite.addTestSuite(TextMiningContentTransformerTest.class);
suite.addTestSuite(TextToPdfContentTransformerTest.class);
suite.addTestSuite(TikaAutoContentTransformerTest.class);
suite.addTestSuite(ImageMagickContentTransformerTest.class);
return suite;

View File

@@ -52,6 +52,7 @@ public class MimetypeMap implements MimetypeService
public static final String MIMETYPE_TEXT_PLAIN = "text/plain";
public static final String MIMETYPE_TEXT_MEDIAWIKI = "text/mediawiki";
public static final String MIMETYPE_TEXT_CSS = "text/css";
public static final String MIMETYPE_TEXT_CSV = "text/csv";
public static final String MIMETYPE_TEXT_JAVASCRIPT = "text/javascript";
public static final String MIMETYPE_XML = "text/xml";
public static final String MIMETYPE_HTML = "text/html";

View File

@@ -238,6 +238,9 @@ public abstract class AbstractContentTransformerTest extends TestCase
" source: " + sourceReader + "\n" +
" target: " + targetWriter,
checkContent.contains(QUICK_CONTENT));
// Let subclasses do extra checks if they want
additionalContentCheck(sourceMimetype, targetMimetype, checkContent);
}
else if (isQuickWordsExpected(targetMimetype))
{
@@ -280,6 +283,13 @@ public abstract class AbstractContentTransformerTest extends TestCase
outputWriter.setEncoding("UTF8");
outputWriter.putContent(sb.toString());
}
/**
 * Allows implementations to do some extra checks on the
 * results of the content as found by
 * {@link #testAllConversions()}
 *
 * The default implementation is a no-op; subclasses override this
 * to add format-specific assertions on the transformed content.
 *
 * @param sourceMimetype the mimetype the content was transformed from
 * @param targetMimetype the mimetype the content was transformed to
 * @param contents the transformed content that was produced
 */
protected void additionalContentCheck(String sourceMimetype, String targetMimetype, String contents) {}
/**
* This method is an extension point for enabling/disabling an assertion that the "quick brown fox"

View File

@@ -18,267 +18,164 @@
*/
package org.alfresco.repo.content.transform;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Writer;
import java.util.regex.Pattern;
import javax.xml.transform.TransformerConfigurationException;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.util.RecordFormatException;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* Makes use of the {@link http://jakarta.apache.org/poi/ POI} library to
* perform conversions from Excel spreadsheets to text (comma separated).
* <p>
* While most text extraction from spreadsheets only extract the first sheet of
* the workbook, the method used here extracts the text from <b>all the sheets</b>.
* This is more useful, especially when it comes to indexing spreadsheets.
* <p>
* In the case where there is only one sheet in the document, the results will be
* exactly the same as most extractors. Where there are multiple sheets, the results
* will differ, but meaningful reimporting of the text document is not possible
* anyway.
* Uses {@link http://tika.apache.org/ Apache Tika} and
* {@link http://poi.apache.org/ Apache POI} to perform
* conversions from Excel spreadsheets.
* <p>Will transform from Excel spreadsheets into Html,
* Xml or Text (space or comma separated)
* <p>Handles all sheets in the file.
*
* TODO CSV Support
*
* @author Nick Burch
* @author Derek Hulley
*/
public class PoiHssfContentTransformer extends AbstractContentTransformer2
public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
{
/**
* Error message to delegate to NodeInfoBean
*/
public static final String WRONG_FORMAT_MESSAGE_ID = "transform.err.format_or_password";
/**
* Windows carriage return line feed pair.
*/
private static final String LINE_BREAK = "\r\n";
private static Log logger = LogFactory.getLog(PoiHssfContentTransformer.class);
/**
 * Creates the transformer, registering Excel as the only
 * supported source mimetype.
 */
public PoiHssfContentTransformer()
{
    super(new String[] {
        MimetypeMap.MIMETYPE_EXCEL
    });
}
/**
 * Returns the Tika parser for the Microsoft Office binary
 * formats, used here for Excel (.xls) content.
 */
@Override
protected Parser getParser()
{
    return new OfficeParser();
}
/**
* Currently the only transformation performed is that of text extraction from XLS documents.
* Can we do the requested transformation via Tika?
* We support transforming to HTML, XML, Text or CSV
*/
@Override
public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options)
{
if (!MimetypeMap.MIMETYPE_EXCEL.equals(sourceMimetype) ||
!MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
{
// only support XLS -> Text
return false;
}
else
{
return true;
}
if(sourceMimeTypes.contains(sourceMimetype) &&
MimetypeMap.MIMETYPE_TEXT_CSV.equals(targetMimetype))
{
// Special case for CSV
return true;
}
// Otherwise fall back on the default Tika rules
return super.isTransformable(sourceMimetype, targetMimetype, options);
}
public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options)
throws Exception
@Override
protected ContentHandler getContentHandler(String targetMimeType, Writer output)
throws TransformerConfigurationException
{
InputStream is = reader.getContentInputStream();
OutputStream os = writer.getContentOutputStream();
String encoding = writer.getEncoding();
try
{
// open the workbook
HSSFWorkbook workbook = new HSSFWorkbook(is);
// how many sheets are there?
int sheetCount = workbook.getNumberOfSheets();
// transform each sheet
for (int i = 0; i < sheetCount; i++)
{
HSSFSheet sheet = workbook.getSheetAt(i);
String sheetName = workbook.getSheetName(i);
writeSheet(os, sheet, encoding);
// write the sheet name
PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
PoiHssfContentTransformer.writeString(os, encoding, "End of sheet: " + sheetName, true);
PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
}
}
catch (RecordFormatException ex)
{
// Catching specific exception to propagate it to NodeInfoBean
// to fix issue https://issues.alfresco.com/jira/browse/ETWOTWO-440
logger.error(ex);
throw new TransformerInfoException(WRONG_FORMAT_MESSAGE_ID, ex);
}
finally
{
if (is != null)
{
try { is.close(); } catch (Throwable e) {}
}
if (os != null)
{
try { os.close(); } catch (Throwable e) {}
}
}
if(MimetypeMap.MIMETYPE_TEXT_CSV.equals(targetMimeType))
{
return new CsvContentHandler(output);
}
// Otherwise use the normal Tika rules
return super.getContentHandler(targetMimeType, output);
}
/**
* Dumps the text from the sheet to the stream in CSV format
* A wrapper around the normal Tika BodyContentHandler,
* which causes things to be CSV encoded rather than
* tab separated
* TODO Get rid of the extra tabs that crop up
*/
private void writeSheet(OutputStream os, HSSFSheet sheet, String encoding) throws Exception
{
int rows = sheet.getLastRowNum();
// transform each row
for (int i = 0; i <= rows; i++)
{
HSSFRow row = sheet.getRow(i);
if (row != null)
{
writeRow(os, row, encoding);
}
// break between rows
if (i < rows)
{
PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
}
}
}
private void writeRow(OutputStream os, HSSFRow row, String encoding) throws Exception
{
short firstCellNum = row.getFirstCellNum();
short lastCellNum = row.getLastCellNum();
// pad out to first cell
for (int i = 0; i < firstCellNum; i++)
{
PoiHssfContentTransformer.writeString(os, encoding, ",", false); // CSV up to first cell
}
// write each cell
for (int i = 0; i <= lastCellNum; i++)
{
HSSFCell cell = row.getCell(i);
if (cell != null)
{
int cellType = cell.getCellType();
protected static class CsvContentHandler extends BodyContentHandler {
private static final char[] comma = new char[]{ ',' };
private static final Pattern all_nums = Pattern.compile("[\\d\\.\\-\\+]+");
private boolean inCell = false;
private boolean needsComma = false;
protected CsvContentHandler(Writer output) {
super(output);
}
StringBuilder sb = new StringBuilder(10);
switch (cellType)
{
case HSSFCell.CELL_TYPE_BLANK:
// ignore
break;
case HSSFCell.CELL_TYPE_BOOLEAN:
sb.append(cell.getBooleanCellValue());
break;
case HSSFCell.CELL_TYPE_ERROR:
sb.append("ERROR");
break;
case HSSFCell.CELL_TYPE_NUMERIC:
sb.append(cell.getNumericCellValue());
break;
case HSSFCell.CELL_TYPE_STRING:
sb.append(cell.getStringCellValue());
break;
case HSSFCell.CELL_TYPE_FORMULA:
final int formulaResultType = cell.getCachedFormulaResultType();
if (HSSFCell.CELL_TYPE_NUMERIC == formulaResultType)
{
sb.append(cell.getNumericCellValue());
}
else if (HSSFCell.CELL_TYPE_STRING == formulaResultType)
{
sb.append(cell.getStringCellValue());
}
else if (HSSFCell.CELL_TYPE_BOOLEAN == formulaResultType)
{
sb.append(cell.getBooleanCellValue());
}
else if (HSSFCell.CELL_TYPE_ERROR == formulaResultType)
{
sb.append(cell.getErrorCellValue());
}
else
{
throw new RuntimeException("Unknown formula result type: " + formulaResultType);
}
break;
default:
throw new RuntimeException("Unknown HSSF cell type: " + cell);
}
String data = sb.toString();
PoiHssfContentTransformer.writeString(os, encoding, data, true);
}
// comma separate if required
if (i < lastCellNum)
@Override
public void characters(char[] ch, int start, int length)
throws SAXException {
if(inCell) {
StringBuffer t = new StringBuffer(new String(ch,start,length));
// Quote if not all numbers
if(all_nums.matcher(t).matches())
{
PoiHssfContentTransformer.writeString(os, encoding, ",", false);
super.characters(ch, start, length);
}
}
}
/**
* Writes the given data to the stream using the encoding specified. If the encoding
* is not given, the default <tt>String</tt> to <tt>byte[]</tt> conversion will be
* used.
* <p>
* The given data string will be escaped appropriately.
*
* @param os the stream to write to
* @param encoding the encoding to use, or null if the default encoding is acceptable
* @param value the string to write
* @param isData true if the value represents a human-readable string, false if the
* value represents formatting characters, separating characters, etc.
* @throws Exception
*/
public static void writeString(OutputStream os, String encoding, String value, boolean isData) throws Exception
{
if (value == null)
{
// nothing to do
return;
}
int dataLength = value.length();
if (dataLength == 0)
{
// nothing to do
return;
}
// escape the string
StringBuilder sb = new StringBuilder(dataLength + 5); // slightly longer than the data
for (int i = 0; i < dataLength; i++)
{
char currentChar = value.charAt(i);
if (currentChar == '\"') // inverted commas
else
{
sb.append("\""); // CSV escaping of inverted commas
for(int i=t.length()-1; i>=0; i--) {
if(t.charAt(i) == '\"') {
// Double up double quotes
t.insert(i, '\"');
i--;
}
}
t.insert(0, '\"');
t.append('\"');
char[] c = t.toString().toCharArray();
super.characters(c, 0, c.length);
}
// append the char
sb.append(currentChar);
}
// enclose in inverted commas for safety
if (isData)
{
sb.insert(0, "\"");
sb.append("\"");
}
// escaping complete
value = sb.toString();
byte[] bytes = null;
if (encoding == null)
{
// use default encoding
bytes = value.getBytes();
}
else
{
bytes = value.getBytes(encoding);
}
// write to the stream
os.write(bytes);
// done
} else {
super.characters(ch, start, length);
}
}
@Override
public void startElement(String uri, String localName, String name,
Attributes atts) throws SAXException {
if(localName.equals("td")) {
localName = "span";
name = "span";
inCell = true;
if(needsComma) {
super.characters(comma, 0, 1);
needsComma = true;
}
}
super.startElement(uri, localName, name, atts);
}
@Override
public void endElement(String uri, String localName, String name)
throws SAXException {
if(localName.equals("td")) {
localName = "span";
name = "span";
needsComma = true;
inCell = false;
}
if(localName.equals("tr")) {
needsComma = false;
}
super.endElement(uri, localName, name);
}
}
}

View File

@@ -22,8 +22,10 @@ import java.io.File;
import java.io.InputStream;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.filestore.FileContentReader;
import org.alfresco.repo.content.filestore.FileContentWriter;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.alfresco.util.TempFileProvider;
@@ -32,7 +34,7 @@ import org.alfresco.util.TempFileProvider;
*
* @author Derek Hulley
*/
public class PoiHssfContentTransformerTest extends AbstractContentTransformerTest
public class PoiHssfContentTransformerTest extends TikaPoweredContentTransformerTest
{
private ContentTransformer transformer;
@@ -56,12 +58,52 @@ public class PoiHssfContentTransformerTest extends AbstractContentTransformerTes
{
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_EXCEL, new TransformationOptions()));
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_TEXT_CSV, new TransformationOptions()));
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
}
/**
public void testCsvOutput() throws Exception
{
File sourceFile = AbstractContentTransformerTest.loadQuickTestFile("xls");
ContentReader sourceReader = new FileContentReader(sourceFile);
File targetFile = TempFileProvider.createTempFile(
getClass().getSimpleName() + "_" + getName() + "_xls_",
".csv");
ContentWriter targetWriter = new FileContentWriter(targetFile);
sourceReader.setMimetype(MimetypeMap.MIMETYPE_EXCEL);
targetWriter.setMimetype(MimetypeMap.MIMETYPE_TEXT_CSV);
transformer.transform(sourceReader, targetWriter);
ContentReader targetReader = targetWriter.getReader();
String checkContent = targetReader.getContentString();
System.err.println(checkContent);
}
@Override
protected void additionalContentCheck(String sourceMimetype,
String targetMimetype, String contents) {
if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_CSV)) {
System.err.println(contents);
} else {
super.additionalContentCheck(sourceMimetype, targetMimetype, contents);
}
}
@Override
protected boolean isQuickPhraseExpected(String targetMimetype) {
if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_CSV)) {
return true;
}
return super.isQuickPhraseExpected(targetMimetype);
}
/**
* Tests a specific failure in the library
*/
public void xtestBugFixAR114() throws Exception
public void xxtestBugFixAR114() throws Exception
{
File tempFile = TempFileProvider.createTempFile(
getClass().getSimpleName() + "_" + getName() + "_",

View File

@@ -0,0 +1,95 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.transform;
import java.util.ArrayList;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
/**
* A Content Extractor for XML, HTML and Text,
* which makes use of the Apache Tika
* auto-detection to select the best parser
* to process your document.
* This will be used for all files which Tika can
* handle, but where no other more explicit
* extractor is defined.
*
* @author Nick Burch
*/
public class TikaAutoContentTransformer extends TikaPoweredContentTransformer
{
/**
* We support all the mimetypes that the Tika
* auto-detect parser can handle, except for
* Image, Audio and Video ones which don't
* make much sense
*/
public static ArrayList<String> SUPPORTED_MIMETYPES;
static {
SUPPORTED_MIMETYPES = new ArrayList<String>();
AutoDetectParser p = new AutoDetectParser();
for(MediaType mt : p.getParsers().keySet()) {
if(mt.toString().startsWith("application/vnd.oasis.opendocument.formula")) {
// TODO Tika support for quick.odf, mimetype=application/vnd.oasis.opendocument.formula
// TODO Tika support for quick.otf, mimetype=application/vnd.oasis.opendocument.formula-template
continue;
}
if(mt.toString().startsWith("application/vnd.oasis.opendocument.graphics")) {
// TODO Tika support for quick.odg, mimetype=application/vnd.oasis.opendocument.graphics
// TODO Tika support for quick.otg, mimetype=application/vnd.oasis.opendocument.graphics-template
continue;
}
if(mt.getType().equals("image") ||
mt.getType().equals("audio") ||
mt.getType().equals("video") ||
mt.toString().equals("application/zip") ||
mt.toString().equals("application/tar"))
{
// Skip these, as Tika mostly just does
// metadata rather than content
}
else
{
// Tika can probably do some useful text
SUPPORTED_MIMETYPES.add( mt.toString() );
}
}
}
public TikaAutoContentTransformer()
{
super(SUPPORTED_MIMETYPES);
}
/**
* Returns the Tika Auto-Detection
* parser, which will try to
* process all documents that Tika
* knows about
*/
protected Parser getParser()
{
return new AutoDetectParser();
}
}

View File

@@ -0,0 +1,91 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.transform;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.TransformationOptions;
/**
 * Most of the work for testing the Tika Auto-Detect transformer
 * is automatically done by {@link AbstractContentTransformerTest}
 *
 * @see org.alfresco.repo.content.transform.TikaAutoContentTransformer
 *
 * @author Nick Burch
 */
public class TikaAutoContentTransformerTest extends TikaPoweredContentTransformerTest
{
   private ContentTransformer transformer;

   @Override
   public void setUp() throws Exception
   {
      super.setUp();
      transformer = new TikaAutoContentTransformer();
   }

   /**
    * @return Returns the same transformer regardless - it is allowed
    */
   protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
   {
      return transformer;
   }

   /**
    * Checks that the given source mimetype can be turned into
    * text, html and xml, but that the reverse transformation
    * from plain text is rejected.
    */
   private void checkSupportedSource(String sourceMimetype)
   {
      assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, sourceMimetype, new TransformationOptions()));
      assertTrue(transformer.isTransformable(sourceMimetype, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
      assertTrue(transformer.isTransformable(sourceMimetype, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
      assertTrue(transformer.isTransformable(sourceMimetype, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
   }

   /**
    * Checks that the given source mimetype is rejected for
    * all three of the text, html and xml targets.
    */
   private void checkUnsupportedSource(String sourceMimetype)
   {
      assertFalse(transformer.isTransformable(sourceMimetype, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
      assertFalse(transformer.isTransformable(sourceMimetype, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
      assertFalse(transformer.isTransformable(sourceMimetype, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
   }

   /**
    * Ensure we picked up a mixture of content
    * types from Tika
    */
   public void testIsTransformable() throws Exception
   {
      // Excel (but this isn't normally used)
      checkSupportedSource(MimetypeMap.MIMETYPE_EXCEL);
      // Word
      checkSupportedSource(MimetypeMap.MIMETYPE_WORD);
      // PDF
      checkSupportedSource(MimetypeMap.MIMETYPE_PDF);
      // Open Office
      checkSupportedSource(MimetypeMap.MIMETYPE_OPENDOCUMENT_PRESENTATION);
      // We don't do images
      checkUnsupportedSource(MimetypeMap.MIMETYPE_IMAGE_JPEG);
      // Ditto music
      checkUnsupportedSource(MimetypeMap.MIMETYPE_MP3);
   }
}

View File

@@ -0,0 +1,192 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.transform;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Arrays;
import java.util.List;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
/**
 * Provides helpful services for {@link org.alfresco.repo.content.transform.ContentTransformer}
 * implementations which are powered by Apache Tika.
 *
 * To use Tika to transform some content into Text, Html or XML, create an
 * implementation of this / use the Auto Detect transformer.
 *
 * For now, all transformers are registered as regular, rather than explicit
 * transformations. This should allow you to register your own explicit
 * transformers and have them nicely take priority.
 *
 * @author Nick Burch
 */
public abstract class TikaPoweredContentTransformer extends AbstractContentTransformer2
{
   private static final Log logger = LogFactory.getLog(TikaPoweredContentTransformer.class);

   /** The source mimetypes this transformer will accept. */
   protected List<String> sourceMimeTypes;

   /**
    * Windows carriage return line feed pair.
    */
   protected static final String LINE_BREAK = "\r\n";

   /** Error message id used when the format is wrong or the file is password protected. */
   public static final String WRONG_FORMAT_MESSAGE_ID = "transform.err.format_or_password";

   protected TikaPoweredContentTransformer(List<String> sourceMimeTypes) {
      this.sourceMimeTypes = sourceMimeTypes;
   }
   protected TikaPoweredContentTransformer(String[] sourceMimeTypes) {
      this(Arrays.asList(sourceMimeTypes));
   }

   /**
    * Returns the correct Tika Parser to process
    * the document.
    * If you don't know which you want, use
    * {@link TikaAutoContentTransformer} which
    * makes use of the Tika auto-detection.
    */
   protected abstract Parser getParser();

   /**
    * Can we do the requested transformation via Tika?
    * We support transforming to HTML, XML or Text
    */
   public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options)
   {
      if(! sourceMimeTypes.contains(sourceMimetype))
      {
         // The source isn't one of ours
         return false;
      }
      if(MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype) ||
         MimetypeMap.MIMETYPE_HTML.equals(targetMimetype) ||
         MimetypeMap.MIMETYPE_XML.equals(targetMimetype))
      {
         // We can output to this
         return true;
      }
      else
      {
         // We support the source, but not the target
         return false;
      }
   }

   /**
    * Returns an appropriate Tika ContentHandler for the
    * requested content type. Normally you'll let this
    * work as default, but if you need fine-grained
    * control of how the Tika events become text then
    * override and supply your own.
    *
    * @param targetMimeType the mimetype being written to
    * @param output where the transformed content should go
    * @throws TransformerInfoException if the target type is not text, html or xml
    */
   protected ContentHandler getContentHandler(String targetMimeType, Writer output)
        throws TransformerConfigurationException
   {
      if(MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimeType))
      {
         return new BodyContentHandler(output);
      }

      SAXTransformerFactory factory = (SAXTransformerFactory)
            SAXTransformerFactory.newInstance();
      TransformerHandler handler = factory.newTransformerHandler();
      handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
      handler.setResult(new StreamResult(output));

      if(MimetypeMap.MIMETYPE_HTML.equals(targetMimeType))
      {
         handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
      }
      else if(MimetypeMap.MIMETYPE_XML.equals(targetMimeType))
      {
         handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
      }
      else
      {
         throw new TransformerInfoException(
               WRONG_FORMAT_MESSAGE_ID,
               new IllegalArgumentException("Requested target type " + targetMimeType + " not supported")
         );
      }
      return handler;
   }

   /**
    * Performs the transformation by streaming the source content
    * through the Tika parser into the appropriate ContentHandler
    * for the target mimetype.
    */
   public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options)
        throws Exception
   {
      InputStream is = reader.getContentInputStream();
      OutputStream os = writer.getContentOutputStream();
      String encoding = writer.getEncoding();
      String targetMimeType = writer.getMimetype();

      // The writer may not have an encoding set; fall back on the
      //  platform default rather than hitting a NullPointerException
      //  in the OutputStreamWriter constructor
      Writer ow;
      if(encoding == null)
      {
         ow = new OutputStreamWriter(os);
      }
      else
      {
         ow = new OutputStreamWriter(os, encoding);
      }

      Parser parser = getParser();
      Metadata metadata = new Metadata();
      ParseContext context = new ParseContext();

      ContentHandler handler = getContentHandler(targetMimeType, ow);
      if(handler == null)
      {
         throw new TransformerConfigurationException(
               "Unable to create Tika Handler for configured output " + targetMimeType
         );
      }

      try {
         parser.parse(is, handler, metadata, context);
      }
      finally
      {
         if (is != null)
         {
            try { is.close(); } catch (Throwable e) {}
         }
         if (ow != null)
         {
            try { ow.close(); } catch (Throwable e) {}
         }
         if (os != null)
         {
            try { os.close(); } catch (Throwable e) {}
         }
      }
   }
}

View File

@@ -0,0 +1,101 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.transform;
import org.alfresco.repo.content.MimetypeMap;
/**
 * Parent test for Tika powered transformer tests
 *
 * @author Nick Burch
 */
public abstract class TikaPoweredContentTransformerTest extends AbstractContentTransformerTest
{
   protected boolean isQuickPhraseExpected(String targetMimetype)
   {
      return targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_PLAIN)
          || targetMimetype.equals(MimetypeMap.MIMETYPE_HTML)
          || targetMimetype.equals(MimetypeMap.MIMETYPE_XML);
   }

   protected boolean isQuickWordsExpected(String targetMimetype)
   {
      return targetMimetype.startsWith(StringExtractingContentTransformer.PREFIX_TEXT)
          || targetMimetype.equals(MimetypeMap.MIMETYPE_HTML)
          || targetMimetype.equals(MimetypeMap.MIMETYPE_XML);
   }

   /**
    * Tests for html vs xml vs plain text
    */
   protected void additionalContentCheck(String sourceMimetype, String targetMimetype, String contents)
   {
      // Probe once for the markers that distinguish the three formats
      boolean hasXmlDeclaration = contents.contains("<?xml version=");
      boolean hasHtmlOpeningTag = contents.contains("<html");
      boolean hasHtmlClosingTag = contents.contains("</html>");

      if(targetMimetype.equals(MimetypeMap.MIMETYPE_XML))
      {
         // Look for header and footer to confirm it was translated
         assertTrue("XML header not found", hasXmlDeclaration);
         assertTrue("XHTML header not found", hasHtmlOpeningTag);
         assertTrue("XHTML footer not found", hasHtmlClosingTag);
      }
      else if(targetMimetype.equals(MimetypeMap.MIMETYPE_HTML))
      {
         // Look for header and footer to confirm it was translated
         assertFalse("XML header found but shouldn't be there for HTML", hasXmlDeclaration);
         assertTrue("HTML header not found", hasHtmlOpeningTag);
         assertTrue("HTML footer not found", hasHtmlClosingTag);
      }
      else if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_PLAIN))
      {
         // Ensure it really is plain text not xml/html
         assertFalse("XML header found but shouldn't be there for Plain Text", hasXmlDeclaration);
         assertFalse("XHTML header found but shouldn't be there for Plain Text", hasHtmlOpeningTag);
         assertFalse("XHTML footer found but shouldn't be there for Plain Text", hasHtmlClosingTag);
      }
   }
}