From f3a7a0aa7c6f817f7b403d6cd8ddf8ba12ddc13e Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Wed, 23 Jun 2010 11:40:17 +0000 Subject: [PATCH] Initial Tika support for Text content transforms The POI HSSF transformer has been updated to use Tika. A Tika auto-detect transformer has also been added, which caters for a large number of previously un-handled cases. Unit tests check this. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20769 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261 --- config/alfresco/content-services-context.xml | 4 + .../ContentMinimalContextTestSuite.java | 4 + .../alfresco/repo/content/MimetypeMap.java | 1 + .../AbstractContentTransformerTest.java | 10 + .../transform/PoiHssfContentTransformer.java | 357 +++++++----------- .../PoiHssfContentTransformerTest.java | 48 ++- .../transform/TikaAutoContentTransformer.java | 95 +++++ .../TikaAutoContentTransformerTest.java | 91 +++++ .../TikaPoweredContentTransformer.java | 192 ++++++++++ .../TikaPoweredContentTransformerTest.java | 101 +++++ 10 files changed, 670 insertions(+), 233 deletions(-) create mode 100644 source/java/org/alfresco/repo/content/transform/TikaAutoContentTransformer.java create mode 100644 source/java/org/alfresco/repo/content/transform/TikaAutoContentTransformerTest.java create mode 100644 source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformer.java create mode 100644 source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformerTest.java diff --git a/config/alfresco/content-services-context.xml b/config/alfresco/content-services-context.xml index 21b5414923..06cdb84395 100644 --- a/config/alfresco/content-services-context.xml +++ b/config/alfresco/content-services-context.xml @@ -363,6 +363,10 @@ + + diff --git a/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java b/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java index bdcfce573d..30f44844e2 100644 --- a/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java +++ b/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java @@ -28,6 +28,7 @@ import org.alfresco.repo.content.metadata.OpenOfficeMetadataExtracterTest; import org.alfresco.repo.content.metadata.PdfBoxMetadataExtracterTest; import org.alfresco.repo.content.metadata.PoiMetadataExtracterTest; import org.alfresco.repo.content.metadata.RFC822MetadataExtracterTest; +import org.alfresco.repo.content.metadata.TikaAutoMetadataExtracterTest; import org.alfresco.repo.content.transform.BinaryPassThroughContentTransformerTest; import org.alfresco.repo.content.transform.ComplexContentTransformerTest; import org.alfresco.repo.content.transform.ContentTransformerRegistryTest; @@ -41,6 +42,7 @@ import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTe import org.alfresco.repo.content.transform.StringExtractingContentTransformerTest; import org.alfresco.repo.content.transform.TextMiningContentTransformerTest; import org.alfresco.repo.content.transform.TextToPdfContentTransformerTest; +import org.alfresco.repo.content.transform.TikaAutoContentTransformerTest; import org.alfresco.repo.content.transform.magick.ImageMagickContentTransformerTest; import org.alfresco.util.ApplicationContextHelper; import org.springframework.context.ApplicationContext; @@ -91,6 +93,7 @@ public class ContentMinimalContextTestSuite extends TestSuite suite.addTestSuite( PdfBoxMetadataExtracterTest.class ); suite.addTestSuite( PoiMetadataExtracterTest.class ); 
suite.addTestSuite( RFC822MetadataExtracterTest.class ); + suite.addTestSuite( TikaAutoMetadataExtracterTest.class ); // Transform tests suite.addTestSuite(BinaryPassThroughContentTransformerTest.class); @@ -106,6 +109,7 @@ public class ContentMinimalContextTestSuite extends TestSuite suite.addTestSuite(StringExtractingContentTransformerTest.class); suite.addTestSuite(TextMiningContentTransformerTest.class); suite.addTestSuite(TextToPdfContentTransformerTest.class); + suite.addTestSuite(TikaAutoContentTransformerTest.class); suite.addTestSuite(ImageMagickContentTransformerTest.class); return suite; diff --git a/source/java/org/alfresco/repo/content/MimetypeMap.java b/source/java/org/alfresco/repo/content/MimetypeMap.java index d5490136f9..947a3201df 100644 --- a/source/java/org/alfresco/repo/content/MimetypeMap.java +++ b/source/java/org/alfresco/repo/content/MimetypeMap.java @@ -52,6 +52,7 @@ public class MimetypeMap implements MimetypeService public static final String MIMETYPE_TEXT_PLAIN = "text/plain"; public static final String MIMETYPE_TEXT_MEDIAWIKI = "text/mediawiki"; public static final String MIMETYPE_TEXT_CSS = "text/css"; + public static final String MIMETYPE_TEXT_CSV = "text/csv"; public static final String MIMETYPE_TEXT_JAVASCRIPT = "text/javascript"; public static final String MIMETYPE_XML = "text/xml"; public static final String MIMETYPE_HTML = "text/html"; diff --git a/source/java/org/alfresco/repo/content/transform/AbstractContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/AbstractContentTransformerTest.java index 5e7dad0c6b..c6c02c3f99 100644 --- a/source/java/org/alfresco/repo/content/transform/AbstractContentTransformerTest.java +++ b/source/java/org/alfresco/repo/content/transform/AbstractContentTransformerTest.java @@ -238,6 +238,9 @@ public abstract class AbstractContentTransformerTest extends TestCase " source: " + sourceReader + "\n" + " target: " + targetWriter, checkContent.contains(QUICK_CONTENT)); + + // Let subclasses do extra checks if they want + additionalContentCheck(sourceMimetype, targetMimetype, checkContent); } else if (isQuickWordsExpected(targetMimetype)) { @@ -280,6 +283,13 @@ public abstract class AbstractContentTransformerTest extends TestCase outputWriter.setEncoding("UTF8"); outputWriter.putContent(sb.toString()); } + + /** + * Allows implementations to do some extra checks on the + * results of the content as found by + * {@link #testAllConversions()} + */ + protected void additionalContentCheck(String sourceMimetype, String targetMimetype, String contents) {} /** * This method is an extension point for enabling/disabling an assertion that the "quick brown fox" diff --git a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java index c5bc7f959f..5bc453e5c0 100644 --- a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java +++ b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java @@ -18,267 +18,164 @@ */ package org.alfresco.repo.content.transform; -import java.io.InputStream; -import java.io.OutputStream; +import java.io.Writer; +import java.util.regex.Pattern; + +import javax.xml.transform.TransformerConfigurationException; import org.alfresco.repo.content.MimetypeMap; -import org.alfresco.service.cmr.repository.ContentReader; -import org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.TransformationOptions; import 
org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.poi.hssf.usermodel.HSSFCell; -import org.apache.poi.hssf.usermodel.HSSFRow; -import org.apache.poi.hssf.usermodel.HSSFSheet; -import org.apache.poi.hssf.usermodel.HSSFWorkbook; -import org.apache.poi.util.RecordFormatException; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.microsoft.OfficeParser; +import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Makes use of the {@link http://jakarta.apache.org/poi/ POI} library to - * perform conversions from Excel spreadsheets to text (comma separated). - *
<p>
- * While most text extraction from spreadsheets only extract the first sheet of - * the workbook, the method used here extracts the text from all the sheets. - * This is more useful, especially when it comes to indexing spreadsheets. - *
<p>
- * In the case where there is only one sheet in the document, the results will be - * exactly the same as most extractors. Where there are multiple sheets, the results - * will differ, but meaningful reimporting of the text document is not possible - * anyway. + * Uses {@link http://tika.apache.org/ Apache Tika} and + * {@link http://poi.apache.org/ Apache POI} to perform + * conversions from Excel spreadsheets. + *
<p>
Will transform from Excel spreadsheets into Html, + * Xml or Text (space or comma separated) + *
<p>
Handles all sheets in the file. * + * TODO CSV Support + * + * @author Nick Burch * @author Derek Hulley */ -public class PoiHssfContentTransformer extends AbstractContentTransformer2 +public class PoiHssfContentTransformer extends TikaPoweredContentTransformer { /** * Error message to delegate to NodeInfoBean */ public static final String WRONG_FORMAT_MESSAGE_ID = "transform.err.format_or_password"; - - /** - * Windows carriage return line feed pair. - */ - private static final String LINE_BREAK = "\r\n"; private static Log logger = LogFactory.getLog(PoiHssfContentTransformer.class); + public PoiHssfContentTransformer() + { + super(new String[] { + MimetypeMap.MIMETYPE_EXCEL + }); + } + + @Override + protected Parser getParser() + { + return new OfficeParser(); + } + /** - * Currently the only transformation performed is that of text extraction from XLS documents. + * Can we do the requested transformation via Tika? + * We support transforming to HTML, XML, Text or CSV */ + @Override public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options) { - if (!MimetypeMap.MIMETYPE_EXCEL.equals(sourceMimetype) || - !MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype)) - { - // only support XLS -> Text - return false; - } - else - { - return true; - } + if(sourceMimeTypes.contains(sourceMimetype) && + MimetypeMap.MIMETYPE_TEXT_CSV.equals(targetMimetype)) + { + // Special case for CSV + return true; + } + + // Otherwise fall back on the default Tika rules + return super.isTransformable(sourceMimetype, targetMimetype, options); } - - public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options) - throws Exception + + @Override + protected ContentHandler getContentHandler(String targetMimeType, Writer output) + throws TransformerConfigurationException { - InputStream is = reader.getContentInputStream(); - OutputStream os = writer.getContentOutputStream(); - String encoding = writer.getEncoding(); - try - { - // open the workbook - HSSFWorkbook workbook = new HSSFWorkbook(is); - // how many sheets are there? 
- int sheetCount = workbook.getNumberOfSheets(); - // transform each sheet - for (int i = 0; i < sheetCount; i++) - { - HSSFSheet sheet = workbook.getSheetAt(i); - String sheetName = workbook.getSheetName(i); - writeSheet(os, sheet, encoding); - // write the sheet name - PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false); - PoiHssfContentTransformer.writeString(os, encoding, "End of sheet: " + sheetName, true); - PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false); - PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false); - } - } - catch (RecordFormatException ex) - { - // Catching specific exception to propagate it to NodeInfoBean - // to fix issue https://issues.alfresco.com/jira/browse/ETWOTWO-440 - - logger.error(ex); - throw new TransformerInfoException(WRONG_FORMAT_MESSAGE_ID, ex); - } - finally - { - if (is != null) - { - try { is.close(); } catch (Throwable e) {} - } - if (os != null) - { - try { os.close(); } catch (Throwable e) {} - } - } + if(MimetypeMap.MIMETYPE_TEXT_CSV.equals(targetMimeType)) + { + return new CsvContentHandler(output); + } + + // Otherwise use the normal Tika rules + return super.getContentHandler(targetMimeType, output); } /** - * Dumps the text from the sheet to the stream in CSV format + * A wrapper around the normal Tika BodyContentHandler, + * which causes things to be CSV encoded rather than + * tab separated + * TODO Get rid of the extra tabs that crop up */ - private void writeSheet(OutputStream os, HSSFSheet sheet, String encoding) throws Exception - { - int rows = sheet.getLastRowNum(); - // transform each row - for (int i = 0; i <= rows; i++) - { - HSSFRow row = sheet.getRow(i); - if (row != null) - { - writeRow(os, row, encoding); - } - // break between rows - if (i < rows) - { - PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false); - } - } - } - - private void writeRow(OutputStream os, HSSFRow row, String encoding) throws Exception - { - short firstCellNum = row.getFirstCellNum(); - short lastCellNum = row.getLastCellNum(); - // pad out to first cell - for (int i = 0; i < firstCellNum; i++) - { - PoiHssfContentTransformer.writeString(os, encoding, ",", false); // CSV up to first cell - } - // write each cell - for (int i = 0; i <= lastCellNum; i++) - { - HSSFCell cell = row.getCell(i); - if (cell != null) - { - int cellType = cell.getCellType(); + protected static class CsvContentHandler extends BodyContentHandler { + private static final char[] comma = new char[]{ ',' }; + private static final Pattern all_nums = Pattern.compile("[\\d\\.\\-\\+]+"); + + private boolean inCell = false; + private boolean needsComma = false; + + protected CsvContentHandler(Writer output) { + super(output); + } - StringBuilder sb = new StringBuilder(10); - switch (cellType) - { - case HSSFCell.CELL_TYPE_BLANK: - // ignore - break; - case HSSFCell.CELL_TYPE_BOOLEAN: - sb.append(cell.getBooleanCellValue()); - break; - case HSSFCell.CELL_TYPE_ERROR: - sb.append("ERROR"); - break; - case HSSFCell.CELL_TYPE_NUMERIC: - sb.append(cell.getNumericCellValue()); - break; - case HSSFCell.CELL_TYPE_STRING: - sb.append(cell.getStringCellValue()); - break; - case HSSFCell.CELL_TYPE_FORMULA: - final int formulaResultType = cell.getCachedFormulaResultType(); - if (HSSFCell.CELL_TYPE_NUMERIC == formulaResultType) - { - sb.append(cell.getNumericCellValue()); - } - else if (HSSFCell.CELL_TYPE_STRING == formulaResultType) - { - sb.append(cell.getStringCellValue()); - } - else if (HSSFCell.CELL_TYPE_BOOLEAN == 
formulaResultType) - { - sb.append(cell.getBooleanCellValue()); - } - else if (HSSFCell.CELL_TYPE_ERROR == formulaResultType) - { - sb.append(cell.getErrorCellValue()); - } - else - { - throw new RuntimeException("Unknown formula result type: " + formulaResultType); - } - break; - default: - throw new RuntimeException("Unknown HSSF cell type: " + cell); - } - String data = sb.toString(); - PoiHssfContentTransformer.writeString(os, encoding, data, true); - } - // comma separate if required - if (i < lastCellNum) + @Override + public void characters(char[] ch, int start, int length) + throws SAXException { + if(inCell) { + StringBuffer t = new StringBuffer(new String(ch,start,length)); + + // Quote if not all numbers + if(all_nums.matcher(t).matches()) { - PoiHssfContentTransformer.writeString(os, encoding, ",", false); + super.characters(ch, start, length); } - } - } - - /** - * Writes the given data to the stream using the encoding specified. If the encoding - * is not given, the default String to byte[] conversion will be - * used. - *
<p>
- * The given data string will be escaped appropriately. - * - * @param os the stream to write to - * @param encoding the encoding to use, or null if the default encoding is acceptable - * @param value the string to write - * @param isData true if the value represents a human-readable string, false if the - * value represents formatting characters, separating characters, etc. - * @throws Exception - */ - public static void writeString(OutputStream os, String encoding, String value, boolean isData) throws Exception - { - if (value == null) - { - // nothing to do - return; - } - int dataLength = value.length(); - if (dataLength == 0) - { - // nothing to do - return; - } - - // escape the string - StringBuilder sb = new StringBuilder(dataLength + 5); // slightly longer than the data - for (int i = 0; i < dataLength; i++) - { - char currentChar = value.charAt(i); - if (currentChar == '\"') // inverted commas + else { - sb.append("\""); // CSV escaping of inverted commas + for(int i=t.length()-1; i>=0; i--) { + if(t.charAt(i) == '\"') { + // Double up double quotes + t.insert(i, '\"'); + i--; + } + } + t.insert(0, '\"'); + t.append('\"'); + char[] c = t.toString().toCharArray(); + super.characters(c, 0, c.length); } - // append the char - sb.append(currentChar); - } - // enclose in inverted commas for safety - if (isData) - { - sb.insert(0, "\""); - sb.append("\""); - } - // escaping complete - value = sb.toString(); - - byte[] bytes = null; - if (encoding == null) - { - // use default encoding - bytes = value.getBytes(); - } - else - { - bytes = value.getBytes(encoding); - } - // write to the stream - os.write(bytes); - // done + } else { + super.characters(ch, start, length); + } + } + + @Override + public void startElement(String uri, String localName, String name, + Attributes atts) throws SAXException { + if(localName.equals("td")) { + localName = "span"; + name = "span"; + + inCell = true; + if(needsComma) { + super.characters(comma, 0, 1); + needsComma = true; + } + } + super.startElement(uri, localName, name, atts); + } + + @Override + public void endElement(String uri, String localName, String name) + throws SAXException { + if(localName.equals("td")) { + localName = "span"; + name = "span"; + + needsComma = true; + inCell = false; + } + if(localName.equals("tr")) { + needsComma = false; + } + super.endElement(uri, localName, name); + } } } diff --git a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java index aa694dad30..651c5c8bf3 100644 --- a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java +++ b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java @@ -22,8 +22,10 @@ import java.io.File; import java.io.InputStream; import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.repo.content.filestore.FileContentReader; import org.alfresco.repo.content.filestore.FileContentWriter; import org.alfresco.service.cmr.repository.ContentReader; +import org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.TransformationOptions; import org.alfresco.util.TempFileProvider; @@ -32,7 +34,7 @@ import org.alfresco.util.TempFileProvider; * * @author Derek Hulley */ -public class PoiHssfContentTransformerTest extends AbstractContentTransformerTest +public class PoiHssfContentTransformerTest extends TikaPoweredContentTransformerTest { private ContentTransformer transformer; @@ 
-56,12 +58,52 @@ public class PoiHssfContentTransformerTest extends AbstractContentTransformerTes { assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_EXCEL, new TransformationOptions())); assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_TEXT_CSV, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_HTML, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_XML, new TransformationOptions())); } - /** + public void testCsvOutput() throws Exception + { + File sourceFile = AbstractContentTransformerTest.loadQuickTestFile("xls"); + ContentReader sourceReader = new FileContentReader(sourceFile); + + File targetFile = TempFileProvider.createTempFile( + getClass().getSimpleName() + "_" + getName() + "_xls_", + ".csv"); + ContentWriter targetWriter = new FileContentWriter(targetFile); + + sourceReader.setMimetype(MimetypeMap.MIMETYPE_EXCEL); + targetWriter.setMimetype(MimetypeMap.MIMETYPE_TEXT_CSV); + transformer.transform(sourceReader, targetWriter); + + ContentReader targetReader = targetWriter.getReader(); + String checkContent = targetReader.getContentString(); + System.err.println(checkContent); + } + + @Override + protected void additionalContentCheck(String sourceMimetype, + String targetMimetype, String contents) { + if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_CSV)) { + System.err.println(contents); + } else { + super.additionalContentCheck(sourceMimetype, targetMimetype, contents); + } + } + + @Override + protected boolean isQuickPhraseExpected(String targetMimetype) { + if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_CSV)) { + return true; + } + return super.isQuickPhraseExpected(targetMimetype); + } + + /** * Tests a specific failure in the library */ - public void xtestBugFixAR114() throws Exception + public void xxtestBugFixAR114() throws Exception { File tempFile = TempFileProvider.createTempFile( getClass().getSimpleName() + "_" + getName() + "_", diff --git a/source/java/org/alfresco/repo/content/transform/TikaAutoContentTransformer.java b/source/java/org/alfresco/repo/content/transform/TikaAutoContentTransformer.java new file mode 100644 index 0000000000..d1a2623ea0 --- /dev/null +++ b/source/java/org/alfresco/repo/content/transform/TikaAutoContentTransformer.java @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2005-2010 Alfresco Software Limited. + * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see . 
+ */ +package org.alfresco.repo.content.transform; + +import java.util.ArrayList; + +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; + +/** + * A Content Extractor for XML, HTML and Text, + * which makes use of the Apache Tika + * auto-detection to select the best parser + * to process your document. + * This will be used for all files which Tika can + * handle, but where no other more explicit + * extractor is defined. + * + * @author Nick Burch + */ +public class TikaAutoContentTransformer extends TikaPoweredContentTransformer +{ + /** + * We support all the mimetypes that the Tika + * auto-detect parser can handle, except for + * Image, Audio and Video ones which don't + * make much sense + */ + public static ArrayList SUPPORTED_MIMETYPES; + static { + SUPPORTED_MIMETYPES = new ArrayList(); + AutoDetectParser p = new AutoDetectParser(); + for(MediaType mt : p.getParsers().keySet()) { + if(mt.toString().startsWith("application/vnd.oasis.opendocument.formula")) { + // TODO Tika support for quick.odf, mimetype=application/vnd.oasis.opendocument.formula + // TODO Tika support for quick.otf, mimetype=application/vnd.oasis.opendocument.formula-template + continue; + } + if(mt.toString().startsWith("application/vnd.oasis.opendocument.graphics")) { + // TODO Tika support for quick.odg, mimetype=application/vnd.oasis.opendocument.graphics + // TODO Tika support for quick.otg, mimetype=application/vnd.oasis.opendocument.graphics-template + continue; + } + + if(mt.getType().equals("image") || + mt.getType().equals("audio") || + mt.getType().equals("video") || + mt.toString().equals("application/zip") || + mt.toString().equals("application/tar")) + { + // Skip these, as Tika mostly just does + // metadata rather than content + } + else + { + + // Tika can probably do some useful text + SUPPORTED_MIMETYPES.add( mt.toString() ); + } + } + } + + public TikaAutoContentTransformer() + { + super(SUPPORTED_MIMETYPES); + } + + /** + * Returns the Tika Auto-Detection + * parser, which will try to + * process all documents that Tika + * knows about + */ + protected Parser getParser() + { + return new AutoDetectParser(); + } +} diff --git a/source/java/org/alfresco/repo/content/transform/TikaAutoContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/TikaAutoContentTransformerTest.java new file mode 100644 index 0000000000..72c5e098c1 --- /dev/null +++ b/source/java/org/alfresco/repo/content/transform/TikaAutoContentTransformerTest.java @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2005-2010 Alfresco Software Limited. + * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see . 
+ */ +package org.alfresco.repo.content.transform; + +import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.service.cmr.repository.TransformationOptions; + +/** + * Most of the work for testing the Tika Auto-Detect transformer + * is automatically done by {@link AbstractContentTransformerTest} + * + * @see org.alfresco.repo.content.transform.TikaAutoContentTransformer + * + * @author Nick Burch + */ +public class TikaAutoContentTransformerTest extends TikaPoweredContentTransformerTest +{ + private ContentTransformer transformer; + + @Override + public void setUp() throws Exception + { + super.setUp(); + + transformer = new TikaAutoContentTransformer(); + } + + /** + * @return Returns the same transformer regardless - it is allowed + */ + protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype) + { + return transformer; + } + + /** + * Ensure we picked up a mixture of content + * types from Tika + */ + public void testIsTransformable() throws Exception + { + // Excel (but this isn't normally used) + assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_EXCEL, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_HTML, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_XML, new TransformationOptions())); + + // Word + assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_WORD, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_HTML, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_XML, new TransformationOptions())); + + // PDF + assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_PDF, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_HTML, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_XML, new TransformationOptions())); + + // Open Office + assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OPENDOCUMENT_PRESENTATION, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENDOCUMENT_PRESENTATION, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENDOCUMENT_PRESENTATION, MimetypeMap.MIMETYPE_HTML, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENDOCUMENT_PRESENTATION, MimetypeMap.MIMETYPE_XML, new TransformationOptions())); + + // We don't do images + assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_IMAGE_JPEG, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); + assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_IMAGE_JPEG, MimetypeMap.MIMETYPE_HTML, new TransformationOptions())); + 
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_IMAGE_JPEG, MimetypeMap.MIMETYPE_XML, new TransformationOptions())); + // Ditto music + assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_MP3, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); + assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_MP3, MimetypeMap.MIMETYPE_HTML, new TransformationOptions())); + assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_MP3, MimetypeMap.MIMETYPE_XML, new TransformationOptions())); + } +} diff --git a/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformer.java b/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformer.java new file mode 100644 index 0000000000..2ef2c4992c --- /dev/null +++ b/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformer.java @@ -0,0 +1,192 @@ +/* + * Copyright (C) 2005-2010 Alfresco Software Limited. + * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see . + */ +package org.alfresco.repo.content.transform; + +import java.io.InputStream; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.util.Arrays; +import java.util.List; + +import javax.xml.transform.OutputKeys; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.sax.SAXTransformerFactory; +import javax.xml.transform.sax.TransformerHandler; +import javax.xml.transform.stream.StreamResult; + +import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.service.cmr.repository.ContentReader; +import org.alfresco.service.cmr.repository.ContentWriter; +import org.alfresco.service.cmr.repository.TransformationOptions; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; + +/** + * Provides helpful services for {@link org.alfresco.repo.content.transform.ContentTransformer} + * implementations which are powered by Apache Tika. + * + * To use Tika to transform some content into Text, Html or XML, create an + * implementation of this / use the Auto Detect transformer. + * + * For now, all transformers are registered as regular, rather than explicit + * transformations. This should allow you to register your own explicit + * transformers and have them nicely take priority. + * + * @author Nick Burch + */ +public abstract class TikaPoweredContentTransformer extends AbstractContentTransformer2 +{ + private static final Log logger = LogFactory.getLog(TikaPoweredContentTransformer.class); + protected List sourceMimeTypes; + + /** + * Windows carriage return line feed pair. 
+ */ + protected static final String LINE_BREAK = "\r\n"; + public static final String WRONG_FORMAT_MESSAGE_ID = "transform.err.format_or_password"; + + protected TikaPoweredContentTransformer(List sourceMimeTypes) { + this.sourceMimeTypes = sourceMimeTypes; + } + protected TikaPoweredContentTransformer(String[] sourceMimeTypes) { + this(Arrays.asList(sourceMimeTypes)); + } + + /** + * Returns the correct Tika Parser to process + * the document. + * If you don't know which you want, use + * {@link TikaAutoContentTransformer} which + * makes use of the Tika auto-detection. + */ + protected abstract Parser getParser(); + + /** + * Can we do the requested transformation via Tika? + * We support transforming to HTML, XML or Text + */ + public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options) + { + if(! sourceMimeTypes.contains(sourceMimetype)) + { + // The source isn't one of ours + return false; + } + + if(MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype) || + MimetypeMap.MIMETYPE_HTML.equals(targetMimetype) || + MimetypeMap.MIMETYPE_XML.equals(targetMimetype)) + { + // We can output to this + return true; + } + else + { + // We support the source, but not the target + return false; + } + } + + /** + * Returns an appropriate Tika ContentHandler for the + * requested content type. Normally you'll let this + * work as default, but if you need fine-grained + * control of how the Tika events become text then + * override and supply your own. + */ + protected ContentHandler getContentHandler(String targetMimeType, Writer output) + throws TransformerConfigurationException + { + if(MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimeType)) + { + return new BodyContentHandler(output); + } + + SAXTransformerFactory factory = (SAXTransformerFactory) + SAXTransformerFactory.newInstance(); + TransformerHandler handler = factory.newTransformerHandler(); + handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes"); + handler.setResult(new StreamResult(output)); + + if(MimetypeMap.MIMETYPE_HTML.equals(targetMimeType)) + { + handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html"); + } + else if(MimetypeMap.MIMETYPE_XML.equals(targetMimeType)) + { + handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); + } + else + { + throw new TransformerInfoException( + WRONG_FORMAT_MESSAGE_ID, + new IllegalArgumentException("Requested target type " + targetMimeType + " not supported") + ); + } + return handler; + } + + public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options) + throws Exception + { + InputStream is = reader.getContentInputStream(); + OutputStream os = writer.getContentOutputStream(); + String encoding = writer.getEncoding(); + String targetMimeType = writer.getMimetype(); + + Writer ow = new OutputStreamWriter(os, encoding); + + Parser parser = getParser(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + + ContentHandler handler = getContentHandler(targetMimeType, ow); + if(handler == null) + { + throw new TransformerConfigurationException( + "Unable to create Tika Handler for configured output " + targetMimeType + ); + } + + try { + parser.parse(is, handler, metadata, context); + } + finally + { + if (is != null) + { + try { is.close(); } catch (Throwable e) {} + } + if (ow != null) + { + try { ow.close(); } catch (Throwable e) {} + } + if (os != null) + { + try { os.close(); } catch (Throwable e) {} + } + } + } +} diff --git 
a/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformerTest.java
new file mode 100644
index 0000000000..9751807049
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformerTest.java
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.transform;
+
+import org.alfresco.repo.content.MimetypeMap;
+
+/**
+ * Parent test for Tika powered transformer tests
+ *
+ * @author Nick Burch
+ */
+public abstract class TikaPoweredContentTransformerTest extends AbstractContentTransformerTest
+{
+    protected boolean isQuickPhraseExpected(String targetMimetype)
+    {
+        return (
+            targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_PLAIN) ||
+            targetMimetype.equals(MimetypeMap.MIMETYPE_HTML) ||
+            targetMimetype.equals(MimetypeMap.MIMETYPE_XML)
+        );
+    }
+    protected boolean isQuickWordsExpected(String targetMimetype)
+    {
+        return (
+            targetMimetype.startsWith(StringExtractingContentTransformer.PREFIX_TEXT) ||
+            targetMimetype.equals(MimetypeMap.MIMETYPE_HTML) ||
+            targetMimetype.equals(MimetypeMap.MIMETYPE_XML)
+        );
+    }
+
+    /**
+     * Tests for html vs xml vs plain text
+     */
+    protected void additionalContentCheck(String sourceMimetype, String targetMimetype, String contents)
+    {
+        if(targetMimetype.equals(MimetypeMap.MIMETYPE_XML))
+        {
+            // Look for header and footer to confirm it was translated
+            assertTrue(
+                "XML header not found",
+                contents.contains("<?xml")
+            );
+        }
+        else if(targetMimetype.equals(MimetypeMap.MIMETYPE_HTML))
+        {
+            // Look for header and footer to confirm it was translated
+            assertFalse(
+                "XML header found but shouldn't be there for HTML",
+                contents.contains("<?xml")
+            );
+        }
+        else if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_PLAIN))
+        {
+            // Ensure it really is plain text not xml/html
+            assertFalse(
+                "XML header found but shouldn't be there for Plain Text",
+                contents.contains("<?xml")
+            );
+        }
+    }
+}
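
A note for readers new to Tika: the pattern that TikaPoweredContentTransformer.transformInternal wires together is the standard Tika idiom, in which a Parser fires SAX events at a ContentHandler that writes characters to the target Writer. The following standalone sketch shows that idiom in isolation; the class name and the command-line argument are illustrative and not part of the patch.

    import java.io.FileInputStream;
    import java.io.InputStream;
    import java.io.StringWriter;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.AutoDetectParser;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.Parser;
    import org.apache.tika.sax.BodyContentHandler;
    import org.xml.sax.ContentHandler;

    public class TikaTextSketch
    {
        public static void main(String[] args) throws Exception
        {
            // Pick a parser by content detection, as TikaAutoContentTransformer does;
            // a subclass can return a specific parser instead (e.g. OfficeParser for .xls)
            Parser parser = new AutoDetectParser();

            // BodyContentHandler discards the XHTML structure and keeps only the text
            StringWriter text = new StringWriter();
            ContentHandler handler = new BodyContentHandler(text);

            InputStream is = new FileInputStream(args[0]);
            try
            {
                parser.parse(is, handler, new Metadata(), new ParseContext());
            }
            finally
            {
                is.close();
            }
            System.out.println(text);
        }
    }

Swapping the BodyContentHandler for the serialising handler built in getContentHandler is all it takes to get HTML or XML output instead of plain text.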
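
The CSV special case in CsvContentHandler comes down to one quoting rule: purely numeric cells are written as-is, every other cell has embedded double quotes doubled and is wrapped in double quotes, and commas are emitted between cells. A minimal worked example of that rule, as a sketch rather than code from the patch:

    public class CsvQuoteSketch
    {
        // Quote one cell the way the patch's CsvContentHandler does: all-numeric
        // values pass through, anything else gets embedded quotes doubled and is
        // wrapped in double quotes.
        static String csvCell(String value)
        {
            if (value.matches("[\\d\\.\\-\\+]+"))
            {
                return value;
            }
            return "\"" + value.replace("\"", "\"\"") + "\"";
        }

        public static void main(String[] args)
        {
            System.out.println(csvCell("3.14"));                 // 3.14
            System.out.println(csvCell("quick \"brown\" fox"));  // "quick ""brown"" fox"
        }
    }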
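
Finally, the new unit tests double as usage documentation. Below is a condensed sketch of the calling pattern from PoiHssfContentTransformerTest.testCsvOutput, for trying the CSV output by hand. The file paths are illustrative, and in the repository the transformer would normally be obtained from the Spring context rather than constructed directly; the tests rely on the harness to finish setting it up.

    import java.io.File;

    import org.alfresco.repo.content.MimetypeMap;
    import org.alfresco.repo.content.filestore.FileContentReader;
    import org.alfresco.repo.content.filestore.FileContentWriter;
    import org.alfresco.repo.content.transform.ContentTransformer;
    import org.alfresco.repo.content.transform.PoiHssfContentTransformer;
    import org.alfresco.service.cmr.repository.ContentReader;
    import org.alfresco.service.cmr.repository.ContentWriter;

    public class XlsToCsvSketch
    {
        public static void main(String[] args) throws Exception
        {
            // Constructed directly here for brevity; in Alfresco this bean is wired
            // up by content-services-context.xml and found via the transformer registry
            ContentTransformer transformer = new PoiHssfContentTransformer();

            ContentReader reader = new FileContentReader(new File("quick.xls"));
            reader.setMimetype(MimetypeMap.MIMETYPE_EXCEL);

            ContentWriter writer = new FileContentWriter(new File("quick.csv"));
            writer.setMimetype(MimetypeMap.MIMETYPE_TEXT_CSV);
            writer.setEncoding("UTF-8");

            transformer.transform(reader, writer);

            // Read the result back, as the test does
            System.out.println(writer.getReader().getContentString());
        }
    }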