diff --git a/config/alfresco/content-services-context.xml b/config/alfresco/content-services-context.xml
index 21b5414923..06cdb84395 100644
--- a/config/alfresco/content-services-context.xml
+++ b/config/alfresco/content-services-context.xml
@@ -363,6 +363,10 @@
+
+
diff --git a/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java b/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java
index bdcfce573d..30f44844e2 100644
--- a/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java
+++ b/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java
@@ -28,6 +28,7 @@ import org.alfresco.repo.content.metadata.OpenOfficeMetadataExtracterTest;
import org.alfresco.repo.content.metadata.PdfBoxMetadataExtracterTest;
import org.alfresco.repo.content.metadata.PoiMetadataExtracterTest;
import org.alfresco.repo.content.metadata.RFC822MetadataExtracterTest;
+import org.alfresco.repo.content.metadata.TikaAutoMetadataExtracterTest;
import org.alfresco.repo.content.transform.BinaryPassThroughContentTransformerTest;
import org.alfresco.repo.content.transform.ComplexContentTransformerTest;
import org.alfresco.repo.content.transform.ContentTransformerRegistryTest;
@@ -41,6 +42,7 @@ import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTe
import org.alfresco.repo.content.transform.StringExtractingContentTransformerTest;
import org.alfresco.repo.content.transform.TextMiningContentTransformerTest;
import org.alfresco.repo.content.transform.TextToPdfContentTransformerTest;
+import org.alfresco.repo.content.transform.TikaAutoContentTransformerTest;
import org.alfresco.repo.content.transform.magick.ImageMagickContentTransformerTest;
import org.alfresco.util.ApplicationContextHelper;
import org.springframework.context.ApplicationContext;
@@ -91,6 +93,7 @@ public class ContentMinimalContextTestSuite extends TestSuite
suite.addTestSuite( PdfBoxMetadataExtracterTest.class );
suite.addTestSuite( PoiMetadataExtracterTest.class );
suite.addTestSuite( RFC822MetadataExtracterTest.class );
+ suite.addTestSuite( TikaAutoMetadataExtracterTest.class );
// Transform tests
suite.addTestSuite(BinaryPassThroughContentTransformerTest.class);
@@ -106,6 +109,7 @@ public class ContentMinimalContextTestSuite extends TestSuite
suite.addTestSuite(StringExtractingContentTransformerTest.class);
suite.addTestSuite(TextMiningContentTransformerTest.class);
suite.addTestSuite(TextToPdfContentTransformerTest.class);
+ suite.addTestSuite(TikaAutoContentTransformerTest.class);
suite.addTestSuite(ImageMagickContentTransformerTest.class);
return suite;
diff --git a/source/java/org/alfresco/repo/content/MimetypeMap.java b/source/java/org/alfresco/repo/content/MimetypeMap.java
index d5490136f9..947a3201df 100644
--- a/source/java/org/alfresco/repo/content/MimetypeMap.java
+++ b/source/java/org/alfresco/repo/content/MimetypeMap.java
@@ -52,6 +52,7 @@ public class MimetypeMap implements MimetypeService
public static final String MIMETYPE_TEXT_PLAIN = "text/plain";
public static final String MIMETYPE_TEXT_MEDIAWIKI = "text/mediawiki";
public static final String MIMETYPE_TEXT_CSS = "text/css";
+ public static final String MIMETYPE_TEXT_CSV = "text/csv";
public static final String MIMETYPE_TEXT_JAVASCRIPT = "text/javascript";
public static final String MIMETYPE_XML = "text/xml";
public static final String MIMETYPE_HTML = "text/html";
diff --git a/source/java/org/alfresco/repo/content/transform/AbstractContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/AbstractContentTransformerTest.java
index 5e7dad0c6b..c6c02c3f99 100644
--- a/source/java/org/alfresco/repo/content/transform/AbstractContentTransformerTest.java
+++ b/source/java/org/alfresco/repo/content/transform/AbstractContentTransformerTest.java
@@ -238,6 +238,9 @@ public abstract class AbstractContentTransformerTest extends TestCase
" source: " + sourceReader + "\n" +
" target: " + targetWriter,
checkContent.contains(QUICK_CONTENT));
+
+ // Let subclasses do extra checks if they want
+ additionalContentCheck(sourceMimetype, targetMimetype, checkContent);
}
else if (isQuickWordsExpected(targetMimetype))
{
@@ -280,6 +283,13 @@ public abstract class AbstractContentTransformerTest extends TestCase
outputWriter.setEncoding("UTF8");
outputWriter.putContent(sb.toString());
}
+
+ /**
+ * Allows implementations to do some extra checks on the
+ * results of the content as found by
+ * {@link #testAllConversions()}
+ * <p>
+ * Invoked after the quick-phrase assertion has passed for a
+ * conversion. This default implementation does nothing.
+ *
+ * @param sourceMimetype the mimetype that was converted from
+ * @param targetMimetype the mimetype that was converted to
+ * @param contents the converted content, as a string
+ */
+ protected void additionalContentCheck(String sourceMimetype, String targetMimetype, String contents) {}
/**
* This method is an extension point for enabling/disabling an assertion that the "quick brown fox"
diff --git a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java
index c5bc7f959f..5bc453e5c0 100644
--- a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java
@@ -18,267 +18,164 @@
*/
package org.alfresco.repo.content.transform;
-import java.io.InputStream;
-import java.io.OutputStream;
+import java.io.Writer;
+import java.util.regex.Pattern;
+
+import javax.xml.transform.TransformerConfigurationException;
import org.alfresco.repo.content.MimetypeMap;
-import org.alfresco.service.cmr.repository.ContentReader;
-import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-import org.apache.poi.hssf.usermodel.HSSFCell;
-import org.apache.poi.hssf.usermodel.HSSFRow;
-import org.apache.poi.hssf.usermodel.HSSFSheet;
-import org.apache.poi.hssf.usermodel.HSSFWorkbook;
-import org.apache.poi.util.RecordFormatException;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
- * Makes use of the {@link http://jakarta.apache.org/poi/ POI} library to
- * perform conversions from Excel spreadsheets to text (comma separated).
- *
- * While most text extraction from spreadsheets only extract the first sheet of
- * the workbook, the method used here extracts the text from all the sheets.
- * This is more useful, especially when it comes to indexing spreadsheets.
- *
- * In the case where there is only one sheet in the document, the results will be
- * exactly the same as most extractors. Where there are multiple sheets, the results
- * will differ, but meaningful reimporting of the text document is not possible
- * anyway.
+ * Uses {@link http://tika.apache.org/ Apache Tika} and
+ * {@link http://poi.apache.org/ Apache POI} to perform
+ * conversions from Excel spreadsheets.
+ *
+ * <p>Will transform from Excel spreadsheets into Html,
+ * Xml or Text (space or comma separated)
+ *
+ * <p>Handles all sheets in the file.
*
+ * TODO CSV Support
+ *
+ * @author Nick Burch
* @author Derek Hulley
*/
-public class PoiHssfContentTransformer extends AbstractContentTransformer2
+public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
{
/**
* Error message to delegate to NodeInfoBean
*/
public static final String WRONG_FORMAT_MESSAGE_ID = "transform.err.format_or_password";
-
- /**
- * Windows carriage return line feed pair.
- */
- private static final String LINE_BREAK = "\r\n";
private static Log logger = LogFactory.getLog(PoiHssfContentTransformer.class);
+ /**
+ * Hands {@link MimetypeMap#MIMETYPE_EXCEL} to the superclass as the
+ * single entry of its mimetype array.
+ * NOTE(review): presumably this is the list of supported source
+ * types consulted by isTransformable - confirm in the superclass.
+ */
+ public PoiHssfContentTransformer()
+ {
+ super(new String[] {
+ MimetypeMap.MIMETYPE_EXCEL
+ });
+ }
+
+ /**
+ * Supplies the Tika parser for this transformer.
+ *
+ * @return a new {@link OfficeParser} on every call
+ */
+ @Override
+ protected Parser getParser()
+ {
+ return new OfficeParser();
+ }
+
/**
- * Currently the only transformation performed is that of text extraction from XLS documents.
+ * Reports whether the requested conversion can be performed.
+ * A CSV target from one of our source mimetypes is accepted
+ * here; every other request is decided by the superclass.
*/
+ @Override
public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options)
{
- if (!MimetypeMap.MIMETYPE_EXCEL.equals(sourceMimetype) ||
- !MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
- {
- // only support XLS -> Text
- return false;
- }
- else
- {
- return true;
- }
+ // CSV is the one target that is accepted at this level
+ final boolean csvFromKnownSource =
+ sourceMimeTypes.contains(sourceMimetype) &&
+ MimetypeMap.MIMETYPE_TEXT_CSV.equals(targetMimetype);
+
+ // Anything else is left to the default Tika rules
+ return csvFromKnownSource ||
+ super.isTransformable(sourceMimetype, targetMimetype, options);
}
-
- public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options)
- throws Exception
+
+ /**
+ * Chooses the SAX handler that receives the parser output.
+ * A CSV target gets a {@link CsvContentHandler} around the
+ * supplied writer; any other target uses the handler picked
+ * by the superclass.
+ */
+ @Override
+ protected ContentHandler getContentHandler(String targetMimeType, Writer output)
+ throws TransformerConfigurationException
{
- InputStream is = reader.getContentInputStream();
- OutputStream os = writer.getContentOutputStream();
- String encoding = writer.getEncoding();
- try
- {
- // open the workbook
- HSSFWorkbook workbook = new HSSFWorkbook(is);
- // how many sheets are there?
- int sheetCount = workbook.getNumberOfSheets();
- // transform each sheet
- for (int i = 0; i < sheetCount; i++)
- {
- HSSFSheet sheet = workbook.getSheetAt(i);
- String sheetName = workbook.getSheetName(i);
- writeSheet(os, sheet, encoding);
- // write the sheet name
- PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
- PoiHssfContentTransformer.writeString(os, encoding, "End of sheet: " + sheetName, true);
- PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
- PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
- }
- }
- catch (RecordFormatException ex)
- {
- // Catching specific exception to propagate it to NodeInfoBean
- // to fix issue https://issues.alfresco.com/jira/browse/ETWOTWO-440
-
- logger.error(ex);
- throw new TransformerInfoException(WRONG_FORMAT_MESSAGE_ID, ex);
- }
- finally
- {
- if (is != null)
- {
- try { is.close(); } catch (Throwable e) {}
- }
- if (os != null)
- {
- try { os.close(); } catch (Throwable e) {}
- }
- }
+ if(!MimetypeMap.MIMETYPE_TEXT_CSV.equals(targetMimeType))
+ {
+ // Not CSV, so the normal Tika rules apply
+ return super.getContentHandler(targetMimeType, output);
+ }
+
+ return new CsvContentHandler(output);
}
/**
- * Dumps the text from the sheet to the stream in CSV format
+ * A wrapper around the normal Tika BodyContentHandler,
+ * which causes things to be CSV encoded rather than
+ * tab separated
+ * TODO Get rid of the extra tabs that crop up
*/
- private void writeSheet(OutputStream os, HSSFSheet sheet, String encoding) throws Exception
- {
- int rows = sheet.getLastRowNum();
- // transform each row
- for (int i = 0; i <= rows; i++)
- {
- HSSFRow row = sheet.getRow(i);
- if (row != null)
- {
- writeRow(os, row, encoding);
- }
- // break between rows
- if (i < rows)
- {
- PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
- }
- }
- }
-
- private void writeRow(OutputStream os, HSSFRow row, String encoding) throws Exception
- {
- short firstCellNum = row.getFirstCellNum();
- short lastCellNum = row.getLastCellNum();
- // pad out to first cell
- for (int i = 0; i < firstCellNum; i++)
- {
- PoiHssfContentTransformer.writeString(os, encoding, ",", false); // CSV up to first cell
- }
- // write each cell
- for (int i = 0; i <= lastCellNum; i++)
- {
- HSSFCell cell = row.getCell(i);
- if (cell != null)
- {
- int cellType = cell.getCellType();
+ protected static class CsvContentHandler extends BodyContentHandler {
+ private static final char[] comma = new char[]{ ',' };
+ private static final Pattern all_nums = Pattern.compile("[\\d\\.\\-\\+]+");
+
+ private boolean inCell = false;
+ private boolean needsComma = false;
+
+ /**
+ * @param output passed straight on to the
+ * {@link BodyContentHandler} constructor
+ */
+ protected CsvContentHandler(Writer output) {
+ super(output);
+ }
- StringBuilder sb = new StringBuilder(10);
- switch (cellType)
- {
- case HSSFCell.CELL_TYPE_BLANK:
- // ignore
- break;
- case HSSFCell.CELL_TYPE_BOOLEAN:
- sb.append(cell.getBooleanCellValue());
- break;
- case HSSFCell.CELL_TYPE_ERROR:
- sb.append("ERROR");
- break;
- case HSSFCell.CELL_TYPE_NUMERIC:
- sb.append(cell.getNumericCellValue());
- break;
- case HSSFCell.CELL_TYPE_STRING:
- sb.append(cell.getStringCellValue());
- break;
- case HSSFCell.CELL_TYPE_FORMULA:
- final int formulaResultType = cell.getCachedFormulaResultType();
- if (HSSFCell.CELL_TYPE_NUMERIC == formulaResultType)
- {
- sb.append(cell.getNumericCellValue());
- }
- else if (HSSFCell.CELL_TYPE_STRING == formulaResultType)
- {
- sb.append(cell.getStringCellValue());
- }
- else if (HSSFCell.CELL_TYPE_BOOLEAN == formulaResultType)
- {
- sb.append(cell.getBooleanCellValue());
- }
- else if (HSSFCell.CELL_TYPE_ERROR == formulaResultType)
- {
- sb.append(cell.getErrorCellValue());
- }
- else
- {
- throw new RuntimeException("Unknown formula result type: " + formulaResultType);
- }
- break;
- default:
- throw new RuntimeException("Unknown HSSF cell type: " + cell);
- }
- String data = sb.toString();
- PoiHssfContentTransformer.writeString(os, encoding, data, true);
- }
- // comma separate if required
- if (i < lastCellNum)
+ /**
+ * Writes text as a CSV field while inside a table cell.
+ * Text made up only of digits, '.', '-' and '+' is passed
+ * through as-is. Anything else has each embedded double
+ * quote doubled and is then wrapped in double quotes.
+ * Text outside a cell is passed through unchanged.
+ * NOTE(review): every call is quoted on its own, so a cell
+ * delivered by the parser in several chunks would come out
+ * as several quoted pieces - confirm whether that can occur.
+ */
+ @Override
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ if(inCell) {
+ StringBuilder t = new StringBuilder(new String(ch,start,length));
+
+ // Quote if not all numbers
+ if(all_nums.matcher(t).matches())
{
- PoiHssfContentTransformer.writeString(os, encoding, ",", false);
+ super.characters(ch, start, length);
}
- }
- }
-
- /**
- * Writes the given data to the stream using the encoding specified. If the encoding
- * is not given, the default String to byte[] conversion will be
- * used.
- *
- * The given data string will be escaped appropriately.
- *
- * @param os the stream to write to
- * @param encoding the encoding to use, or null if the default encoding is acceptable
- * @param value the string to write
- * @param isData true if the value represents a human-readable string, false if the
- * value represents formatting characters, separating characters, etc.
- * @throws Exception
- */
- public static void writeString(OutputStream os, String encoding, String value, boolean isData) throws Exception
- {
- if (value == null)
- {
- // nothing to do
- return;
- }
- int dataLength = value.length();
- if (dataLength == 0)
- {
- // nothing to do
- return;
- }
-
- // escape the string
- StringBuilder sb = new StringBuilder(dataLength + 5); // slightly longer than the data
- for (int i = 0; i < dataLength; i++)
- {
- char currentChar = value.charAt(i);
- if (currentChar == '\"') // inverted commas
else
{
- sb.append("\""); // CSV escaping of inverted commas
+ for(int i=t.length()-1; i>=0; i--) {
+ if(t.charAt(i) == '\"') {
+ // Double up double quotes. Inserting at i leaves index i-1
+ // untouched, so the loop decrement alone visits every char
+ t.insert(i, '\"');
+ }
+ }
+ t.insert(0, '\"');
+ t.append('\"');
+ char[] c = t.toString().toCharArray();
+ super.characters(c, 0, c.length);
}
- // append the char
- sb.append(currentChar);
- }
- // enclose in inverted commas for safety
- if (isData)
- {
- sb.insert(0, "\"");
- sb.append("\"");
- }
- // escaping complete
- value = sb.toString();
-
- byte[] bytes = null;
- if (encoding == null)
- {
- // use default encoding
- bytes = value.getBytes();
- }
- else
- {
- bytes = value.getBytes(encoding);
- }
- // write to the stream
- os.write(bytes);
- // done
+ } else {
+ super.characters(ch, start, length);
+ }
+ }
+
+        /**
+         * Passes element starts on to the wrapped handler, reporting
+         * every "td" as a "span" instead. Starting a cell also marks
+         * us as being inside one, and writes the separating comma if
+         * an earlier cell in the same row has already been closed.
+         */
+        @Override
+        public void startElement(String uri, String localName, String name,
+                Attributes atts) throws SAXException {
+            if(localName.equals("td")) {
+                localName = "span";
+                name = "span";
+
+                inCell = true;
+                if(needsComma) {
+                    super.characters(comma, 0, 1);
+                    // Separator is written; the flag is raised again
+                    // when this cell is closed in endElement
+                    needsComma = false;
+                }
+            }
+            super.startElement(uri, localName, name, atts);
+        }
+
+        /**
+         * Passes element ends on to the wrapped handler, reporting
+         * every "td" as a "span" instead. Closing a cell means the
+         * next cell needs a leading comma; closing a row ("tr")
+         * cancels that again.
+         */
+        @Override
+        public void endElement(String uri, String localName, String name)
+                throws SAXException {
+            String reportedLocalName = localName;
+            String reportedName = name;
+
+            if(localName.equals("td")) {
+                // Cell finished: mirror the renaming done on the way in
+                reportedLocalName = "span";
+                reportedName = "span";
+                needsComma = true;
+                inCell = false;
+            } else if(localName.equals("tr")) {
+                // Row finished: the next row starts without a comma
+                needsComma = false;
+            }
+
+            super.endElement(uri, reportedLocalName, reportedName);
+        }
}
}
diff --git a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java
index aa694dad30..651c5c8bf3 100644
--- a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java
+++ b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java
@@ -22,8 +22,10 @@ import java.io.File;
import java.io.InputStream;
import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.repo.content.filestore.FileContentReader;
import org.alfresco.repo.content.filestore.FileContentWriter;
import org.alfresco.service.cmr.repository.ContentReader;
+import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.alfresco.util.TempFileProvider;
@@ -32,7 +34,7 @@ import org.alfresco.util.TempFileProvider;
*
* @author Derek Hulley
*/
-public class PoiHssfContentTransformerTest extends AbstractContentTransformerTest
+public class PoiHssfContentTransformerTest extends TikaPoweredContentTransformerTest
{
private ContentTransformer transformer;
@@ -56,12 +58,52 @@ public class PoiHssfContentTransformerTest extends AbstractContentTransformerTes
{
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_EXCEL, new TransformationOptions()));
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
+ assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_TEXT_CSV, new TransformationOptions()));
+ assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
+ assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
}
- /**
+    /**
+     * Transforms the quick Excel test file to CSV and checks that
+     * some output was produced. The output is also echoed to
+     * stderr so that it can be inspected by hand.
+     */
+    public void testCsvOutput() throws Exception
+    {
+       File sourceFile = AbstractContentTransformerTest.loadQuickTestFile("xls");
+       ContentReader sourceReader = new FileContentReader(sourceFile);
+
+       File targetFile = TempFileProvider.createTempFile(
+             getClass().getSimpleName() + "_" + getName() + "_xls_",
+             ".csv");
+       ContentWriter targetWriter = new FileContentWriter(targetFile);
+
+       sourceReader.setMimetype(MimetypeMap.MIMETYPE_EXCEL);
+       targetWriter.setMimetype(MimetypeMap.MIMETYPE_TEXT_CSV);
+       transformer.transform(sourceReader, targetWriter);
+
+       ContentReader targetReader = targetWriter.getReader();
+       String checkContent = targetReader.getContentString();
+       System.err.println(checkContent);
+
+       // Without these the test could never fail on bad output
+       assertNotNull("No CSV content was produced", checkContent);
+       assertTrue("CSV content was empty", checkContent.length() > 0);
+    }
+
+    /**
+     * Echoes CSV results to stderr; every other target type is
+     * handed to the inherited check.
+     */
+    @Override
+    protected void additionalContentCheck(String sourceMimetype,
+            String targetMimetype, String contents) {
+        if(!targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_CSV)) {
+            super.additionalContentCheck(sourceMimetype, targetMimetype, contents);
+            return;
+        }
+
+        System.err.println(contents);
+    }
+
+    /**
+     * The quick phrase is expected in CSV output; for any other
+     * target the inherited answer is used.
+     */
+    @Override
+    protected boolean isQuickPhraseExpected(String targetMimetype) {
+        return targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_CSV)
+                || super.isQuickPhraseExpected(targetMimetype);
+    }
+
+ /**
* Tests a specific failure in the library
*/
- public void xtestBugFixAR114() throws Exception
+ public void xxtestBugFixAR114() throws Exception
{
File tempFile = TempFileProvider.createTempFile(
getClass().getSimpleName() + "_" + getName() + "_",
diff --git a/source/java/org/alfresco/repo/content/transform/TikaAutoContentTransformer.java b/source/java/org/alfresco/repo/content/transform/TikaAutoContentTransformer.java
new file mode 100644
index 0000000000..d1a2623ea0
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/transform/TikaAutoContentTransformer.java
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.transform;
+
+import java.util.ArrayList;
+
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+
+/**
+ * A Content Extractor for XML, HTML and Text,
+ * which makes use of the Apache Tika
+ * auto-detection to select the best parser
+ * to process your document.
+ * This will be used for all files which Tika can
+ * handle, but where no other more explicit
+ * extractor is defined.
+ *
+ * @author Nick Burch
+ */
+public class TikaAutoContentTransformer extends TikaPoweredContentTransformer
+{
+    /**
+     * The mimetypes reported by the Tika auto-detect parser,
+     *  filled in once when this class is loaded. Left out are
+     *  Image, Audio and Video types, zip and tar archives, and
+     *  the OpenDocument formula and graphics types (see the
+     *  TODOs below).
+     */
+    public static ArrayList<String> SUPPORTED_MIMETYPES;
+    static {
+       SUPPORTED_MIMETYPES = new ArrayList<String>();
+       AutoDetectParser p = new AutoDetectParser();
+       for(MediaType mt : p.getParsers().keySet()) {
+          String mimetype = mt.toString();
+
+          if(mimetype.startsWith("application/vnd.oasis.opendocument.formula")) {
+             // TODO Tika support for quick.odf, mimetype=application/vnd.oasis.opendocument.formula
+             // TODO Tika support for quick.otf, mimetype=application/vnd.oasis.opendocument.formula-template
+             continue;
+          }
+          if(mimetype.startsWith("application/vnd.oasis.opendocument.graphics")) {
+             // TODO Tika support for quick.odg, mimetype=application/vnd.oasis.opendocument.graphics
+             // TODO Tika support for quick.otg, mimetype=application/vnd.oasis.opendocument.graphics-template
+             continue;
+          }
+
+          if(mt.getType().equals("image") ||
+             mt.getType().equals("audio") ||
+             mt.getType().equals("video") ||
+             mimetype.equals("application/zip") ||
+             mimetype.equals("application/tar"))
+          {
+             // Skip these, as Tika mostly just does
+             //  metadata rather than content
+             continue;
+          }
+
+          // Tika can probably do some useful text
+          SUPPORTED_MIMETYPES.add( mimetype );
+       }
+    }
+
+    /**
+     * Passes {@link #SUPPORTED_MIMETYPES} to the superclass.
+     */
+    public TikaAutoContentTransformer()
+    {
+       super(SUPPORTED_MIMETYPES);
+    }
+
+    /**
+     * Returns the Tika Auto-Detection
+     *  parser, which will try to
+     *  process all documents that Tika
+     *  knows about
+     *
+     * @return a new {@link AutoDetectParser} on every call
+     */
+    @Override
+    protected Parser getParser()
+    {
+       return new AutoDetectParser();
+    }
+}
diff --git a/source/java/org/alfresco/repo/content/transform/TikaAutoContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/TikaAutoContentTransformerTest.java
new file mode 100644
index 0000000000..72c5e098c1
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/transform/TikaAutoContentTransformerTest.java
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.transform;
+
+import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.service.cmr.repository.TransformationOptions;
+
+/**
+ * Most of the work for testing the Tika Auto-Detect transformer
+ * is automatically done by {@link AbstractContentTransformerTest}
+ *
+ * @see org.alfresco.repo.content.transform.TikaAutoContentTransformer
+ *
+ * @author Nick Burch
+ */
+public class TikaAutoContentTransformerTest extends TikaPoweredContentTransformerTest
+{
+    private ContentTransformer transformer;
+
+    @Override
+    public void setUp() throws Exception
+    {
+       super.setUp();
+
+       transformer = new TikaAutoContentTransformer();
+    }
+
+    /**
+     * @return Returns the one transformer under test, whatever
+     *  the mimetypes asked for
+     */
+    protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
+    {
+       return transformer;
+    }
+
+    /**
+     * Ensure we picked up a mixture of content
+     *  types from Tika
+     */
+    public void testIsTransformable() throws Exception
+    {
+       // Excel (but this isn't normally used)
+       assertDocumentTypeHandled(MimetypeMap.MIMETYPE_EXCEL);
+
+       // Word
+       assertDocumentTypeHandled(MimetypeMap.MIMETYPE_WORD);
+
+       // PDF
+       assertDocumentTypeHandled(MimetypeMap.MIMETYPE_PDF);
+
+       // Open Office
+       assertDocumentTypeHandled(MimetypeMap.MIMETYPE_OPENDOCUMENT_PRESENTATION);
+
+       // We don't do images
+       assertMediaTypeRejected(MimetypeMap.MIMETYPE_IMAGE_JPEG);
+       // Ditto music
+       assertMediaTypeRejected(MimetypeMap.MIMETYPE_MP3);
+    }
+
+    /**
+     * Checks that plain text cannot be turned into the given type,
+     *  but that the type can be turned into plain text, HTML and XML.
+     */
+    private void assertDocumentTypeHandled(String documentMimetype)
+    {
+       assertFalse(canTransform(MimetypeMap.MIMETYPE_TEXT_PLAIN, documentMimetype));
+       assertTrue(canTransform(documentMimetype, MimetypeMap.MIMETYPE_TEXT_PLAIN));
+       assertTrue(canTransform(documentMimetype, MimetypeMap.MIMETYPE_HTML));
+       assertTrue(canTransform(documentMimetype, MimetypeMap.MIMETYPE_XML));
+    }
+
+    /**
+     * Checks that the given type cannot be turned into plain text,
+     *  HTML or XML.
+     */
+    private void assertMediaTypeRejected(String mediaMimetype)
+    {
+       assertFalse(canTransform(mediaMimetype, MimetypeMap.MIMETYPE_TEXT_PLAIN));
+       assertFalse(canTransform(mediaMimetype, MimetypeMap.MIMETYPE_HTML));
+       assertFalse(canTransform(mediaMimetype, MimetypeMap.MIMETYPE_XML));
+    }
+
+    /**
+     * Asks the transformer about one conversion, using a fresh
+     *  set of default options.
+     */
+    private boolean canTransform(String sourceMimetype, String targetMimetype)
+    {
+       return transformer.isTransformable(sourceMimetype, targetMimetype, new TransformationOptions());
+    }
+}
diff --git a/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformer.java b/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformer.java
new file mode 100644
index 0000000000..2ef2c4992c
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformer.java
@@ -0,0 +1,192 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.transform;
+
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.Arrays;
+import java.util.List;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+
+import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.service.cmr.repository.ContentReader;
+import org.alfresco.service.cmr.repository.ContentWriter;
+import org.alfresco.service.cmr.repository.TransformationOptions;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Provides helpful services for {@link org.alfresco.repo.content.transform.ContentTransformer}
+ * implementations which are powered by Apache Tika.
+ *
+ * To use Tika to transform some content into Text, Html or XML, create an
+ * implementation of this / use the Auto Detect transformer.
+ *
+ * Subclasses supply the list of source mimetypes they accept and the Tika
+ * {@link Parser} to use; this class decides whether a transformation is
+ * supported, builds the SAX {@link ContentHandler} that produces the output,
+ * and runs the parse while managing the content streams.
+ *
+ * For now, all transformers are registered as regular, rather than explicit
+ * transformations. This should allow you to register your own explicit
+ * transformers and have them nicely take priority.
+ *
+ * @author Nick Burch
+ */
+public abstract class TikaPoweredContentTransformer extends AbstractContentTransformer2
+{
+    private static final Log logger = LogFactory.getLog(TikaPoweredContentTransformer.class);
+
+    /**
+     * The source mimetypes this transformer will accept, as handed to
+     * the constructor.
+     */
+    protected List<String> sourceMimeTypes;
+
+    /**
+     * Windows carriage return line feed pair.
+     */
+    protected static final String LINE_BREAK = "\r\n";
+
+    /**
+     * Message id used when the requested target format cannot be produced.
+     */
+    public static final String WRONG_FORMAT_MESSAGE_ID = "transform.err.format_or_password";
+
+    /**
+     * @param sourceMimeTypes the mimetypes this transformer can read from
+     */
+    protected TikaPoweredContentTransformer(List<String> sourceMimeTypes)
+    {
+        this.sourceMimeTypes = sourceMimeTypes;
+    }
+
+    /**
+     * Convenience constructor taking the supported source mimetypes as an array.
+     *
+     * @param sourceMimeTypes the mimetypes this transformer can read from
+     */
+    protected TikaPoweredContentTransformer(String[] sourceMimeTypes)
+    {
+        this(Arrays.asList(sourceMimeTypes));
+    }
+
+    /**
+     * Returns the correct Tika Parser to process
+     *  the document.
+     * If you don't know which you want, use
+     *  {@link TikaAutoContentTransformer} which
+     *  makes use of the Tika auto-detection.
+     */
+    protected abstract Parser getParser();
+
+    /**
+     * Can we do the requested transformation via Tika?
+     * We support transforming to HTML, XML or Text
+     *
+     * @param sourceMimetype the mimetype of the content to read
+     * @param targetMimetype the mimetype to produce
+     * @param options        the transformation options (not consulted here)
+     * @return <tt>true</tt> only if the source is one of the configured source
+     *         mimetypes and the target is plain text, HTML or XML
+     */
+    public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options)
+    {
+        if (!sourceMimeTypes.contains(sourceMimetype))
+        {
+            // The source isn't one of ours
+            return false;
+        }
+
+        // We support the source, so it all comes down to the target
+        return MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype) ||
+               MimetypeMap.MIMETYPE_HTML.equals(targetMimetype) ||
+               MimetypeMap.MIMETYPE_XML.equals(targetMimetype);
+    }
+
+    /**
+     * Returns an appropriate Tika ContentHandler for the
+     *  requested content type. Normally you'll let this
+     *  work as default, but if you need fine-grained
+     *  control of how the Tika events become text then
+     *  override and supply your own.
+     *
+     * @param targetMimeType the mimetype to produce
+     * @param output         where the handler should write the generated content
+     * @return a handler writing plain text, HTML or XML to <tt>output</tt>
+     * @throws TransformerConfigurationException if the SAX transformer handler
+     *         cannot be created
+     * @throws TransformerInfoException if the target mimetype is not one of
+     *         plain text, HTML or XML
+     */
+    protected ContentHandler getContentHandler(String targetMimeType, Writer output)
+        throws TransformerConfigurationException
+    {
+        if (MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimeType))
+        {
+            return new BodyContentHandler(output);
+        }
+
+        // Work out the serialisation method first, so that an unsupported
+        //  target is rejected before any XML machinery is created
+        final String outputMethod;
+        if (MimetypeMap.MIMETYPE_HTML.equals(targetMimeType))
+        {
+            outputMethod = "html";
+        }
+        else if (MimetypeMap.MIMETYPE_XML.equals(targetMimeType))
+        {
+            outputMethod = "xml";
+        }
+        else
+        {
+            throw new TransformerInfoException(
+                    WRONG_FORMAT_MESSAGE_ID,
+                    new IllegalArgumentException("Requested target type " + targetMimeType + " not supported")
+            );
+        }
+
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+                SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, outputMethod);
+        handler.setResult(new StreamResult(output));
+        return handler;
+    }
+
+    /**
+     * Parses the source content with the Tika parser from {@link #getParser()}
+     *  and writes the result, in the writer's mimetype and encoding, to the
+     *  target content.
+     *
+     * All streams are opened inside the guarded section, so they are released
+     *  even when setting up the writer or the content handler fails.
+     *
+     * @param reader  the source of the content to transform
+     * @param writer  the destination; its mimetype selects the output format
+     * @param options the transformation options (not consulted here)
+     * @throws Exception if the handler cannot be created, the parse fails or
+     *         the output cannot be flushed
+     */
+    public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options)
+        throws Exception
+    {
+        InputStream is = null;
+        OutputStream os = null;
+        Writer ow = null;
+        try
+        {
+            is = reader.getContentInputStream();
+            os = writer.getContentOutputStream();
+            String encoding = writer.getEncoding();
+            String targetMimeType = writer.getMimetype();
+
+            // May fail on an unsupported encoding; the finally block still
+            //  releases the streams opened above
+            ow = new OutputStreamWriter(os, encoding);
+
+            Parser parser = getParser();
+            Metadata metadata = new Metadata();
+            ParseContext context = new ParseContext();
+
+            ContentHandler handler = getContentHandler(targetMimeType, ow);
+            if (handler == null)
+            {
+                // An overriding implementation may have declined to supply one
+                throw new TransformerConfigurationException(
+                        "Unable to create Tika Handler for configured output " + targetMimeType
+                );
+            }
+
+            parser.parse(is, handler, metadata, context);
+
+            // Flush here, where a failure is reported to the caller, rather
+            //  than leaving it to the best-effort close below
+            ow.flush();
+        }
+        finally
+        {
+            closeQuietly(is, "input stream");
+            closeQuietly(ow, "output writer");
+            closeQuietly(os, "output stream");
+        }
+    }
+
+    /**
+     * Best-effort close which never throws, so that it cannot mask an
+     *  exception already propagating from the transformation. Failures
+     *  are reported at debug level.
+     *
+     * @param resource    the resource to close, may be <tt>null</tt>
+     * @param description what is being closed, for the log message
+     */
+    private static void closeQuietly(java.io.Closeable resource, String description)
+    {
+        if (resource == null)
+        {
+            return;
+        }
+        try
+        {
+            resource.close();
+        }
+        catch (Throwable e)
+        {
+            if (logger.isDebugEnabled())
+            {
+                logger.debug("Failed to close " + description + " after Tika transformation", e);
+            }
+        }
+    }
+}
diff --git a/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformerTest.java
new file mode 100644
index 0000000000..9751807049
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/transform/TikaPoweredContentTransformerTest.java
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.transform;
+
+import org.alfresco.repo.content.MimetypeMap;
+
+/**
+ * Parent test for Tika powered transformer tests
+ *
+ * @author Nick Burch
+ */
+public abstract class TikaPoweredContentTransformerTest extends AbstractContentTransformerTest
+{
+ protected boolean isQuickPhraseExpected(String targetMimetype)
+ {
+ return (
+ targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_PLAIN) ||
+ targetMimetype.equals(MimetypeMap.MIMETYPE_HTML) ||
+ targetMimetype.equals(MimetypeMap.MIMETYPE_XML)
+ );
+ }
+ protected boolean isQuickWordsExpected(String targetMimetype)
+ {
+ return (
+ targetMimetype.startsWith(StringExtractingContentTransformer.PREFIX_TEXT) ||
+ targetMimetype.equals(MimetypeMap.MIMETYPE_HTML) ||
+ targetMimetype.equals(MimetypeMap.MIMETYPE_XML)
+ );
+ }
+
+ /**
+ * Tests for html vs xml vs plain text
+ */
+ protected void additionalContentCheck(String sourceMimetype, String targetMimetype, String contents)
+ {
+ if(targetMimetype.equals(MimetypeMap.MIMETYPE_XML))
+ {
+ // Look for header and footer to confirm it was translated
+ assertTrue(
+ "XML header not found",
+ contents.contains("")
+ );
+ }
+ else if(targetMimetype.equals(MimetypeMap.MIMETYPE_HTML))
+ {
+ // Look for header and footer to confirm it was translated
+ assertFalse(
+ "XML header found but shouldn't be there for HTML",
+ contents.contains("")
+ );
+ }
+ else if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_PLAIN))
+ {
+ // Ensure it really is plain text not xml/html
+ assertFalse(
+ "XML header found but shouldn't be there for Plain Text",
+ contents.contains("")
+ );
+ }
+ }
+}