Initial Tika support for Text content transforms

The POI HSSF transformer has been updated to use Tika. A Tika auto-detect
 transformer has also been added, which handles a large number of
 previously unhandled cases. Unit tests verify this.


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20769 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-06-23 11:40:17 +00:00
parent 4ccc015f5f
commit f3a7a0aa7c
10 changed files with 670 additions and 233 deletions

View File

@@ -363,6 +363,10 @@
</property>
</bean>
<bean id="transformer.TikaAuto"
class="org.alfresco.repo.content.transform.TikaAutoContentTransformer"
parent="baseContentTransformer" />
<bean id="transformer.Poi"
class="org.alfresco.repo.content.transform.PoiHssfContentTransformer"
parent="baseContentTransformer" />

View File

@@ -28,6 +28,7 @@ import org.alfresco.repo.content.metadata.OpenOfficeMetadataExtracterTest;
import org.alfresco.repo.content.metadata.PdfBoxMetadataExtracterTest;
import org.alfresco.repo.content.metadata.PoiMetadataExtracterTest;
import org.alfresco.repo.content.metadata.RFC822MetadataExtracterTest;
import org.alfresco.repo.content.metadata.TikaAutoMetadataExtracterTest;
import org.alfresco.repo.content.transform.BinaryPassThroughContentTransformerTest;
import org.alfresco.repo.content.transform.ComplexContentTransformerTest;
import org.alfresco.repo.content.transform.ContentTransformerRegistryTest;
@@ -41,6 +42,7 @@ import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTe
import org.alfresco.repo.content.transform.StringExtractingContentTransformerTest;
import org.alfresco.repo.content.transform.TextMiningContentTransformerTest;
import org.alfresco.repo.content.transform.TextToPdfContentTransformerTest;
import org.alfresco.repo.content.transform.TikaAutoContentTransformerTest;
import org.alfresco.repo.content.transform.magick.ImageMagickContentTransformerTest;
import org.alfresco.util.ApplicationContextHelper;
import org.springframework.context.ApplicationContext;
@@ -91,6 +93,7 @@ public class ContentMinimalContextTestSuite extends TestSuite
suite.addTestSuite( PdfBoxMetadataExtracterTest.class );
suite.addTestSuite( PoiMetadataExtracterTest.class );
suite.addTestSuite( RFC822MetadataExtracterTest.class );
suite.addTestSuite( TikaAutoMetadataExtracterTest.class );
// Transform tests
suite.addTestSuite(BinaryPassThroughContentTransformerTest.class);
@@ -106,6 +109,7 @@ public class ContentMinimalContextTestSuite extends TestSuite
suite.addTestSuite(StringExtractingContentTransformerTest.class);
suite.addTestSuite(TextMiningContentTransformerTest.class);
suite.addTestSuite(TextToPdfContentTransformerTest.class);
suite.addTestSuite(TikaAutoContentTransformerTest.class);
suite.addTestSuite(ImageMagickContentTransformerTest.class);
return suite;

View File

@@ -52,6 +52,7 @@ public class MimetypeMap implements MimetypeService
public static final String MIMETYPE_TEXT_PLAIN = "text/plain";
public static final String MIMETYPE_TEXT_MEDIAWIKI = "text/mediawiki";
public static final String MIMETYPE_TEXT_CSS = "text/css";
public static final String MIMETYPE_TEXT_CSV = "text/csv";
public static final String MIMETYPE_TEXT_JAVASCRIPT = "text/javascript";
public static final String MIMETYPE_XML = "text/xml";
public static final String MIMETYPE_HTML = "text/html";

View File

@@ -238,6 +238,9 @@ public abstract class AbstractContentTransformerTest extends TestCase
" source: " + sourceReader + "\n" +
" target: " + targetWriter,
checkContent.contains(QUICK_CONTENT));
// Let subclasses do extra checks if they want
additionalContentCheck(sourceMimetype, targetMimetype, checkContent);
}
else if (isQuickWordsExpected(targetMimetype))
{
@@ -280,6 +283,13 @@ public abstract class AbstractContentTransformerTest extends TestCase
outputWriter.setEncoding("UTF8");
outputWriter.putContent(sb.toString());
}
/**
 * Allows implementations to do some extra checks on the
 * results of the content as found by
 * {@link #testAllConversions()}
 *
 * The default implementation is a no-op; subclasses override this
 * to add format-specific assertions on the transformed content.
 *
 * @param sourceMimetype the mimetype the content was transformed from
 * @param targetMimetype the mimetype the content was transformed to
 * @param contents the transformed content that was produced
 */
protected void additionalContentCheck(String sourceMimetype, String targetMimetype, String contents) {}
/**
* This method is an extension point for enabling/disabling an assertion that the "quick brown fox"

View File

@@ -18,267 +18,164 @@
*/
package org.alfresco.repo.content.transform;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Writer;
import java.util.regex.Pattern;
import javax.xml.transform.TransformerConfigurationException;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.util.RecordFormatException;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* Makes use of the {@link http://jakarta.apache.org/poi/ POI} library to
* perform conversions from Excel spreadsheets to text (comma separated).
* <p>
* While most text extraction from spreadsheets only extract the first sheet of
* the workbook, the method used here extracts the text from <b>all the sheets</b>.
* This is more useful, especially when it comes to indexing spreadsheets.
* <p>
* In the case where there is only one sheet in the document, the results will be
* exactly the same as most extractors. Where there are multiple sheets, the results
* will differ, but meaningful reimporting of the text document is not possible
* anyway.
* Uses {@link http://tika.apache.org/ Apache Tika} and
* {@link http://poi.apache.org/ Apache POI} to perform
* conversions from Excel spreadsheets.
* <p>Will transform from Excel spreadsheets into Html,
* Xml or Text (space or comma separated)
* <p>Handles all sheets in the file.
*
* TODO CSV Support
*
* @author Nick Burch
* @author Derek Hulley
*/
public class PoiHssfContentTransformer extends AbstractContentTransformer2
public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
{
/**
* Error message to delegate to NodeInfoBean
*/
public static final String WRONG_FORMAT_MESSAGE_ID = "transform.err.format_or_password";
/**
* Windows carriage return line feed pair.
*/
private static final String LINE_BREAK = "\r\n";
private static Log logger = LogFactory.getLog(PoiHssfContentTransformer.class);
/**
 * Creates the transformer, registering Excel as the only
 * supported source mimetype.
 */
public PoiHssfContentTransformer()
{
    super(new String[] {
        MimetypeMap.MIMETYPE_EXCEL
    });
}
/**
 * Returns the Tika parser for the Microsoft Office binary
 * formats, used here for Excel (.xls) content.
 */
@Override
protected Parser getParser()
{
    return new OfficeParser();
}
/**
* Currently the only transformation performed is that of text extraction from XLS documents.
* Can we do the requested transformation via Tika?
* We support transforming to HTML, XML, Text or CSV
*/
@Override
public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options)
{
if (!MimetypeMap.MIMETYPE_EXCEL.equals(sourceMimetype) ||
!MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
{
// only support XLS -> Text
return false;
}
else
{
return true;
}
if(sourceMimeTypes.contains(sourceMimetype) &&
MimetypeMap.MIMETYPE_TEXT_CSV.equals(targetMimetype))
{
// Special case for CSV
return true;
}
// Otherwise fall back on the default Tika rules
return super.isTransformable(sourceMimetype, targetMimetype, options);
}
public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options)
throws Exception
@Override
protected ContentHandler getContentHandler(String targetMimeType, Writer output)
throws TransformerConfigurationException
{
InputStream is = reader.getContentInputStream();
OutputStream os = writer.getContentOutputStream();
String encoding = writer.getEncoding();
try
{
// open the workbook
HSSFWorkbook workbook = new HSSFWorkbook(is);
// how many sheets are there?
int sheetCount = workbook.getNumberOfSheets();
// transform each sheet
for (int i = 0; i < sheetCount; i++)
{
HSSFSheet sheet = workbook.getSheetAt(i);
String sheetName = workbook.getSheetName(i);
writeSheet(os, sheet, encoding);
// write the sheet name
PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
PoiHssfContentTransformer.writeString(os, encoding, "End of sheet: " + sheetName, true);
PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
}
}
catch (RecordFormatException ex)
{
// Catching specific exception to propagate it to NodeInfoBean
// to fix issue https://issues.alfresco.com/jira/browse/ETWOTWO-440
logger.error(ex);
throw new TransformerInfoException(WRONG_FORMAT_MESSAGE_ID, ex);
}
finally
{
if (is != null)
{
try { is.close(); } catch (Throwable e) {}
}
if (os != null)
{
try { os.close(); } catch (Throwable e) {}
}
}
if(MimetypeMap.MIMETYPE_TEXT_CSV.equals(targetMimeType))
{
return new CsvContentHandler(output);
}
// Otherwise use the normal Tika rules
return super.getContentHandler(targetMimeType, output);
}
/**
* Dumps the text from the sheet to the stream in CSV format
* A wrapper around the normal Tika BodyContentHandler,
* which causes things to be CSV encoded rather than
* tab separated
* TODO Get rid of the extra tabs that crop up
*/
private void writeSheet(OutputStream os, HSSFSheet sheet, String encoding) throws Exception
{
int rows = sheet.getLastRowNum();
// transform each row
for (int i = 0; i <= rows; i++)
{
HSSFRow row = sheet.getRow(i);
if (row != null)
{
writeRow(os, row, encoding);
}
// break between rows
if (i < rows)
{
PoiHssfContentTransformer.writeString(os, encoding, LINE_BREAK, false);
}
}
}
private void writeRow(OutputStream os, HSSFRow row, String encoding) throws Exception
{
short firstCellNum = row.getFirstCellNum();
short lastCellNum = row.getLastCellNum();
// pad out to first cell
for (int i = 0; i < firstCellNum; i++)
{
PoiHssfContentTransformer.writeString(os, encoding, ",", false); // CSV up to first cell
}
// write each cell
for (int i = 0; i <= lastCellNum; i++)
{
HSSFCell cell = row.getCell(i);
if (cell != null)
{
int cellType = cell.getCellType();
protected static class CsvContentHandler extends BodyContentHandler {
private static final char[] comma = new char[]{ ',' };
private static final Pattern all_nums = Pattern.compile("[\\d\\.\\-\\+]+");
private boolean inCell = false;
private boolean needsComma = false;
protected CsvContentHandler(Writer output) {
super(output);
}
StringBuilder sb = new StringBuilder(10);
switch (cellType)
{
case HSSFCell.CELL_TYPE_BLANK:
// ignore
break;
case HSSFCell.CELL_TYPE_BOOLEAN:
sb.append(cell.getBooleanCellValue());
break;
case HSSFCell.CELL_TYPE_ERROR:
sb.append("ERROR");
break;
case HSSFCell.CELL_TYPE_NUMERIC:
sb.append(cell.getNumericCellValue());
break;
case HSSFCell.CELL_TYPE_STRING:
sb.append(cell.getStringCellValue());
break;
case HSSFCell.CELL_TYPE_FORMULA:
final int formulaResultType = cell.getCachedFormulaResultType();
if (HSSFCell.CELL_TYPE_NUMERIC == formulaResultType)
{
sb.append(cell.getNumericCellValue());
}
else if (HSSFCell.CELL_TYPE_STRING == formulaResultType)
{
sb.append(cell.getStringCellValue());
}
else if (HSSFCell.CELL_TYPE_BOOLEAN == formulaResultType)
{
sb.append(cell.getBooleanCellValue());
}
else if (HSSFCell.CELL_TYPE_ERROR == formulaResultType)
{
sb.append(cell.getErrorCellValue());
}
else
{
throw new RuntimeException("Unknown formula result type: " + formulaResultType);
}
break;
default:
throw new RuntimeException("Unknown HSSF cell type: " + cell);
}
String data = sb.toString();
PoiHssfContentTransformer.writeString(os, encoding, data, true);
}
// comma separate if required
if (i < lastCellNum)
@Override
public void characters(char[] ch, int start, int length)
throws SAXException {
if(inCell) {
StringBuffer t = new StringBuffer(new String(ch,start,length));
// Quote if not all numbers
if(all_nums.matcher(t).matches())
{
PoiHssfContentTransformer.writeString(os, encoding, ",", false);
super.characters(ch, start, length);
}
}
}
/**
* Writes the given data to the stream using the encoding specified. If the encoding
* is not given, the default <tt>String</tt> to <tt>byte[]</tt> conversion will be
* used.
* <p>
* The given data string will be escaped appropriately.
*
* @param os the stream to write to
* @param encoding the encoding to use, or null if the default encoding is acceptable
* @param value the string to write
* @param isData true if the value represents a human-readable string, false if the
* value represents formatting characters, separating characters, etc.
* @throws Exception
*/
public static void writeString(OutputStream os, String encoding, String value, boolean isData) throws Exception
{
if (value == null)
{
// nothing to do
return;
}
int dataLength = value.length();
if (dataLength == 0)
{
// nothing to do
return;
}
// escape the string
StringBuilder sb = new StringBuilder(dataLength + 5); // slightly longer than the data
for (int i = 0; i < dataLength; i++)
{
char currentChar = value.charAt(i);
if (currentChar == '\"') // inverted commas
else
{
sb.append("\""); // CSV escaping of inverted commas
for(int i=t.length()-1; i>=0; i--) {
if(t.charAt(i) == '\"') {
// Double up double quotes
t.insert(i, '\"');
i--;
}
}
t.insert(0, '\"');
t.append('\"');
char[] c = t.toString().toCharArray();
super.characters(c, 0, c.length);
}
// append the char
sb.append(currentChar);
}
// enclose in inverted commas for safety
if (isData)
{
sb.insert(0, "\"");
sb.append("\"");
}
// escaping complete
value = sb.toString();
byte[] bytes = null;
if (encoding == null)
{
// use default encoding
bytes = value.getBytes();
}
else
{
bytes = value.getBytes(encoding);
}
// write to the stream
os.write(bytes);
// done
} else {
super.characters(ch, start, length);
}
}
@Override
public void startElement(String uri, String localName, String name,
Attributes atts) throws SAXException {
if(localName.equals("td")) {
localName = "span";
name = "span";
inCell = true;
if(needsComma) {
super.characters(comma, 0, 1);
needsComma = true;
}
}
super.startElement(uri, localName, name, atts);
}
@Override
public void endElement(String uri, String localName, String name)
throws SAXException {
if(localName.equals("td")) {
localName = "span";
name = "span";
needsComma = true;
inCell = false;
}
if(localName.equals("tr")) {
needsComma = false;
}
super.endElement(uri, localName, name);
}
}
}

View File

@@ -22,8 +22,10 @@ import java.io.File;
import java.io.InputStream;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.filestore.FileContentReader;
import org.alfresco.repo.content.filestore.FileContentWriter;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.alfresco.util.TempFileProvider;
@@ -32,7 +34,7 @@ import org.alfresco.util.TempFileProvider;
*
* @author Derek Hulley
*/
public class PoiHssfContentTransformerTest extends AbstractContentTransformerTest
public class PoiHssfContentTransformerTest extends TikaPoweredContentTransformerTest
{
private ContentTransformer transformer;
@@ -56,12 +58,52 @@ public class PoiHssfContentTransformerTest extends AbstractContentTransformerTes
{
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_EXCEL, new TransformationOptions()));
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_TEXT_CSV, new TransformationOptions()));
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
}
/**
public void testCsvOutput() throws Exception
{
File sourceFile = AbstractContentTransformerTest.loadQuickTestFile("xls");
ContentReader sourceReader = new FileContentReader(sourceFile);
File targetFile = TempFileProvider.createTempFile(
getClass().getSimpleName() + "_" + getName() + "_xls_",
".csv");
ContentWriter targetWriter = new FileContentWriter(targetFile);
sourceReader.setMimetype(MimetypeMap.MIMETYPE_EXCEL);
targetWriter.setMimetype(MimetypeMap.MIMETYPE_TEXT_CSV);
transformer.transform(sourceReader, targetWriter);
ContentReader targetReader = targetWriter.getReader();
String checkContent = targetReader.getContentString();
System.err.println(checkContent);
}
@Override
protected void additionalContentCheck(String sourceMimetype,
String targetMimetype, String contents) {
if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_CSV)) {
System.err.println(contents);
} else {
super.additionalContentCheck(sourceMimetype, targetMimetype, contents);
}
}
@Override
protected boolean isQuickPhraseExpected(String targetMimetype) {
if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_CSV)) {
return true;
}
return super.isQuickPhraseExpected(targetMimetype);
}
/**
* Tests a specific failure in the library
*/
public void xtestBugFixAR114() throws Exception
public void xxtestBugFixAR114() throws Exception
{
File tempFile = TempFileProvider.createTempFile(
getClass().getSimpleName() + "_" + getName() + "_",

View File

@@ -0,0 +1,95 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.transform;
import java.util.ArrayList;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
/**
* A Content Extractor for XML, HTML and Text,
* which makes use of the Apache Tika
* auto-detection to select the best parser
* to process your document.
* This will be used for all files which Tika can
* handle, but where no other more explicit
* extractor is defined.
*
* @author Nick Burch
*/
public class TikaAutoContentTransformer extends TikaPoweredContentTransformer
{
/**
* We support all the mimetypes that the Tika
* auto-detect parser can handle, except for
* Image, Audio and Video ones which don't
* make much sense
*/
public static ArrayList<String> SUPPORTED_MIMETYPES;
static {
SUPPORTED_MIMETYPES = new ArrayList<String>();
AutoDetectParser p = new AutoDetectParser();
for(MediaType mt : p.getParsers().keySet()) {
if(mt.toString().startsWith("application/vnd.oasis.opendocument.formula")) {
// TODO Tika support for quick.odf, mimetype=application/vnd.oasis.opendocument.formula
// TODO Tika support for quick.otf, mimetype=application/vnd.oasis.opendocument.formula-template
continue;
}
if(mt.toString().startsWith("application/vnd.oasis.opendocument.graphics")) {
// TODO Tika support for quick.odg, mimetype=application/vnd.oasis.opendocument.graphics
// TODO Tika support for quick.otg, mimetype=application/vnd.oasis.opendocument.graphics-template
continue;
}
if(mt.getType().equals("image") ||
mt.getType().equals("audio") ||
mt.getType().equals("video") ||
mt.toString().equals("application/zip") ||
mt.toString().equals("application/tar"))
{
// Skip these, as Tika mostly just does
// metadata rather than content
}
else
{
// Tika can probably do some useful text
SUPPORTED_MIMETYPES.add( mt.toString() );
}
}
}
public TikaAutoContentTransformer()
{
super(SUPPORTED_MIMETYPES);
}
/**
* Returns the Tika Auto-Detection
* parser, which will try to
* process all documents that Tika
* knows about
*/
protected Parser getParser()
{
return new AutoDetectParser();
}
}

View File

@@ -0,0 +1,91 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.transform;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.TransformationOptions;
/**
 * Most of the work for testing the Tika Auto-Detect transformer
 * is automatically done by {@link AbstractContentTransformerTest}
 *
 * @see org.alfresco.repo.content.transform.TikaAutoContentTransformer
 *
 * @author Nick Burch
 */
public class TikaAutoContentTransformerTest extends TikaPoweredContentTransformerTest
{
   private ContentTransformer transformer;

   @Override
   public void setUp() throws Exception
   {
      super.setUp();
      transformer = new TikaAutoContentTransformer();
   }

   /**
    * @return Returns the same transformer regardless - it is allowed
    */
   protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
   {
      return transformer;
   }

   /**
    * Checks that the given source mimetype can be turned into
    * text, html and xml, but that the reverse transformation
    * from plain text is rejected.
    */
   private void checkSupportedSource(String sourceMimetype)
   {
      assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, sourceMimetype, new TransformationOptions()));
      assertTrue(transformer.isTransformable(sourceMimetype, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
      assertTrue(transformer.isTransformable(sourceMimetype, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
      assertTrue(transformer.isTransformable(sourceMimetype, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
   }

   /**
    * Checks that the given source mimetype is rejected for
    * all three of the text, html and xml targets.
    */
   private void checkUnsupportedSource(String sourceMimetype)
   {
      assertFalse(transformer.isTransformable(sourceMimetype, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
      assertFalse(transformer.isTransformable(sourceMimetype, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
      assertFalse(transformer.isTransformable(sourceMimetype, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
   }

   /**
    * Ensure we picked up a mixture of content
    * types from Tika
    */
   public void testIsTransformable() throws Exception
   {
      // Excel (but this isn't normally used)
      checkSupportedSource(MimetypeMap.MIMETYPE_EXCEL);
      // Word
      checkSupportedSource(MimetypeMap.MIMETYPE_WORD);
      // PDF
      checkSupportedSource(MimetypeMap.MIMETYPE_PDF);
      // Open Office
      checkSupportedSource(MimetypeMap.MIMETYPE_OPENDOCUMENT_PRESENTATION);
      // We don't do images
      checkUnsupportedSource(MimetypeMap.MIMETYPE_IMAGE_JPEG);
      // Ditto music
      checkUnsupportedSource(MimetypeMap.MIMETYPE_MP3);
   }
}

View File

@@ -0,0 +1,192 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.transform;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Arrays;
import java.util.List;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
/**
 * Provides helpful services for {@link org.alfresco.repo.content.transform.ContentTransformer}
 * implementations which are powered by Apache Tika.
 *
 * To use Tika to transform some content into Text, Html or XML, create an
 * implementation of this / use the Auto Detect transformer.
 *
 * For now, all transformers are registered as regular, rather than explicit
 * transformations. This should allow you to register your own explicit
 * transformers and have them nicely take priority.
 *
 * @author Nick Burch
 */
public abstract class TikaPoweredContentTransformer extends AbstractContentTransformer2
{
   private static final Log logger = LogFactory.getLog(TikaPoweredContentTransformer.class);

   /** The source mimetypes this transformer will accept. */
   protected List<String> sourceMimeTypes;

   /**
    * Windows carriage return line feed pair.
    */
   protected static final String LINE_BREAK = "\r\n";

   /** Error message id used when the format is wrong or the file is password protected. */
   public static final String WRONG_FORMAT_MESSAGE_ID = "transform.err.format_or_password";

   protected TikaPoweredContentTransformer(List<String> sourceMimeTypes) {
      this.sourceMimeTypes = sourceMimeTypes;
   }
   protected TikaPoweredContentTransformer(String[] sourceMimeTypes) {
      this(Arrays.asList(sourceMimeTypes));
   }

   /**
    * Returns the correct Tika Parser to process
    * the document.
    * If you don't know which you want, use
    * {@link TikaAutoContentTransformer} which
    * makes use of the Tika auto-detection.
    */
   protected abstract Parser getParser();

   /**
    * Can we do the requested transformation via Tika?
    * We support transforming to HTML, XML or Text
    */
   public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options)
   {
      if(! sourceMimeTypes.contains(sourceMimetype))
      {
         // The source isn't one of ours
         return false;
      }
      if(MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype) ||
         MimetypeMap.MIMETYPE_HTML.equals(targetMimetype) ||
         MimetypeMap.MIMETYPE_XML.equals(targetMimetype))
      {
         // We can output to this
         return true;
      }
      else
      {
         // We support the source, but not the target
         return false;
      }
   }

   /**
    * Returns an appropriate Tika ContentHandler for the
    * requested content type. Normally you'll let this
    * work as default, but if you need fine-grained
    * control of how the Tika events become text then
    * override and supply your own.
    *
    * @param targetMimeType the mimetype being written to
    * @param output where the transformed content should go
    * @throws TransformerInfoException if the target type is not text, html or xml
    */
   protected ContentHandler getContentHandler(String targetMimeType, Writer output)
        throws TransformerConfigurationException
   {
      if(MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimeType))
      {
         return new BodyContentHandler(output);
      }

      SAXTransformerFactory factory = (SAXTransformerFactory)
            SAXTransformerFactory.newInstance();
      TransformerHandler handler = factory.newTransformerHandler();
      handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
      handler.setResult(new StreamResult(output));

      if(MimetypeMap.MIMETYPE_HTML.equals(targetMimeType))
      {
         handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
      }
      else if(MimetypeMap.MIMETYPE_XML.equals(targetMimeType))
      {
         handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
      }
      else
      {
         throw new TransformerInfoException(
               WRONG_FORMAT_MESSAGE_ID,
               new IllegalArgumentException("Requested target type " + targetMimeType + " not supported")
         );
      }
      return handler;
   }

   /**
    * Performs the transformation by streaming the source content
    * through the Tika parser into the appropriate ContentHandler
    * for the target mimetype.
    */
   public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options)
        throws Exception
   {
      InputStream is = reader.getContentInputStream();
      OutputStream os = writer.getContentOutputStream();
      String encoding = writer.getEncoding();
      String targetMimeType = writer.getMimetype();

      // The writer may not have an encoding set; fall back on the
      //  platform default rather than hitting a NullPointerException
      //  in the OutputStreamWriter constructor
      Writer ow;
      if(encoding == null)
      {
         ow = new OutputStreamWriter(os);
      }
      else
      {
         ow = new OutputStreamWriter(os, encoding);
      }

      Parser parser = getParser();
      Metadata metadata = new Metadata();
      ParseContext context = new ParseContext();

      ContentHandler handler = getContentHandler(targetMimeType, ow);
      if(handler == null)
      {
         throw new TransformerConfigurationException(
               "Unable to create Tika Handler for configured output " + targetMimeType
         );
      }

      try {
         parser.parse(is, handler, metadata, context);
      }
      finally
      {
         if (is != null)
         {
            try { is.close(); } catch (Throwable e) {}
         }
         if (ow != null)
         {
            try { ow.close(); } catch (Throwable e) {}
         }
         if (os != null)
         {
            try { os.close(); } catch (Throwable e) {}
         }
      }
   }
}

View File

@@ -0,0 +1,101 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.transform;
import org.alfresco.repo.content.MimetypeMap;
/**
 * Parent test for Tika powered transformer tests
 *
 * @author Nick Burch
 */
public abstract class TikaPoweredContentTransformerTest extends AbstractContentTransformerTest
{
   protected boolean isQuickPhraseExpected(String targetMimetype)
   {
      return targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_PLAIN)
          || targetMimetype.equals(MimetypeMap.MIMETYPE_HTML)
          || targetMimetype.equals(MimetypeMap.MIMETYPE_XML);
   }

   protected boolean isQuickWordsExpected(String targetMimetype)
   {
      return targetMimetype.startsWith(StringExtractingContentTransformer.PREFIX_TEXT)
          || targetMimetype.equals(MimetypeMap.MIMETYPE_HTML)
          || targetMimetype.equals(MimetypeMap.MIMETYPE_XML);
   }

   /**
    * Tests for html vs xml vs plain text
    */
   protected void additionalContentCheck(String sourceMimetype, String targetMimetype, String contents)
   {
      // Probe once for the markers that distinguish the three formats
      boolean hasXmlDeclaration = contents.contains("<?xml version=");
      boolean hasHtmlOpeningTag = contents.contains("<html");
      boolean hasHtmlClosingTag = contents.contains("</html>");

      if(targetMimetype.equals(MimetypeMap.MIMETYPE_XML))
      {
         // Look for header and footer to confirm it was translated
         assertTrue("XML header not found", hasXmlDeclaration);
         assertTrue("XHTML header not found", hasHtmlOpeningTag);
         assertTrue("XHTML footer not found", hasHtmlClosingTag);
      }
      else if(targetMimetype.equals(MimetypeMap.MIMETYPE_HTML))
      {
         // Look for header and footer to confirm it was translated
         assertFalse("XML header found but shouldn't be there for HTML", hasXmlDeclaration);
         assertTrue("HTML header not found", hasHtmlOpeningTag);
         assertTrue("HTML footer not found", hasHtmlClosingTag);
      }
      else if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_PLAIN))
      {
         // Ensure it really is plain text not xml/html
         assertFalse("XML header found but shouldn't be there for Plain Text", hasXmlDeclaration);
         assertFalse("XHTML header found but shouldn't be there for Plain Text", hasHtmlOpeningTag);
         assertFalse("XHTML footer found but shouldn't be there for Plain Text", hasHtmlClosingTag);
      }
   }
}