Tika content transformer support for OOXML office

Enable explicit Tika content transform for OOXML files. Allow the Excel transformer (which does CSV as well as text/html) to handle .xlsx as well as .xls. Also update the .doc parser test to ensure that the older Word 6 and Word 95 files are correctly handled too. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20781 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
2025-10-15 15:02:20 +00:00 · 2010-06-23 15:51:03 +00:00
parent 228d111c56
commit 325f8e7923
10 changed files with 393 additions and 121 deletions
--- a/config/alfresco/content-services-context.xml
+++ b/config/alfresco/content-services-context.xml
@@ -377,6 +377,11 @@
         class="org.alfresco.repo.content.transform.PoiContentTransformer"
         parent="baseContentTransformer" />

+   <!-- This one handles the newer OOXML office formats, such as .xlsx and .docx -->
+   <bean id="transformer.OOXML"
+         class="org.alfresco.repo.content.transform.PoiOOXMLContentTransformer"
+         parent="baseContentTransformer" />
+
   <bean id="transformer.TextMining"
         class="org.alfresco.repo.content.transform.TextMiningContentTransformer"
         parent="baseContentTransformer" >
--- a/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java
+++ b/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java
@@ -39,6 +39,7 @@ import org.alfresco.repo.content.transform.OpenOfficeContentTransformerTest;
 import org.alfresco.repo.content.transform.PdfBoxContentTransformerTest;
 import org.alfresco.repo.content.transform.PoiContentTransformerTest;
 import org.alfresco.repo.content.transform.PoiHssfContentTransformerTest;
+import org.alfresco.repo.content.transform.PoiOOXMLContentTransformerTest;
 import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTest;
 import org.alfresco.repo.content.transform.StringExtractingContentTransformerTest;
 import org.alfresco.repo.content.transform.TextMiningContentTransformerTest;
@@ -107,6 +108,7 @@ public class ContentMinimalContextTestSuite extends TestSuite
       suite.addTestSuite(PdfBoxContentTransformerTest.class);
       suite.addTestSuite(PoiContentTransformerTest.class);
       suite.addTestSuite(PoiHssfContentTransformerTest.class);
+       suite.addTestSuite(PoiOOXMLContentTransformerTest.class);
       suite.addTestSuite(RuntimeExecutableContentTransformerTest.class);
       suite.addTestSuite(StringExtractingContentTransformerTest.class);
       suite.addTestSuite(TextMiningContentTransformerTest.class);
--- a/source/java/org/alfresco/repo/content/TikaOfficeDetectParser.java
+++ b/source/java/org/alfresco/repo/content/TikaOfficeDetectParser.java
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * {@link http://tika.apache.org/ Apache Tika} assumes that
+ *  you either know exactly what your content is, or that
+ *  you'll leave it to auto-detection.
+ * Within Alfresco, we usually do know. However, from time
+ *  to time, we don't know if we have one of the old or one
+ *  of the new office files (eg .xls and .xlsx).
+ * This class automatically selects the appropriate
+ *  old (OLE2) or new (OOXML) Tika parser as required.
+ *    
+ * @author Nick Burch
+ */
+public class TikaOfficeDetectParser implements Parser {
+   private final Parser ole2Parser = new OfficeParser();
+   private final Parser ooxmlParser = new OOXMLParser();
+
+   /** @return the union of the types the OLE2 and the OOXML parser each report */
+   public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
+      Set<MediaType> types = new HashSet<MediaType>();
+      types.addAll(ole2Parser.getSupportedTypes(parseContext));
+      types.addAll(ooxmlParser.getSupportedTypes(parseContext));
+      return types;
+   }
+
+   /**
+    * Peeks at the first 4 bytes, pushes them back, then delegates to the OOXML
+    *  parser if they are the OOXML file header, and to the OLE2 parser otherwise.
+    */
+   public void parse(InputStream stream, ContentHandler handler,
+         Metadata metadata, ParseContext parseContext)
+         throws IOException, SAXException, TikaException
+   {
+      PushbackInputStream inp = new PushbackInputStream(stream, 4);
+      byte[] initial4 = new byte[4];
+      // A short or empty stream gives fewer than 4 bytes (or -1): only unread what was read
+      int read = IOUtils.readFully(inp, initial4);
+      if(read > 0) { inp.unread(initial4, 0, read); }
+
+      // Which is it? Only a complete OOXML header selects the OOXML parser
+      if(read == initial4.length && initial4[0] == POIFSConstants.OOXML_FILE_HEADER[0] &&
+         initial4[1] == POIFSConstants.OOXML_FILE_HEADER[1] &&
+         initial4[2] == POIFSConstants.OOXML_FILE_HEADER[2] &&
+         initial4[3] == POIFSConstants.OOXML_FILE_HEADER[3]) {
+         ooxmlParser.parse(inp, handler, metadata, parseContext);
+      } else {
+         ole2Parser.parse(inp, handler, metadata, parseContext);
+      }
+   }
+
+   /** @deprecated This method will be removed in Apache Tika 1.0. */
+   public void parse(InputStream stream,
+         ContentHandler handler, Metadata metadata)
+         throws IOException, SAXException, TikaException
+   {
+      parse(stream, handler, metadata, new ParseContext());
+   }
+}
--- a/source/java/org/alfresco/repo/content/transform/AbstractContentTransformerTest.java
+++ b/source/java/org/alfresco/repo/content/transform/AbstractContentTransformerTest.java
@@ -111,14 +111,14 @@ public abstract class AbstractContentTransformerTest extends TestCase
     * Helper method to load one of the "The quick brown fox" files from the
     * classpath.
     * 
-     * @param extension the extension of the file required, e.g. <b>txt</b>
+     * @param quickname the name of the file required, eg <b>quick.txt</b>
     * @return Returns a test resource loaded from the classpath or <tt>null</tt> if
     *      no resource could be found.
     * @throws IOException
     */
-    public static File loadQuickTestFile(String extension) throws IOException
+    public static File loadNamedQuickTestFile(String quickname) throws IOException
    {
-        URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/quick." + extension);
+        URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + quickname);
        if (url == null)
        {
            return null;
@@ -130,6 +130,34 @@ public abstract class AbstractContentTransformerTest extends TestCase
        }
        return file;
    }
+    /**
+     * Helper method to load one of the "The quick brown fox" files from the
+     * classpath, given only its extension.
+     * 
+     * @param extension the file extension required, eg <b>txt</b> for the file quick.txt
+     * @return Returns a test resource loaded from the classpath or <tt>null</tt> if
+     *      no resource could be found.
+     * @throws IOException
+     */
+    public static File loadQuickTestFile(String extension) throws IOException
+    {
+       return loadNamedQuickTestFile("quick."+extension);
+    }
+    
+    /**
+     * For the given mime type, returns one or more quick* files to be tested.
+     * By default this is just "quick." + the extension the mimetype service
+     *  gives for that type. Override this if you need special rules, eg
+     *  quickOld.foo, quickMid.foo and quickNew.foo for differing format versions.
+     * @param sourceMimetype the mimetype to find test files for
+     * @return the names of the quick* files to test for that mimetype
+     */
+    protected String[] getQuickFilenames(String sourceMimetype) {
+       String sourceExtension = mimetypeService.getExtension(sourceMimetype);
+       return new String[] {
+             "quick." + sourceExtension
+       };
+    }

    /**
     * Tests the full range of transformations available on the
@@ -160,120 +188,124 @@ public abstract class AbstractContentTransformerTest extends TestCase
        for (String sourceMimetype : mimetypes)
        {
            // attempt to get a source file for each mimetype
-            String sourceExtension = mimetypeService.getExtension(sourceMimetype);
+            String[] quickFiles = getQuickFilenames(sourceMimetype);
+            sb.append("   Source Files: ").append(quickFiles).append("\n");

-            sb.append("   Source Extension: ").append(sourceExtension).append("\n");
-            
-            // attempt to convert to every other mimetype
-            for (String targetMimetype : mimetypes)
+            for (String quickFile : quickFiles)
            {
-            	if (sourceMimetype.equals(targetMimetype))
-            	{
-            		// Don't test like-to-like transformations
-            		continue;
-            	}
-                ContentWriter targetWriter = null;
-                // construct a reader onto the source file
-                String targetExtension = mimetypeService.getExtension(targetMimetype);
+               String sourceExtension = quickFile.substring(quickFile.lastIndexOf('.')+1);
               
-                // must we test the transformation?
-                ContentTransformer transformer = getTransformer(sourceMimetype, targetMimetype);
-                if (transformer == null || transformer.isTransformable(sourceMimetype, targetMimetype, null) == false)
-                {
-                    // no transformer
-                    continue;
-                }
+               // attempt to convert to every other mimetype
+               for (String targetMimetype : mimetypes)
+               {
+               	if (sourceMimetype.equals(targetMimetype))
+               	{
+               		// Don't test like-to-like transformations
+               		continue;
+               	}
+                   ContentWriter targetWriter = null;
+                   // construct a reader onto the source file
+                   String targetExtension = mimetypeService.getExtension(targetMimetype);
                   
-                if (isTransformationExcluded(sourceExtension, targetExtension))
-                {
-                	continue;
-                }
+                   // must we test the transformation?
+                   ContentTransformer transformer = getTransformer(sourceMimetype, targetMimetype);
+                   if (transformer == null || transformer.isTransformable(sourceMimetype, targetMimetype, null) == false)
+                   {
+                       // no transformer
+                       continue;
+                   }
                   
-                // dump
-                sb.append("      Target Extension: ").append(targetExtension);
-                sb.append(" <").append(transformer.getClass().getSimpleName()).append(">");
+                   if (isTransformationExcluded(sourceExtension, targetExtension))
+                   {
+                   	continue;
+                   }
   
-                // is there a test file for this conversion?
-                File sourceFile = AbstractContentTransformerTest.loadQuickTestFile(sourceExtension);
-                if (sourceFile == null)
-                {
-                    sb.append(" <no source test file>\n");
-                    continue;  // no test file available for that extension
-                }
-                ContentReader sourceReader = new FileContentReader(sourceFile);
+                   // dump
+                   sb.append("      Target Extension: ").append(targetExtension);
+                   sb.append(" <").append(transformer.getClass().getSimpleName()).append(">");
   
-                // perform the transformation several times so that we get a good idea of performance
-                int count = 0;
-                long before = System.currentTimeMillis();
-                Set<String> transformerClasses = new HashSet<String>(2);
-                for (int i = 0; i < 5; i++)
-                {
-                    // get the transformer repeatedly as it might be different each time around
-                    transformer = getTransformer(sourceMimetype, targetMimetype);
-                    // must we report on this class?
-                    if (!transformerClasses.contains(transformer.getClass().getName()))
-                    {
-                        transformerClasses.add(transformer.getClass().getName());
-                        sb.append(" <").append(transformer.getClass().getSimpleName()).append(">");
-                    }
+                   // is there a test file for this conversion?
+                   File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile(quickFile);
+                   if (sourceFile == null)
+                   {
+                       sb.append(" <no source test file>\n");
+                       continue;  // no test file available for that extension
+                   }
+                   ContentReader sourceReader = new FileContentReader(sourceFile);
   
-                    // make a writer for the target file
-                    File targetFile = TempFileProvider.createTempFile(
-                            getClass().getSimpleName() + "_" + getName() + "_" + sourceExtension + "_",
-                            "." + targetExtension);
-                    targetWriter = new FileContentWriter(targetFile);
+                   // perform the transformation several times so that we get a good idea of performance
+                   int count = 0;
+                   long before = System.currentTimeMillis();
+                   Set<String> transformerClasses = new HashSet<String>(2);
+                   for (int i = 0; i < 5; i++)
+                   {
+                       // get the transformer repeatedly as it might be different each time around
+                       transformer = getTransformer(sourceMimetype, targetMimetype);
+                       // must we report on this class?
+                       if (!transformerClasses.contains(transformer.getClass().getName()))
+                       {
+                           transformerClasses.add(transformer.getClass().getName());
+                           sb.append(" <").append(transformer.getClass().getSimpleName()).append(">");
+                       }
   
-                    // do the transformation
-                    sourceReader.setMimetype(sourceMimetype);
-                    targetWriter.setMimetype(targetMimetype);
-                    transformer.transform(sourceReader.getReader(), targetWriter);
+                       // make a writer for the target file
+                       File targetFile = TempFileProvider.createTempFile(
+                               getClass().getSimpleName() + "_" + getName() + "_" + sourceExtension + "_",
+                               "." + targetExtension);
+                       targetWriter = new FileContentWriter(targetFile);
                       
-                    // if the target format is any type of text, then it must contain the 'quick' phrase
-                    if (isQuickPhraseExpected(targetMimetype))
-                    {
-                        ContentReader targetReader = targetWriter.getReader();
-                        String checkContent = targetReader.getContentString();
-                        assertTrue("Quick phrase not present in document converted to text: \n" +
-                                "   transformer: " + transformer + "\n" +
-                                "   source: " + sourceReader + "\n" +
-                                "   target: " + targetWriter,
-                                checkContent.contains(QUICK_CONTENT));
+                       // do the transformation
+                       sourceReader.setMimetype(sourceMimetype);
+                       targetWriter.setMimetype(targetMimetype);
+                       transformer.transform(sourceReader.getReader(), targetWriter);
                       
-                        // Let subclasses do extra checks if they want
-                        additionalContentCheck(sourceMimetype, targetMimetype, checkContent);
-                    }
-                    else if (isQuickWordsExpected(targetMimetype))
-                    {
-                        ContentReader targetReader = targetWriter.getReader();
-                        String checkContent = targetReader.getContentString();
-                        // essentially check that FTS indexing can use the conversion properly
-                        for (int word = 0; word < QUICK_WORDS.length; word++)
-                        {
-                            assertTrue("Quick phrase word not present in document converted to text: \n" +
-                                    "   transformer: " + transformer + "\n" +
-                                    "   source: " + sourceReader + "\n" +
-                                    "   target: " + targetWriter + "\n" +
-                                    "   word: " + word,
-                                    checkContent.contains(QUICK_WORDS[word]));
-                        }
-                    }
-                    // increment count
-                    count++;
-                }
-                long after = System.currentTimeMillis();
-                double average = (double) (after - before) / (double) count;
+                       // if the target format is any type of text, then it must contain the 'quick' phrase
+                       if (isQuickPhraseExpected(targetMimetype))
+                       {
+                           ContentReader targetReader = targetWriter.getReader();
+                           String checkContent = targetReader.getContentString();
+                           assertTrue("Quick phrase not present in document converted to text: \n" +
+                                   "   transformer: " + transformer + "\n" +
+                                   "   source: " + sourceReader + "\n" +
+                                   "   target: " + targetWriter,
+                                   checkContent.contains(QUICK_CONTENT));
                           
-                // dump
-                sb.append(String.format(" average %10.0f ms", average)).append("\n");
+                           // Let subclasses do extra checks if they want
+                           additionalContentCheck(sourceMimetype, targetMimetype, checkContent);
+                       }
+                       else if (isQuickWordsExpected(targetMimetype))
+                       {
+                           ContentReader targetReader = targetWriter.getReader();
+                           String checkContent = targetReader.getContentString();
+                           // essentially check that FTS indexing can use the conversion properly
+                           for (int word = 0; word < QUICK_WORDS.length; word++)
+                           {
+                               assertTrue("Quick phrase word not present in document converted to text: \n" +
+                                       "   transformer: " + transformer + "\n" +
+                                       "   source: " + sourceReader + "\n" +
+                                       "   target: " + targetWriter + "\n" +
+                                       "   word: " + word,
+                                       checkContent.contains(QUICK_WORDS[word]));
+                           }
+                       }
+                       // increment count
+                       count++;
+                   }
+                   long after = System.currentTimeMillis();
+                   double average = (double) (after - before) / (double) count;
                   
-                if (logger.isDebugEnabled())
-                {
-                    logger.debug("Transformation performed " + count + " time: " +
-                            sourceMimetype + " --> " + targetMimetype + "\n" +
-                            "   source: " + sourceReader + "\n" +
-                            "   target: " + targetWriter + "\n" +
-                            "   transformer: " + getTransformer(sourceMimetype, targetMimetype));
-                }
+                   // dump
+                   sb.append(String.format(" average %10.0f ms", average)).append("\n");
+                   
+                   if (logger.isDebugEnabled())
+                   {
+                       logger.debug("Transformation performed " + count + " time: " +
+                               sourceMimetype + " --> " + targetMimetype + "\n" +
+                               "   source: " + sourceReader + "\n" +
+                               "   target: " + targetWriter + "\n" +
+                               "   transformer: " + getTransformer(sourceMimetype, targetMimetype));
+                   }
+               }
            }
        }
        
--- a/source/java/org/alfresco/repo/content/transform/PoiContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/PoiContentTransformer.java
@@ -46,7 +46,7 @@ public class PoiContentTransformer extends TikaPoweredContentTransformer
   public static ArrayList<String> SUPPORTED_MIMETYPES;
   static {
      SUPPORTED_MIMETYPES = new ArrayList<String>();
-      OfficeParser p = new OfficeParser();
+      Parser p = new OfficeParser();
      for(MediaType mt : p.getSupportedTypes(null)) {
         if(mt.toString().equals(MimetypeMap.MIMETYPE_EXCEL))
         {
--- a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java
@@ -24,11 +24,11 @@ import java.util.regex.Pattern;
 import javax.xml.transform.TransformerConfigurationException;

 import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.repo.content.TikaOfficeDetectParser;
 import org.alfresco.service.cmr.repository.TransformationOptions;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.microsoft.OfficeParser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
@@ -56,14 +56,15 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
    public PoiHssfContentTransformer() 
    {
       super(new String[] {
-             MimetypeMap.MIMETYPE_EXCEL
+             MimetypeMap.MIMETYPE_EXCEL,
+             MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET
       });
    }
    
    @Override
    protected Parser getParser() 
    {
-       return new OfficeParser();
+       return new TikaOfficeDetectParser();
    }
    
    /**
--- a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java
+++ b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java
@@ -46,7 +46,14 @@ public class PoiHssfContentTransformerTest extends TikaPoweredContentTransformer
        transformer = new PoiHssfContentTransformer();
    }
    
-    /**
+    @Override
+    protected String[] getQuickFilenames(String sourceMimetype) { // sourceMimetype is not consulted
+      return new String[] {
+            "quick.xls", "quick.xlsx" // one .xls and one .xlsx test file
+      };
+    }
+
+   /**
     * @return Returns the same transformer regardless - it is allowed
     */
    protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
--- a/source/java/org/alfresco/repo/content/transform/PoiOOXMLContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/PoiOOXMLContentTransformer.java
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.transform;
+
+import java.util.ArrayList;
+
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
+
+/**
+ * Uses {@link http://tika.apache.org/ Apache Tika} and
+ *  {@link http://poi.apache.org/ Apache POI} to perform
+ *  conversions from the newer OOXML Office documents.
+ *
+ * @author Nick Burch
+ */
+public class PoiOOXMLContentTransformer extends TikaPoweredContentTransformer
+{
+   /** 
+    * We support all the mimetypes that the Tika OOXML
+    *  parser reports it can handle; filled in once below
+    */
+   public static ArrayList<String> SUPPORTED_MIMETYPES;
+   static {
+      SUPPORTED_MIMETYPES = new ArrayList<String>();
+      Parser p = new OOXMLParser();
+      for(MediaType mt : p.getSupportedTypes(null)) { // no ParseContext is supplied here
+         SUPPORTED_MIMETYPES.add( mt.toString() );
+      }
+   }
+    /** Hands {@link #SUPPORTED_MIMETYPES} to the parent class constructor. */
+    public PoiOOXMLContentTransformer() {
+       super(SUPPORTED_MIMETYPES);
+    }
+    /** @return a new OOXMLParser instance on every call */
+    @Override
+    protected Parser getParser() {
+       return new OOXMLParser();
+    }
+}
--- a/source/java/org/alfresco/repo/content/transform/PoiOOXMLContentTransformerTest.java
+++ b/source/java/org/alfresco/repo/content/transform/PoiOOXMLContentTransformerTest.java
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.transform;
+
+import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.service.cmr.repository.TransformationOptions;
+
+/**
+ * Tests for the OOXML content transformer.
+ * @see org.alfresco.repo.content.transform.PoiOOXMLContentTransformer
+ * @author Nick Burch
+ */
+public class PoiOOXMLContentTransformerTest extends AbstractContentTransformerTest
+{
+    private ContentTransformer transformer;
+    /** Runs the parent set up, then creates the transformer under test. */
+    @Override
+    public void setUp() throws Exception
+    {
+        super.setUp();
+        
+        transformer = new PoiOOXMLContentTransformer();
+    }
+    
+    /**
+     * @return Returns the same transformer regardless of the mimetypes given - it is allowed
+     */
+    protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
+    {
+        return transformer;
+    }
+    /** Each OOXML type must convert to plain text, html and xml, but plain text must not convert to it. */
+    public void testIsTransformable() throws Exception
+    {
+        assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
+        // Same checks for the presentation type
+        assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OPENXML_PRESENTATION, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
+        // Same checks for the spreadsheet type
+        assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
+    }
+}
--- a/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformerTest.java
+++ b/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformerTest.java
@@ -52,6 +52,13 @@ public class TextMiningContentTransformerTest extends AbstractContentTransformer
        return transformer;
    }
    
+    @Override
+    protected String[] getQuickFilenames(String sourceMimetype) { // sourceMimetype is not consulted
+      return new String[] {
+            "quick.doc", "quick95.doc", "quick6.doc" // presumably the current, Word 95 and Word 6 files
+      };
+    }
+    
    public void testIsTransformable() throws Exception
    {
        assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_WORD, new TransformationOptions()));