Tika content transformer support for OOXML office

Enable explicit Tika content transform for OOXML files. Allow the Excel transformer (which does CSV as well as text/html) to handle .xlsx as well as .xls. Also update the .doc parser test to ensure that the older Word 6 and Word 95 files are correctly handled too. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20781 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
2025-07-31 17:39:05 +00:00 · 2010-06-23 15:51:03 +00:00
parent 228d111c56
commit 325f8e7923
10 changed files with 393 additions and 121 deletions
--- a/config/alfresco/content-services-context.xml
+++ b/config/alfresco/content-services-context.xml
@@ -377,6 +377,11 @@
         class="org.alfresco.repo.content.transform.PoiContentTransformer"
         parent="baseContentTransformer" />

+   <!-- This one handles the newer ooxml office formats, such as .xlsx and .docx -->
+   <bean id="transformer.OOXML"
+         class="org.alfresco.repo.content.transform.PoiOOXMLContentTransformer"
+         parent="baseContentTransformer" />
+
   <bean id="transformer.TextMining"
         class="org.alfresco.repo.content.transform.TextMiningContentTransformer"
         parent="baseContentTransformer" >
--- a/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java
+++ b/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java
@@ -39,6 +39,7 @@ import org.alfresco.repo.content.transform.OpenOfficeContentTransformerTest;
 import org.alfresco.repo.content.transform.PdfBoxContentTransformerTest;
 import org.alfresco.repo.content.transform.PoiContentTransformerTest;
 import org.alfresco.repo.content.transform.PoiHssfContentTransformerTest;
+import org.alfresco.repo.content.transform.PoiOOXMLContentTransformerTest;
 import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTest;
 import org.alfresco.repo.content.transform.StringExtractingContentTransformerTest;
 import org.alfresco.repo.content.transform.TextMiningContentTransformerTest;
@@ -107,6 +108,7 @@ public class ContentMinimalContextTestSuite extends TestSuite
       suite.addTestSuite(PdfBoxContentTransformerTest.class);
       suite.addTestSuite(PoiContentTransformerTest.class);
       suite.addTestSuite(PoiHssfContentTransformerTest.class);
+       suite.addTestSuite(PoiOOXMLContentTransformerTest.class);
       suite.addTestSuite(RuntimeExecutableContentTransformerTest.class);
       suite.addTestSuite(StringExtractingContentTransformerTest.class);
       suite.addTestSuite(TextMiningContentTransformerTest.class);
--- a/source/java/org/alfresco/repo/content/TikaOfficeDetectParser.java
+++ b/source/java/org/alfresco/repo/content/TikaOfficeDetectParser.java
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * <a href="http://tika.apache.org/">Apache Tika</a> assumes that
+ *  you either know exactly what your content is, or that
+ *  you'll leave it to auto-detection.
+ * Within Alfresco, we usually do know. However, from time
+ *  to time, we don't know if we have one of the old or one
+ *  of the new office files (eg .xls and .xlsx).
+ * This class automatically selects the appropriate
+ *  old (OLE2) or new (OOXML) Tika parser as required, by
+ *  peeking at the first four bytes of the stream.
+ *
+ * @author Nick Burch
+ */
+public class TikaOfficeDetectParser implements Parser {
+   /** How many leading bytes are inspected to tell OOXML from OLE2 */
+   private static final int HEADER_PEEK_SIZE = 4;
+
+   private final Parser ole2Parser = new OfficeParser();
+   private final Parser ooxmlParser = new OOXMLParser();
+
+   /**
+    * @return a new set holding the types supported by the OLE2
+    *  parser together with those supported by the OOXML parser
+    */
+   public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
+      Set<MediaType> types = new HashSet<MediaType>();
+      types.addAll(ole2Parser.getSupportedTypes(parseContext));
+      types.addAll(ooxmlParser.getSupportedTypes(parseContext));
+      return types;
+   }
+
+   /**
+    * Peeks at the start of the stream, pushes the peeked bytes
+    *  back again, and then delegates to the OOXML parser if the
+    *  stream starts with the OOXML file header, or to the OLE2
+    *  parser otherwise.
+    * Streams too short to hold the full header are passed,
+    *  unaltered, to the OLE2 parser.
+    */
+   public void parse(InputStream stream,
+         ContentHandler handler, Metadata metadata,
+         ParseContext parseContext) throws IOException, SAXException,
+         TikaException 
+   {
+      PushbackInputStream inp = new PushbackInputStream(stream, HEADER_PEEK_SIZE);
+      byte[] initial = new byte[HEADER_PEEK_SIZE];
+      int read = IOUtils.readFully(inp, initial);
+
+      // Only push back what was really read. On a short stream the
+      //  rest of the buffer is padding that was never in the stream,
+      //  and on an empty stream (read is -1) there is nothing to undo.
+      if(read > 0)
+      {
+         inp.unread(initial, 0, read);
+      }
+
+      // Which is it?
+      if(startsWithOOXMLHeader(initial, read))
+      {
+         ooxmlParser.parse(inp, handler, metadata, parseContext);
+      }
+      else
+      {
+         ole2Parser.parse(inp, handler, metadata, parseContext);
+      }
+   }
+
+   /**
+    * Checks if a full header was read and its bytes match the
+    *  start of {@link POIFSConstants#OOXML_FILE_HEADER}.
+    *
+    * @param buffer the bytes peeked from the start of the stream
+    * @param length how many bytes of the buffer were really read
+    */
+   private static boolean startsWithOOXMLHeader(byte[] buffer, int length)
+   {
+      if(length < HEADER_PEEK_SIZE)
+      {
+         return false;
+      }
+      for(int i=0; i<HEADER_PEEK_SIZE; i++)
+      {
+         if(buffer[i] != POIFSConstants.OOXML_FILE_HEADER[i])
+         {
+            return false;
+         }
+      }
+      return true;
+   }
+
+   /**
+    * @deprecated This method will be removed in Apache Tika 1.0.
+    */
+   public void parse(InputStream stream,
+         ContentHandler handler, Metadata metadata)
+         throws IOException, SAXException, TikaException 
+   {
+      parse(stream, handler, metadata, new ParseContext());
+   }
+}
--- a/source/java/org/alfresco/repo/content/transform/AbstractContentTransformerTest.java
+++ b/source/java/org/alfresco/repo/content/transform/AbstractContentTransformerTest.java
@@ -111,14 +111,14 @@ public abstract class AbstractContentTransformerTest extends TestCase
     * Helper method to load one of the "The quick brown fox" files from the
     * classpath.
     * 
-     * @param extension the extension of the file required, e.g. <b>txt</b>
+     * @param quickname the file required, eg <b>quick.txt</b>
     * @return Returns a test resource loaded from the classpath or <tt>null</tt> if
     *      no resource could be found.
     * @throws IOException
     */
-    public static File loadQuickTestFile(String extension) throws IOException
+    public static File loadNamedQuickTestFile(String quickname) throws IOException
    {
-        URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/quick." + extension);
+        URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + quickname);
        if (url == null)
        {
            return null;
@@ -130,6 +130,34 @@ public abstract class AbstractContentTransformerTest extends TestCase
        }
        return file;
    }
+    /**
+     * Convenience form of {@link #loadNamedQuickTestFile(String)} for the
+     * standard "The quick brown fox" file of a given extension, i.e. the
+     * file named <b>quick.</b> followed by that extension.
+     * 
+     * @param extension the file extension required, eg <b>txt</b> for the file quick.txt
+     * @return Returns a test resource loaded from the classpath or <tt>null</tt> if
+     *      no resource could be found.
+     * @throws IOException
+     */
+    public static File loadQuickTestFile(String extension) throws IOException
+    {
+       String quickname = "quick." + extension;
+       return loadNamedQuickTestFile(quickname);
+    }
+    
+    /**
+     * Supplies the names of the quick* files that should be used
+     *  as test sources for the given mime type.
+     * This default implementation returns a single name, made up of
+     *  "quick." plus the extension the mimetype service gives
+     *  for the mime type.
+     * Override it if special rules are needed, eg quickOld.foo,
+     *  quickMid.foo and quickNew.foo for differing versions
+     *  of the file format.
+     *
+     * @param sourceMimetype the mime type to find test files for
+     * @return the names of the quick* files to be tested
+     */
+    protected String[] getQuickFilenames(String sourceMimetype) {
+       String defaultQuickFile = "quick." + mimetypeService.getExtension(sourceMimetype);
+       return new String[] { defaultQuickFile };
+    }

    /**
     * Tests the full range of transformations available on the
@@ -160,9 +188,12 @@ public abstract class AbstractContentTransformerTest extends TestCase
        for (String sourceMimetype : mimetypes)
        {
            // attempt to get a source file for each mimetype
-            String sourceExtension = mimetypeService.getExtension(sourceMimetype);
+            String[] quickFiles = getQuickFilenames(sourceMimetype);
+            // Use Arrays.toString so the report lists the file names; appending
+            //  the array itself would only record its type and identity hash
+            sb.append("   Source Files: ").append(java.util.Arrays.toString(quickFiles)).append("\n");

-            sb.append("   Source Extension: ").append(sourceExtension).append("\n");
+            for (String quickFile : quickFiles)
+            {
+               String sourceExtension = quickFile.substring(quickFile.lastIndexOf('.')+1);
               
               // attempt to convert to every other mimetype
               for (String targetMimetype : mimetypes)
@@ -194,7 +225,7 @@ public abstract class AbstractContentTransformerTest extends TestCase
                   sb.append(" <").append(transformer.getClass().getSimpleName()).append(">");
   
                   // is there a test file for this conversion?
-                File sourceFile = AbstractContentTransformerTest.loadQuickTestFile(sourceExtension);
+                   File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile(quickFile);
                   if (sourceFile == null)
                   {
                       sb.append(" <no source test file>\n");
@@ -276,6 +307,7 @@ public abstract class AbstractContentTransformerTest extends TestCase
                   }
               }
            }
+        }
        
        // dump to file
        File outputFile = TempFileProvider.createTempFile("AbstractContentTransformerTest-results-", ".txt");
--- a/source/java/org/alfresco/repo/content/transform/PoiContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/PoiContentTransformer.java
@@ -46,7 +46,7 @@ public class PoiContentTransformer extends TikaPoweredContentTransformer
   public static ArrayList<String> SUPPORTED_MIMETYPES;
   static {
      SUPPORTED_MIMETYPES = new ArrayList<String>();
-      OfficeParser p = new OfficeParser();
+      Parser p = new OfficeParser();
      for(MediaType mt : p.getSupportedTypes(null)) {
         if(mt.toString().equals(MimetypeMap.MIMETYPE_EXCEL))
         {
--- a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java
@@ -24,11 +24,11 @@ import java.util.regex.Pattern;
 import javax.xml.transform.TransformerConfigurationException;

 import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.repo.content.TikaOfficeDetectParser;
 import org.alfresco.service.cmr.repository.TransformationOptions;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.microsoft.OfficeParser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
@@ -56,14 +56,15 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
    public PoiHssfContentTransformer() 
    {
       super(new String[] {
-             MimetypeMap.MIMETYPE_EXCEL
+             MimetypeMap.MIMETYPE_EXCEL,
+             MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET
       });
    }
    
    @Override
    protected Parser getParser() 
    {
-       return new OfficeParser();
+       return new TikaOfficeDetectParser();
    }
    
    /**
--- a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java
+++ b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java
@@ -46,6 +46,13 @@ public class PoiHssfContentTransformerTest extends TikaPoweredContentTransformer
        transformer = new PoiHssfContentTransformer();
    }
    
+    /**
+     * Always tests with both the older .xls and the newer .xlsx
+     *  quick files, whatever source mimetype is asked for.
+     */
+    @Override
+    protected String[] getQuickFilenames(String sourceMimetype) {
+      String[] excelQuickFiles = { "quick.xls", "quick.xlsx" };
+      return excelQuickFiles;
+    }
+
   /**
     * @return Returns the same transformer regardless - it is allowed
     */
--- a/source/java/org/alfresco/repo/content/transform/PoiOOXMLContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/PoiOOXMLContentTransformer.java
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.transform;
+
+import java.util.ArrayList;
+
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
+
+/**
+ * Uses <a href="http://tika.apache.org/">Apache Tika</a> and
+ *  <a href="http://poi.apache.org/">Apache POI</a> to perform
+ *  conversions from the newer OOXML Office documents.
+ *
+ * @author Nick Burch
+ */
+public class PoiOOXMLContentTransformer extends TikaPoweredContentTransformer
+{
+   /** 
+    * Every mimetype that the Tika OOXML parser reports
+    *  as supported, in its string form
+    */
+   public static ArrayList<String> SUPPORTED_MIMETYPES;
+   static {
+      ArrayList<String> ooxmlMimetypes = new ArrayList<String>();
+      Parser ooxmlParser = new OOXMLParser();
+      for(MediaType mediaType : ooxmlParser.getSupportedTypes(null)) {
+         ooxmlMimetypes.add( mediaType.toString() );
+      }
+      SUPPORTED_MIMETYPES = ooxmlMimetypes;
+   }
+
+    /**
+     * Registers the transformer for all of {@link #SUPPORTED_MIMETYPES}
+     */
+    public PoiOOXMLContentTransformer() {
+       super(SUPPORTED_MIMETYPES);
+    }
+
+    /**
+     * @return a new Tika OOXML parser on every call
+     */
+    @Override
+    protected Parser getParser() {
+       return new OOXMLParser();
+    }
+}
--- a/source/java/org/alfresco/repo/content/transform/PoiOOXMLContentTransformerTest.java
+++ b/source/java/org/alfresco/repo/content/transform/PoiOOXMLContentTransformerTest.java
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.transform;
+
+import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.service.cmr.repository.TransformationOptions;
+
+/**
+ * @see org.alfresco.repo.content.transform.PoiOOXMLContentTransformer
+ * 
+ * @author Nick Burch
+ */
+public class PoiOOXMLContentTransformerTest extends AbstractContentTransformerTest
+{
+    private ContentTransformer transformer;
+    
+    @Override
+    public void setUp() throws Exception
+    {
+        super.setUp();
+        
+        transformer = new PoiOOXMLContentTransformer();
+    }
+    
+    /**
+     * @return Returns the same transformer regardless - it is allowed
+     */
+    protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
+    {
+        return transformer;
+    }
+
+    public void testIsTransformable() throws Exception
+    {
+        assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
+        
+        assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OPENXML_PRESENTATION, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
+        
+        assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
+    }
+}
--- a/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformerTest.java
+++ b/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformerTest.java
@@ -52,6 +52,13 @@ public class TextMiningContentTransformerTest extends AbstractContentTransformer
        return transformer;
    }
    
+    /**
+     * Always tests with quick.doc plus the older Word 95 and
+     *  Word 6 quick files, whatever source mimetype is asked for.
+     */
+    @Override
+    protected String[] getQuickFilenames(String sourceMimetype) {
+      String[] wordQuickFiles = { "quick.doc", "quick95.doc", "quick6.doc" };
+      return wordQuickFiles;
+    }
+    
    public void testIsTransformable() throws Exception
    {
        assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_WORD, new TransformationOptions()));