Tika content transformer support for OOXML office

Enable explicit Tika content transform for OOXML files
Allow the Excel transformer (which does CSV as well as text/html) to handle .xlsx as well as .xls
Also update the .doc parser test to ensure that the older Word 6 and Word 95 files are correctly handled too


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20781 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-06-23 15:51:03 +00:00
parent 228d111c56
commit 325f8e7923
10 changed files with 393 additions and 121 deletions

View File

@@ -377,6 +377,11 @@
class="org.alfresco.repo.content.transform.PoiContentTransformer"
parent="baseContentTransformer" />
<!-- Tika/POI based transformer for the newer OOXML office formats, such as .xlsx and .docx -->
<bean id="transformer.OOXML"
class="org.alfresco.repo.content.transform.PoiOOXMLContentTransformer"
parent="baseContentTransformer" />
<bean id="transformer.TextMining"
class="org.alfresco.repo.content.transform.TextMiningContentTransformer"
parent="baseContentTransformer" >

View File

@@ -39,6 +39,7 @@ import org.alfresco.repo.content.transform.OpenOfficeContentTransformerTest;
import org.alfresco.repo.content.transform.PdfBoxContentTransformerTest;
import org.alfresco.repo.content.transform.PoiContentTransformerTest;
import org.alfresco.repo.content.transform.PoiHssfContentTransformerTest;
import org.alfresco.repo.content.transform.PoiOOXMLContentTransformerTest;
import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTest;
import org.alfresco.repo.content.transform.StringExtractingContentTransformerTest;
import org.alfresco.repo.content.transform.TextMiningContentTransformerTest;
@@ -107,6 +108,7 @@ public class ContentMinimalContextTestSuite extends TestSuite
suite.addTestSuite(PdfBoxContentTransformerTest.class);
suite.addTestSuite(PoiContentTransformerTest.class);
suite.addTestSuite(PoiHssfContentTransformerTest.class);
suite.addTestSuite(PoiOOXMLContentTransformerTest.class);
suite.addTestSuite(RuntimeExecutableContentTransformerTest.class);
suite.addTestSuite(StringExtractingContentTransformerTest.class);
suite.addTestSuite(TextMiningContentTransformerTest.class);

View File

@@ -0,0 +1,95 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.util.HashSet;
import java.util.Set;
import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.util.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * <a href="http://tika.apache.org/">Apache Tika</a> assumes that
 * you either know exactly what your content is, or that
 * you'll leave it to auto-detection.
 * Within Alfresco, we usually do know. However, from time
 * to time, we don't know if we have one of the old or one
 * of the new office files (eg .xls and .xlsx).
 * This class automatically selects the appropriate
 * old (OLE2) or new (OOXML) Tika parser as required, by
 * comparing the first four bytes of the stream with
 * {@link POIFSConstants#OOXML_FILE_HEADER}.
 *
 * @author Nick Burch
 */
public class TikaOfficeDetectParser implements Parser {
    /** Number of leading bytes inspected to tell the two formats apart. */
    private static final int HEADER_LENGTH = 4;

    private final Parser ole2Parser = new OfficeParser();
    private final Parser ooxmlParser = new OOXMLParser();

    /**
     * @param parseContext passed through unchanged to both delegate parsers
     * @return the union of the media types supported by the OLE2 parser
     *         and by the OOXML parser
     */
    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        Set<MediaType> types = new HashSet<MediaType>();
        types.addAll(ole2Parser.getSupportedTypes(parseContext));
        types.addAll(ooxmlParser.getSupportedTypes(parseContext));
        return types;
    }

    /**
     * Peeks at the start of the stream and hands the (rewound) stream
     * to the OOXML parser if it begins with the OOXML file header, or
     * to the OLE2 parser otherwise.
     *
     * @param stream the document to parse; it is wrapped, not closed, here
     * @param handler receives the SAX events from the chosen parser
     * @param metadata passed through unchanged to the chosen parser
     * @param parseContext passed through unchanged to the chosen parser
     * @throws IOException if the stream cannot be read
     * @throws SAXException if raised by the chosen parser
     * @throws TikaException if raised by the chosen parser
     */
    public void parse(InputStream stream,
            ContentHandler handler, Metadata metadata,
            ParseContext parseContext) throws IOException, SAXException,
            TikaException
    {
        PushbackInputStream inp = new PushbackInputStream(stream, HEADER_LENGTH);
        byte[] initial = new byte[HEADER_LENGTH];

        // readFully may return fewer bytes than requested (or -1) when the
        // stream is shorter than the header. Only push back what was really
        // read, so the delegate parser never sees zero padding that was not
        // part of the original content.
        int bytesRead = IOUtils.readFully(inp, initial);
        if (bytesRead > 0)
        {
            inp.unread(initial, 0, bytesRead);
        }

        // Which is it?
        if (bytesRead == HEADER_LENGTH && hasOoxmlHeader(initial))
        {
            ooxmlParser.parse(inp, handler, metadata, parseContext);
        }
        else
        {
            ole2Parser.parse(inp, handler, metadata, parseContext);
        }
    }

    /**
     * @deprecated This method will be removed in Apache Tika 1.0.
     */
    public void parse(InputStream stream,
            ContentHandler handler, Metadata metadata)
            throws IOException, SAXException, TikaException
    {
        parse(stream, handler, metadata, new ParseContext());
    }

    /**
     * Checks whether the first {@link #HEADER_LENGTH} bytes of the given
     * buffer equal the corresponding bytes of the OOXML file header.
     */
    private static boolean hasOoxmlHeader(byte[] initial)
    {
        for (int i = 0; i < HEADER_LENGTH; i++)
        {
            if (initial[i] != POIFSConstants.OOXML_FILE_HEADER[i])
            {
                return false;
            }
        }
        return true;
    }
}

View File

@@ -111,14 +111,14 @@ public abstract class AbstractContentTransformerTest extends TestCase
* Helper method to load one of the "The quick brown fox" files from the
* classpath.
*
* @param extension the extension of the file required, e.g. <b>txt</b>
* @param quickname the file required, eg <b>quick.txt</b>
* @return Returns a test resource loaded from the classpath or <tt>null</tt> if
* no resource could be found.
* @throws IOException
*/
public static File loadQuickTestFile(String extension) throws IOException
public static File loadNamedQuickTestFile(String quickname) throws IOException
{
URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/quick." + extension);
URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + quickname);
if (url == null)
{
return null;
@@ -130,6 +130,34 @@ public abstract class AbstractContentTransformerTest extends TestCase
}
return file;
}
/**
 * Convenience wrapper that loads the "The quick brown fox" test file
 * named <b>quick.&lt;extension&gt;</b> from the classpath.
 *
 * @param extension the file extension required, eg <b>txt</b> for the file quick.txt
 * @return Returns a test resource loaded from the classpath or <tt>null</tt> if
 *         no resource could be found.
 * @throws IOException
 */
public static File loadQuickTestFile(String extension) throws IOException
{
    String quickname = "quick." + extension;
    return loadNamedQuickTestFile(quickname);
}
/**
 * Returns the names of the quick* files to be tested for the
 * given source mime type.
 * The default is a single file: "quick." followed by the
 * extension the mimetype service reports for that mime type.
 * Subclasses may override this when they need special
 * rules, eg quickOld.foo, quickMid.foo and quickNew.foo
 * for differing versions of the file format.
 *
 * @param sourceMimetype the mime type whose test files are wanted
 * @return the file names to test
 */
protected String[] getQuickFilenames(String sourceMimetype) {
    String[] filenames = new String[1];
    filenames[0] = "quick." + mimetypeService.getExtension(sourceMimetype);
    return filenames;
}
/**
* Tests the full range of transformations available on the
@@ -160,9 +188,12 @@ public abstract class AbstractContentTransformerTest extends TestCase
for (String sourceMimetype : mimetypes)
{
// attempt to get a source file for each mimetype
String sourceExtension = mimetypeService.getExtension(sourceMimetype);
String[] quickFiles = getQuickFilenames(sourceMimetype);
sb.append(" Source Files: ").append(quickFiles).append("\n");
sb.append(" Source Extension: ").append(sourceExtension).append("\n");
for (String quickFile : quickFiles)
{
String sourceExtension = quickFile.substring(quickFile.lastIndexOf('.')+1);
// attempt to convert to every other mimetype
for (String targetMimetype : mimetypes)
@@ -194,7 +225,7 @@ public abstract class AbstractContentTransformerTest extends TestCase
sb.append(" <").append(transformer.getClass().getSimpleName()).append(">");
// is there a test file for this conversion?
File sourceFile = AbstractContentTransformerTest.loadQuickTestFile(sourceExtension);
File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile(quickFile);
if (sourceFile == null)
{
sb.append(" <no source test file>\n");
@@ -276,6 +307,7 @@ public abstract class AbstractContentTransformerTest extends TestCase
}
}
}
}
// dump to file
File outputFile = TempFileProvider.createTempFile("AbstractContentTransformerTest-results-", ".txt");

View File

@@ -46,7 +46,7 @@ public class PoiContentTransformer extends TikaPoweredContentTransformer
public static ArrayList<String> SUPPORTED_MIMETYPES;
static {
SUPPORTED_MIMETYPES = new ArrayList<String>();
OfficeParser p = new OfficeParser();
Parser p = new OfficeParser();
for(MediaType mt : p.getSupportedTypes(null)) {
if(mt.toString().equals(MimetypeMap.MIMETYPE_EXCEL))
{

View File

@@ -24,11 +24,11 @@ import java.util.regex.Pattern;
import javax.xml.transform.TransformerConfigurationException;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.TikaOfficeDetectParser;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
@@ -56,14 +56,15 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
public PoiHssfContentTransformer()
{
super(new String[] {
MimetypeMap.MIMETYPE_EXCEL
MimetypeMap.MIMETYPE_EXCEL,
MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET
});
}
@Override
protected Parser getParser()
{
return new OfficeParser();
return new TikaOfficeDetectParser();
}
/**

View File

@@ -46,6 +46,13 @@ public class PoiHssfContentTransformerTest extends TikaPoweredContentTransformer
transformer = new PoiHssfContentTransformer();
}
/**
 * Always tests both the old (.xls) and new (.xlsx) quick files,
 * whatever source mime type is asked for.
 */
@Override
protected String[] getQuickFilenames(String sourceMimetype) {
    String[] excelFiles = { "quick.xls", "quick.xlsx" };
    return excelFiles;
}
/**
* @return Returns the same transformer regardless - it is allowed
*/

View File

@@ -0,0 +1,57 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.transform;
import java.util.ArrayList;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
/**
 * Uses <a href="http://tika.apache.org/">Apache Tika</a> and
 * <a href="http://poi.apache.org/">Apache POI</a> to perform
 * conversions from the newer OOXML Office documents.
 *
 * @author Nick Burch
 */
public class PoiOOXMLContentTransformer extends TikaPoweredContentTransformer
{
    /**
     * Every mimetype that the Tika OOXML parser reports
     * as supported, in string form.
     */
    public static ArrayList<String> SUPPORTED_MIMETYPES;
    static {
        ArrayList<String> mimetypes = new ArrayList<String>();
        Parser ooxml = new OOXMLParser();
        for (MediaType type : ooxml.getSupportedTypes(null)) {
            mimetypes.add(type.toString());
        }
        SUPPORTED_MIMETYPES = mimetypes;
    }

    /**
     * Creates a transformer that accepts all of {@link #SUPPORTED_MIMETYPES}
     * as source mimetypes.
     */
    public PoiOOXMLContentTransformer() {
        super(SUPPORTED_MIMETYPES);
    }

    /**
     * @return a new OOXML parser instance on every call
     */
    @Override
    protected Parser getParser() {
        return new OOXMLParser();
    }
}

View File

@@ -0,0 +1,66 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.transform;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.TransformationOptions;
/**
 * Tests the OOXML (eg .docx, .pptx, .xlsx) content transformer.
 *
 * @see org.alfresco.repo.content.transform.PoiOOXMLContentTransformer
 *
 * @author Nick Burch
 */
public class PoiOOXMLContentTransformerTest extends AbstractContentTransformerTest
{
    private ContentTransformer transformer;

    @Override
    public void setUp() throws Exception
    {
        super.setUp();
        this.transformer = new PoiOOXMLContentTransformer();
    }

    /**
     * @return Returns the same transformer regardless - it is allowed
     */
    protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
    {
        return this.transformer;
    }

    /**
     * For each of the three OOXML mimetypes, checks that plain text cannot
     * be transformed into it, and that it can be transformed to plain
     * text, HTML and XML.
     */
    public void testIsTransformable() throws Exception
    {
        String[] ooxmlMimetypes = {
            MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING,
            MimetypeMap.MIMETYPE_OPENXML_PRESENTATION,
            MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET
        };
        for (String ooxml : ooxmlMimetypes)
        {
            assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, ooxml, new TransformationOptions()));
            assertTrue(transformer.isTransformable(ooxml, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
            assertTrue(transformer.isTransformable(ooxml, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
            assertTrue(transformer.isTransformable(ooxml, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
        }
    }
}

View File

@@ -52,6 +52,13 @@ public class TextMiningContentTransformerTest extends AbstractContentTransformer
return transformer;
}
/**
 * Always tests the current, Word 95 and Word 6 quick .doc files,
 * whatever source mime type is asked for.
 */
@Override
protected String[] getQuickFilenames(String sourceMimetype) {
    String[] wordFiles = { "quick.doc", "quick95.doc", "quick6.doc" };
    return wordFiles;
}
public void testIsTransformable() throws Exception
{
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_WORD, new TransformationOptions()));