mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Tika content transformer support for OOXML office
Enable explicit Tika content transform for OOXML files. Allow the Excel transformer (which does CSV as well as text/html) to handle .xlsx as well as .xls. Also update the .doc parser test to ensure that the older Word 6 and Word 95 files are correctly handled too. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20781 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -377,6 +377,11 @@
|
|||||||
class="org.alfresco.repo.content.transform.PoiContentTransformer"
|
class="org.alfresco.repo.content.transform.PoiContentTransformer"
|
||||||
parent="baseContentTransformer" />
|
parent="baseContentTransformer" />
|
||||||
|
|
||||||
|
<!-- This one handles the newer OOXML office formats, such as .xlsx and .docx -->
|
||||||
|
<bean id="transformer.OOXML"
|
||||||
|
class="org.alfresco.repo.content.transform.PoiOOXMLContentTransformer"
|
||||||
|
parent="baseContentTransformer" />
|
||||||
|
|
||||||
<bean id="transformer.TextMining"
|
<bean id="transformer.TextMining"
|
||||||
class="org.alfresco.repo.content.transform.TextMiningContentTransformer"
|
class="org.alfresco.repo.content.transform.TextMiningContentTransformer"
|
||||||
parent="baseContentTransformer" >
|
parent="baseContentTransformer" >
|
||||||
|
@@ -39,6 +39,7 @@ import org.alfresco.repo.content.transform.OpenOfficeContentTransformerTest;
|
|||||||
import org.alfresco.repo.content.transform.PdfBoxContentTransformerTest;
|
import org.alfresco.repo.content.transform.PdfBoxContentTransformerTest;
|
||||||
import org.alfresco.repo.content.transform.PoiContentTransformerTest;
|
import org.alfresco.repo.content.transform.PoiContentTransformerTest;
|
||||||
import org.alfresco.repo.content.transform.PoiHssfContentTransformerTest;
|
import org.alfresco.repo.content.transform.PoiHssfContentTransformerTest;
|
||||||
|
import org.alfresco.repo.content.transform.PoiOOXMLContentTransformerTest;
|
||||||
import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTest;
|
import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTest;
|
||||||
import org.alfresco.repo.content.transform.StringExtractingContentTransformerTest;
|
import org.alfresco.repo.content.transform.StringExtractingContentTransformerTest;
|
||||||
import org.alfresco.repo.content.transform.TextMiningContentTransformerTest;
|
import org.alfresco.repo.content.transform.TextMiningContentTransformerTest;
|
||||||
@@ -107,6 +108,7 @@ public class ContentMinimalContextTestSuite extends TestSuite
|
|||||||
suite.addTestSuite(PdfBoxContentTransformerTest.class);
|
suite.addTestSuite(PdfBoxContentTransformerTest.class);
|
||||||
suite.addTestSuite(PoiContentTransformerTest.class);
|
suite.addTestSuite(PoiContentTransformerTest.class);
|
||||||
suite.addTestSuite(PoiHssfContentTransformerTest.class);
|
suite.addTestSuite(PoiHssfContentTransformerTest.class);
|
||||||
|
suite.addTestSuite(PoiOOXMLContentTransformerTest.class);
|
||||||
suite.addTestSuite(RuntimeExecutableContentTransformerTest.class);
|
suite.addTestSuite(RuntimeExecutableContentTransformerTest.class);
|
||||||
suite.addTestSuite(StringExtractingContentTransformerTest.class);
|
suite.addTestSuite(StringExtractingContentTransformerTest.class);
|
||||||
suite.addTestSuite(TextMiningContentTransformerTest.class);
|
suite.addTestSuite(TextMiningContentTransformerTest.class);
|
||||||
|
@@ -0,0 +1,95 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||||
|
*
|
||||||
|
* This file is part of Alfresco
|
||||||
|
*
|
||||||
|
* Alfresco is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Alfresco is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
package org.alfresco.repo.content;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.PushbackInputStream;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.poi.poifs.common.POIFSConstants;
|
||||||
|
import org.apache.poi.util.IOUtils;
|
||||||
|
import org.apache.tika.exception.TikaException;
|
||||||
|
import org.apache.tika.metadata.Metadata;
|
||||||
|
import org.apache.tika.mime.MediaType;
|
||||||
|
import org.apache.tika.parser.ParseContext;
|
||||||
|
import org.apache.tika.parser.Parser;
|
||||||
|
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||||
|
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||||
|
import org.xml.sax.ContentHandler;
|
||||||
|
import org.xml.sax.SAXException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link http://tika.apache.org/ Apache Tika} assumes that
|
||||||
|
* you either know exactly what your content is, or that
|
||||||
|
* you'll leave it to auto-detection.
|
||||||
|
* Within Alfresco, we usually do know. However, from time
|
||||||
|
* to time, we don't know if we have one of the old or one
|
||||||
|
* of the new office files (eg .xls and .xlsx).
|
||||||
|
* This class allows automatically selects the appropriate
|
||||||
|
* old (OLE2) or new (OOXML) Tika parser as required.
|
||||||
|
*
|
||||||
|
* @author Nick Burch
|
||||||
|
*/
|
||||||
|
public class TikaOfficeDetectParser implements Parser {
|
||||||
|
private Parser ole2Parser = new OfficeParser();
|
||||||
|
private Parser ooxmlParser = new OOXMLParser();
|
||||||
|
|
||||||
|
public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
|
||||||
|
Set<MediaType> types = new HashSet<MediaType>();
|
||||||
|
types.addAll(ole2Parser.getSupportedTypes(parseContext));
|
||||||
|
types.addAll(ooxmlParser.getSupportedTypes(parseContext));
|
||||||
|
return types;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void parse(InputStream stream,
|
||||||
|
ContentHandler handler, Metadata metadata,
|
||||||
|
ParseContext parseContext) throws IOException, SAXException,
|
||||||
|
TikaException
|
||||||
|
{
|
||||||
|
PushbackInputStream inp = new PushbackInputStream(stream, 4);
|
||||||
|
byte[] initial4 = new byte[4];
|
||||||
|
IOUtils.readFully(inp, initial4);
|
||||||
|
inp.unread(initial4);
|
||||||
|
|
||||||
|
// Which is it?
|
||||||
|
if(initial4[0] == POIFSConstants.OOXML_FILE_HEADER[0] &&
|
||||||
|
initial4[1] == POIFSConstants.OOXML_FILE_HEADER[1] &&
|
||||||
|
initial4[2] == POIFSConstants.OOXML_FILE_HEADER[2] &&
|
||||||
|
initial4[3] == POIFSConstants.OOXML_FILE_HEADER[3])
|
||||||
|
{
|
||||||
|
ooxmlParser.parse(inp, handler, metadata, parseContext);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ole2Parser.parse(inp, handler, metadata, parseContext);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @deprecated This method will be removed in Apache Tika 1.0.
|
||||||
|
*/
|
||||||
|
public void parse(InputStream stream,
|
||||||
|
ContentHandler handler, Metadata metadata)
|
||||||
|
throws IOException, SAXException, TikaException
|
||||||
|
{
|
||||||
|
parse(stream, handler, metadata, new ParseContext());
|
||||||
|
}
|
||||||
|
}
|
@@ -111,14 +111,14 @@ public abstract class AbstractContentTransformerTest extends TestCase
|
|||||||
* Helper method to load one of the "The quick brown fox" files from the
|
* Helper method to load one of the "The quick brown fox" files from the
|
||||||
* classpath.
|
* classpath.
|
||||||
*
|
*
|
||||||
* @param extension the extension of the file required, e.g. <b>txt</b>
|
* @param quickname the file required, e.g. <b>quick.txt</b>
|
||||||
* @return Returns a test resource loaded from the classpath or <tt>null</tt> if
|
* @return Returns a test resource loaded from the classpath or <tt>null</tt> if
|
||||||
* no resource could be found.
|
* no resource could be found.
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
public static File loadQuickTestFile(String extension) throws IOException
|
public static File loadNamedQuickTestFile(String quickname) throws IOException
|
||||||
{
|
{
|
||||||
URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/quick." + extension);
|
URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + quickname);
|
||||||
if (url == null)
|
if (url == null)
|
||||||
{
|
{
|
||||||
return null;
|
return null;
|
||||||
@@ -130,6 +130,34 @@ public abstract class AbstractContentTransformerTest extends TestCase
|
|||||||
}
|
}
|
||||||
return file;
|
return file;
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* Helper method to load one of the "The quick brown fox" files from the
|
||||||
|
* classpath.
|
||||||
|
*
|
||||||
|
* @param the file extension required, eg <b>txt</b> for the file quick.txt
|
||||||
|
* @return Returns a test resource loaded from the classpath or <tt>null</tt> if
|
||||||
|
* no resource could be found.
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
public static File loadQuickTestFile(String extension) throws IOException
|
||||||
|
{
|
||||||
|
return loadNamedQuickTestFile("quick."+extension);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For the given mime type, returns one or more quick*
|
||||||
|
* files to be tested.
|
||||||
|
* By default this is just quick + the default extension.
|
||||||
|
* However, you can override this if you need special
|
||||||
|
* rules, eg quickOld.foo, quickMid.foo and quickNew.foo
|
||||||
|
* for differing versions of the file format.
|
||||||
|
*/
|
||||||
|
protected String[] getQuickFilenames(String sourceMimetype) {
|
||||||
|
String sourceExtension = mimetypeService.getExtension(sourceMimetype);
|
||||||
|
return new String[] {
|
||||||
|
"quick." + sourceExtension
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tests the full range of transformations available on the
|
* Tests the full range of transformations available on the
|
||||||
@@ -160,120 +188,124 @@ public abstract class AbstractContentTransformerTest extends TestCase
|
|||||||
for (String sourceMimetype : mimetypes)
|
for (String sourceMimetype : mimetypes)
|
||||||
{
|
{
|
||||||
// attempt to get a source file for each mimetype
|
// attempt to get a source file for each mimetype
|
||||||
String sourceExtension = mimetypeService.getExtension(sourceMimetype);
|
String[] quickFiles = getQuickFilenames(sourceMimetype);
|
||||||
|
sb.append(" Source Files: ").append(quickFiles).append("\n");
|
||||||
|
|
||||||
sb.append(" Source Extension: ").append(sourceExtension).append("\n");
|
for (String quickFile : quickFiles)
|
||||||
|
|
||||||
// attempt to convert to every other mimetype
|
|
||||||
for (String targetMimetype : mimetypes)
|
|
||||||
{
|
{
|
||||||
if (sourceMimetype.equals(targetMimetype))
|
String sourceExtension = quickFile.substring(quickFile.lastIndexOf('.')+1);
|
||||||
{
|
|
||||||
// Don't test like-to-like transformations
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
ContentWriter targetWriter = null;
|
|
||||||
// construct a reader onto the source file
|
|
||||||
String targetExtension = mimetypeService.getExtension(targetMimetype);
|
|
||||||
|
|
||||||
// must we test the transformation?
|
// attempt to convert to every other mimetype
|
||||||
ContentTransformer transformer = getTransformer(sourceMimetype, targetMimetype);
|
for (String targetMimetype : mimetypes)
|
||||||
if (transformer == null || transformer.isTransformable(sourceMimetype, targetMimetype, null) == false)
|
{
|
||||||
{
|
if (sourceMimetype.equals(targetMimetype))
|
||||||
// no transformer
|
{
|
||||||
continue;
|
// Don't test like-to-like transformations
|
||||||
}
|
continue;
|
||||||
|
}
|
||||||
|
ContentWriter targetWriter = null;
|
||||||
|
// construct a reader onto the source file
|
||||||
|
String targetExtension = mimetypeService.getExtension(targetMimetype);
|
||||||
|
|
||||||
if (isTransformationExcluded(sourceExtension, targetExtension))
|
// must we test the transformation?
|
||||||
{
|
ContentTransformer transformer = getTransformer(sourceMimetype, targetMimetype);
|
||||||
continue;
|
if (transformer == null || transformer.isTransformable(sourceMimetype, targetMimetype, null) == false)
|
||||||
}
|
{
|
||||||
|
// no transformer
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// dump
|
if (isTransformationExcluded(sourceExtension, targetExtension))
|
||||||
sb.append(" Target Extension: ").append(targetExtension);
|
{
|
||||||
sb.append(" <").append(transformer.getClass().getSimpleName()).append(">");
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// is there a test file for this conversion?
|
// dump
|
||||||
File sourceFile = AbstractContentTransformerTest.loadQuickTestFile(sourceExtension);
|
sb.append(" Target Extension: ").append(targetExtension);
|
||||||
if (sourceFile == null)
|
sb.append(" <").append(transformer.getClass().getSimpleName()).append(">");
|
||||||
{
|
|
||||||
sb.append(" <no source test file>\n");
|
|
||||||
continue; // no test file available for that extension
|
|
||||||
}
|
|
||||||
ContentReader sourceReader = new FileContentReader(sourceFile);
|
|
||||||
|
|
||||||
// perform the transformation several times so that we get a good idea of performance
|
// is there a test file for this conversion?
|
||||||
int count = 0;
|
File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile(quickFile);
|
||||||
long before = System.currentTimeMillis();
|
if (sourceFile == null)
|
||||||
Set<String> transformerClasses = new HashSet<String>(2);
|
{
|
||||||
for (int i = 0; i < 5; i++)
|
sb.append(" <no source test file>\n");
|
||||||
{
|
continue; // no test file available for that extension
|
||||||
// get the transformer repeatedly as it might be different each time around
|
}
|
||||||
transformer = getTransformer(sourceMimetype, targetMimetype);
|
ContentReader sourceReader = new FileContentReader(sourceFile);
|
||||||
// must we report on this class?
|
|
||||||
if (!transformerClasses.contains(transformer.getClass().getName()))
|
|
||||||
{
|
|
||||||
transformerClasses.add(transformer.getClass().getName());
|
|
||||||
sb.append(" <").append(transformer.getClass().getSimpleName()).append(">");
|
|
||||||
}
|
|
||||||
|
|
||||||
// make a writer for the target file
|
// perform the transformation several times so that we get a good idea of performance
|
||||||
File targetFile = TempFileProvider.createTempFile(
|
int count = 0;
|
||||||
getClass().getSimpleName() + "_" + getName() + "_" + sourceExtension + "_",
|
long before = System.currentTimeMillis();
|
||||||
"." + targetExtension);
|
Set<String> transformerClasses = new HashSet<String>(2);
|
||||||
targetWriter = new FileContentWriter(targetFile);
|
for (int i = 0; i < 5; i++)
|
||||||
|
{
|
||||||
|
// get the transformer repeatedly as it might be different each time around
|
||||||
|
transformer = getTransformer(sourceMimetype, targetMimetype);
|
||||||
|
// must we report on this class?
|
||||||
|
if (!transformerClasses.contains(transformer.getClass().getName()))
|
||||||
|
{
|
||||||
|
transformerClasses.add(transformer.getClass().getName());
|
||||||
|
sb.append(" <").append(transformer.getClass().getSimpleName()).append(">");
|
||||||
|
}
|
||||||
|
|
||||||
// do the transformation
|
// make a writer for the target file
|
||||||
sourceReader.setMimetype(sourceMimetype);
|
File targetFile = TempFileProvider.createTempFile(
|
||||||
targetWriter.setMimetype(targetMimetype);
|
getClass().getSimpleName() + "_" + getName() + "_" + sourceExtension + "_",
|
||||||
transformer.transform(sourceReader.getReader(), targetWriter);
|
"." + targetExtension);
|
||||||
|
targetWriter = new FileContentWriter(targetFile);
|
||||||
|
|
||||||
// if the target format is any type of text, then it must contain the 'quick' phrase
|
// do the transformation
|
||||||
if (isQuickPhraseExpected(targetMimetype))
|
sourceReader.setMimetype(sourceMimetype);
|
||||||
{
|
targetWriter.setMimetype(targetMimetype);
|
||||||
ContentReader targetReader = targetWriter.getReader();
|
transformer.transform(sourceReader.getReader(), targetWriter);
|
||||||
String checkContent = targetReader.getContentString();
|
|
||||||
assertTrue("Quick phrase not present in document converted to text: \n" +
|
|
||||||
" transformer: " + transformer + "\n" +
|
|
||||||
" source: " + sourceReader + "\n" +
|
|
||||||
" target: " + targetWriter,
|
|
||||||
checkContent.contains(QUICK_CONTENT));
|
|
||||||
|
|
||||||
// Let subclasses do extra checks if they want
|
// if the target format is any type of text, then it must contain the 'quick' phrase
|
||||||
additionalContentCheck(sourceMimetype, targetMimetype, checkContent);
|
if (isQuickPhraseExpected(targetMimetype))
|
||||||
}
|
{
|
||||||
else if (isQuickWordsExpected(targetMimetype))
|
ContentReader targetReader = targetWriter.getReader();
|
||||||
{
|
String checkContent = targetReader.getContentString();
|
||||||
ContentReader targetReader = targetWriter.getReader();
|
assertTrue("Quick phrase not present in document converted to text: \n" +
|
||||||
String checkContent = targetReader.getContentString();
|
" transformer: " + transformer + "\n" +
|
||||||
// essentially check that FTS indexing can use the conversion properly
|
" source: " + sourceReader + "\n" +
|
||||||
for (int word = 0; word < QUICK_WORDS.length; word++)
|
" target: " + targetWriter,
|
||||||
{
|
checkContent.contains(QUICK_CONTENT));
|
||||||
assertTrue("Quick phrase word not present in document converted to text: \n" +
|
|
||||||
" transformer: " + transformer + "\n" +
|
|
||||||
" source: " + sourceReader + "\n" +
|
|
||||||
" target: " + targetWriter + "\n" +
|
|
||||||
" word: " + word,
|
|
||||||
checkContent.contains(QUICK_WORDS[word]));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// increment count
|
|
||||||
count++;
|
|
||||||
}
|
|
||||||
long after = System.currentTimeMillis();
|
|
||||||
double average = (double) (after - before) / (double) count;
|
|
||||||
|
|
||||||
// dump
|
// Let subclasses do extra checks if they want
|
||||||
sb.append(String.format(" average %10.0f ms", average)).append("\n");
|
additionalContentCheck(sourceMimetype, targetMimetype, checkContent);
|
||||||
|
}
|
||||||
|
else if (isQuickWordsExpected(targetMimetype))
|
||||||
|
{
|
||||||
|
ContentReader targetReader = targetWriter.getReader();
|
||||||
|
String checkContent = targetReader.getContentString();
|
||||||
|
// essentially check that FTS indexing can use the conversion properly
|
||||||
|
for (int word = 0; word < QUICK_WORDS.length; word++)
|
||||||
|
{
|
||||||
|
assertTrue("Quick phrase word not present in document converted to text: \n" +
|
||||||
|
" transformer: " + transformer + "\n" +
|
||||||
|
" source: " + sourceReader + "\n" +
|
||||||
|
" target: " + targetWriter + "\n" +
|
||||||
|
" word: " + word,
|
||||||
|
checkContent.contains(QUICK_WORDS[word]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// increment count
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
long after = System.currentTimeMillis();
|
||||||
|
double average = (double) (after - before) / (double) count;
|
||||||
|
|
||||||
if (logger.isDebugEnabled())
|
// dump
|
||||||
{
|
sb.append(String.format(" average %10.0f ms", average)).append("\n");
|
||||||
logger.debug("Transformation performed " + count + " time: " +
|
|
||||||
sourceMimetype + " --> " + targetMimetype + "\n" +
|
if (logger.isDebugEnabled())
|
||||||
" source: " + sourceReader + "\n" +
|
{
|
||||||
" target: " + targetWriter + "\n" +
|
logger.debug("Transformation performed " + count + " time: " +
|
||||||
" transformer: " + getTransformer(sourceMimetype, targetMimetype));
|
sourceMimetype + " --> " + targetMimetype + "\n" +
|
||||||
}
|
" source: " + sourceReader + "\n" +
|
||||||
|
" target: " + targetWriter + "\n" +
|
||||||
|
" transformer: " + getTransformer(sourceMimetype, targetMimetype));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -46,7 +46,7 @@ public class PoiContentTransformer extends TikaPoweredContentTransformer
|
|||||||
public static ArrayList<String> SUPPORTED_MIMETYPES;
|
public static ArrayList<String> SUPPORTED_MIMETYPES;
|
||||||
static {
|
static {
|
||||||
SUPPORTED_MIMETYPES = new ArrayList<String>();
|
SUPPORTED_MIMETYPES = new ArrayList<String>();
|
||||||
OfficeParser p = new OfficeParser();
|
Parser p = new OfficeParser();
|
||||||
for(MediaType mt : p.getSupportedTypes(null)) {
|
for(MediaType mt : p.getSupportedTypes(null)) {
|
||||||
if(mt.toString().equals(MimetypeMap.MIMETYPE_EXCEL))
|
if(mt.toString().equals(MimetypeMap.MIMETYPE_EXCEL))
|
||||||
{
|
{
|
||||||
|
@@ -24,11 +24,11 @@ import java.util.regex.Pattern;
|
|||||||
import javax.xml.transform.TransformerConfigurationException;
|
import javax.xml.transform.TransformerConfigurationException;
|
||||||
|
|
||||||
import org.alfresco.repo.content.MimetypeMap;
|
import org.alfresco.repo.content.MimetypeMap;
|
||||||
|
import org.alfresco.repo.content.TikaOfficeDetectParser;
|
||||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.tika.parser.Parser;
|
import org.apache.tika.parser.Parser;
|
||||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
|
||||||
import org.apache.tika.sax.BodyContentHandler;
|
import org.apache.tika.sax.BodyContentHandler;
|
||||||
import org.xml.sax.Attributes;
|
import org.xml.sax.Attributes;
|
||||||
import org.xml.sax.ContentHandler;
|
import org.xml.sax.ContentHandler;
|
||||||
@@ -56,14 +56,15 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
|
|||||||
public PoiHssfContentTransformer()
|
public PoiHssfContentTransformer()
|
||||||
{
|
{
|
||||||
super(new String[] {
|
super(new String[] {
|
||||||
MimetypeMap.MIMETYPE_EXCEL
|
MimetypeMap.MIMETYPE_EXCEL,
|
||||||
|
MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Parser getParser()
|
protected Parser getParser()
|
||||||
{
|
{
|
||||||
return new OfficeParser();
|
return new TikaOfficeDetectParser();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@@ -46,7 +46,14 @@ public class PoiHssfContentTransformerTest extends TikaPoweredContentTransformer
|
|||||||
transformer = new PoiHssfContentTransformer();
|
transformer = new PoiHssfContentTransformer();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
@Override
|
||||||
|
protected String[] getQuickFilenames(String sourceMimetype) {
|
||||||
|
return new String[] {
|
||||||
|
"quick.xls", "quick.xlsx"
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
* @return Returns the same transformer regardless - it is allowed
|
* @return Returns the same transformer regardless - it is allowed
|
||||||
*/
|
*/
|
||||||
protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
|
protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
|
||||||
|
@@ -0,0 +1,57 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||||
|
*
|
||||||
|
* This file is part of Alfresco
|
||||||
|
*
|
||||||
|
* Alfresco is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Alfresco is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
package org.alfresco.repo.content.transform;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
import org.apache.tika.mime.MediaType;
|
||||||
|
import org.apache.tika.parser.Parser;
|
||||||
|
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Uses {@link http://tika.apache.org/ Apache Tika} and
|
||||||
|
* {@link http://poi.apache.org/ Apache POI} to perform
|
||||||
|
* conversions from the newer OOXML Office documents.
|
||||||
|
*
|
||||||
|
* @author Nick Burch
|
||||||
|
*/
|
||||||
|
public class PoiOOXMLContentTransformer extends TikaPoweredContentTransformer
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* We support all the office mimetypes that the Tika
|
||||||
|
* office parser can handle
|
||||||
|
*/
|
||||||
|
public static ArrayList<String> SUPPORTED_MIMETYPES;
|
||||||
|
static {
|
||||||
|
SUPPORTED_MIMETYPES = new ArrayList<String>();
|
||||||
|
Parser p = new OOXMLParser();
|
||||||
|
for(MediaType mt : p.getSupportedTypes(null)) {
|
||||||
|
SUPPORTED_MIMETYPES.add( mt.toString() );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public PoiOOXMLContentTransformer() {
|
||||||
|
super(SUPPORTED_MIMETYPES);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Parser getParser() {
|
||||||
|
return new OOXMLParser();
|
||||||
|
}
|
||||||
|
}
|
@@ -0,0 +1,66 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||||
|
*
|
||||||
|
* This file is part of Alfresco
|
||||||
|
*
|
||||||
|
* Alfresco is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Alfresco is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
package org.alfresco.repo.content.transform;
|
||||||
|
|
||||||
|
import org.alfresco.repo.content.MimetypeMap;
|
||||||
|
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @see org.alfresco.repo.content.transform.PoiOOXMLContentTransformer
|
||||||
|
*
|
||||||
|
* @author Nick Burch
|
||||||
|
*/
|
||||||
|
public class PoiOOXMLContentTransformerTest extends AbstractContentTransformerTest
|
||||||
|
{
|
||||||
|
private ContentTransformer transformer;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setUp() throws Exception
|
||||||
|
{
|
||||||
|
super.setUp();
|
||||||
|
|
||||||
|
transformer = new PoiOOXMLContentTransformer();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return Returns the same transformer regardless - it is allowed
|
||||||
|
*/
|
||||||
|
protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
|
||||||
|
{
|
||||||
|
return transformer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testIsTransformable() throws Exception
|
||||||
|
{
|
||||||
|
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||||
|
|
||||||
|
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OPENXML_PRESENTATION, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||||
|
|
||||||
|
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||||
|
}
|
||||||
|
}
|
@@ -52,6 +52,13 @@ public class TextMiningContentTransformerTest extends AbstractContentTransformer
|
|||||||
return transformer;
|
return transformer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected String[] getQuickFilenames(String sourceMimetype) {
|
||||||
|
return new String[] {
|
||||||
|
"quick.doc", "quick95.doc", "quick6.doc"
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
public void testIsTransformable() throws Exception
|
public void testIsTransformable() throws Exception
|
||||||
{
|
{
|
||||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_WORD, new TransformationOptions()));
|
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_WORD, new TransformationOptions()));
|
||||||
|
Reference in New Issue
Block a user