From 325f8e792312f05b654cc763f3d0435bd4180e4b Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Wed, 23 Jun 2010 15:51:03 +0000 Subject: [PATCH] Tika content transformer support for OOXML office Enable explicit Tika content transform for OOXML files Allow the Excel transformer (which does CSV as well as text/html) to handle .xlsx as well as .xls Also update the .doc parser test to ensure that the older word 6 and word 95 files are correctly handled too git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20781 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261 --- config/alfresco/content-services-context.xml | 5 + .../ContentMinimalContextTestSuite.java | 2 + .../repo/content/TikaOfficeDetectParser.java | 95 +++++++ .../AbstractContentTransformerTest.java | 264 ++++++++++-------- .../transform/PoiContentTransformer.java | 2 +- .../transform/PoiHssfContentTransformer.java | 7 +- .../PoiHssfContentTransformerTest.java | 9 +- .../transform/PoiOOXMLContentTransformer.java | 57 ++++ .../PoiOOXMLContentTransformerTest.java | 66 +++++ .../TextMiningContentTransformerTest.java | 7 + 10 files changed, 393 insertions(+), 121 deletions(-) create mode 100644 source/java/org/alfresco/repo/content/TikaOfficeDetectParser.java create mode 100644 source/java/org/alfresco/repo/content/transform/PoiOOXMLContentTransformer.java create mode 100644 source/java/org/alfresco/repo/content/transform/PoiOOXMLContentTransformerTest.java diff --git a/config/alfresco/content-services-context.xml b/config/alfresco/content-services-context.xml index 2b8bbb536e..67dd4d0d3c 100644 --- a/config/alfresco/content-services-context.xml +++ b/config/alfresco/content-services-context.xml @@ -377,6 +377,11 @@ class="org.alfresco.repo.content.transform.PoiContentTransformer" parent="baseContentTransformer" /> + + + diff --git a/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java b/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java index 
04b0e26567..b7cfebe05d 100644 --- a/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java +++ b/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java @@ -39,6 +39,7 @@ import org.alfresco.repo.content.transform.OpenOfficeContentTransformerTest; import org.alfresco.repo.content.transform.PdfBoxContentTransformerTest; import org.alfresco.repo.content.transform.PoiContentTransformerTest; import org.alfresco.repo.content.transform.PoiHssfContentTransformerTest; +import org.alfresco.repo.content.transform.PoiOOXMLContentTransformerTest; import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTest; import org.alfresco.repo.content.transform.StringExtractingContentTransformerTest; import org.alfresco.repo.content.transform.TextMiningContentTransformerTest; @@ -107,6 +108,7 @@ public class ContentMinimalContextTestSuite extends TestSuite suite.addTestSuite(PdfBoxContentTransformerTest.class); suite.addTestSuite(PoiContentTransformerTest.class); suite.addTestSuite(PoiHssfContentTransformerTest.class); + suite.addTestSuite(PoiOOXMLContentTransformerTest.class); suite.addTestSuite(RuntimeExecutableContentTransformerTest.class); suite.addTestSuite(StringExtractingContentTransformerTest.class); suite.addTestSuite(TextMiningContentTransformerTest.class); diff --git a/source/java/org/alfresco/repo/content/TikaOfficeDetectParser.java b/source/java/org/alfresco/repo/content/TikaOfficeDetectParser.java new file mode 100644 index 0000000000..7c323325bd --- /dev/null +++ b/source/java/org/alfresco/repo/content/TikaOfficeDetectParser.java @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2005-2010 Alfresco Software Limited. + * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see <http://www.gnu.org/licenses/>. + */ +package org.alfresco.repo.content; + +import java.io.IOException; +import java.io.InputStream; +import java.io.PushbackInputStream; +import java.util.HashSet; +import java.util.Set; + +import org.apache.poi.poifs.common.POIFSConstants; +import org.apache.poi.util.IOUtils; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.microsoft.OfficeParser; +import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * {@link http://tika.apache.org/ Apache Tika} assumes that + * you either know exactly what your content is, or that + * you'll leave it to auto-detection. + * Within Alfresco, we usually do know. However, from time + * to time, we don't know if we have one of the old or one + * of the new office files (eg .xls and .xlsx). + * This class automatically selects the appropriate + * old (OLE2) or new (OOXML) Tika parser as required.
+ * + * @author Nick Burch + */ +public class TikaOfficeDetectParser implements Parser { + private Parser ole2Parser = new OfficeParser(); + private Parser ooxmlParser = new OOXMLParser(); + + public Set<MediaType> getSupportedTypes(ParseContext parseContext) { + Set<MediaType> types = new HashSet<MediaType>(); + types.addAll(ole2Parser.getSupportedTypes(parseContext)); + types.addAll(ooxmlParser.getSupportedTypes(parseContext)); + return types; + } + + public void parse(InputStream stream, + ContentHandler handler, Metadata metadata, + ParseContext parseContext) throws IOException, SAXException, + TikaException + { + PushbackInputStream inp = new PushbackInputStream(stream, 4); + byte[] initial4 = new byte[4]; + int read = IOUtils.readFully(inp, initial4); + if (read > 0) { inp.unread(initial4, 0, read); } /* push back only the bytes actually read, so short streams are not padded with zeros */ + + // Which is it? + if(initial4[0] == POIFSConstants.OOXML_FILE_HEADER[0] && + initial4[1] == POIFSConstants.OOXML_FILE_HEADER[1] && + initial4[2] == POIFSConstants.OOXML_FILE_HEADER[2] && + initial4[3] == POIFSConstants.OOXML_FILE_HEADER[3]) + { + ooxmlParser.parse(inp, handler, metadata, parseContext); + } + else + { + ole2Parser.parse(inp, handler, metadata, parseContext); + } + } + + /** + * @deprecated This method will be removed in Apache Tika 1.0. + */ + public void parse(InputStream stream, + ContentHandler handler, Metadata metadata) + throws IOException, SAXException, TikaException + { + parse(stream, handler, metadata, new ParseContext()); + } +} diff --git a/source/java/org/alfresco/repo/content/transform/AbstractContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/AbstractContentTransformerTest.java index c6c02c3f99..1c58812d4c 100644 --- a/source/java/org/alfresco/repo/content/transform/AbstractContentTransformerTest.java +++ b/source/java/org/alfresco/repo/content/transform/AbstractContentTransformerTest.java @@ -111,14 +111,14 @@ public abstract class AbstractContentTransformerTest extends TestCase * Helper method to load one of the "The quick brown fox" files from the * classpath.
* - * @param extension the extension of the file required, e.g. txt + * @param quickname the file required, eg quick.txt * @return Returns a test resource loaded from the classpath or null if * no resource could be found. * @throws IOException */ - public static File loadQuickTestFile(String extension) throws IOException + public static File loadNamedQuickTestFile(String quickname) throws IOException { - URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/quick." + extension); + URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + quickname); if (url == null) { return null; @@ -130,6 +130,34 @@ public abstract class AbstractContentTransformerTest extends TestCase } return file; } + /** + * Helper method to load one of the "The quick brown fox" files from the + * classpath. + * + * @param extension the file extension required, eg txt for the file quick.txt + * @return Returns a test resource loaded from the classpath or null if + * no resource could be found. + * @throws IOException + */ + public static File loadQuickTestFile(String extension) throws IOException + { + return loadNamedQuickTestFile("quick."+extension); + } + + /** + * For the given mime type, returns one or more quick* + * files to be tested. + * By default this is just quick + the default extension. + * However, you can override this if you need special + * rules, eg quickOld.foo, quickMid.foo and quickNew.foo + * for differing versions of the file format. + */ + protected String[] getQuickFilenames(String sourceMimetype) { + String sourceExtension = mimetypeService.getExtension(sourceMimetype); + return new String[] { + "quick."
+ sourceExtension + }; + } /** * Tests the full range of transformations available on the @@ -160,120 +188,124 @@ public abstract class AbstractContentTransformerTest extends TestCase for (String sourceMimetype : mimetypes) { // attempt to get a source file for each mimetype - String sourceExtension = mimetypeService.getExtension(sourceMimetype); - - sb.append(" Source Extension: ").append(sourceExtension).append("\n"); - - // attempt to convert to every other mimetype - for (String targetMimetype : mimetypes) + String[] quickFiles = getQuickFilenames(sourceMimetype); + sb.append(" Source Files: ").append(java.util.Arrays.toString(quickFiles)).append("\n"); + + for (String quickFile : quickFiles) { - if (sourceMimetype.equals(targetMimetype)) - { - // Don't test like-to-like transformations - continue; - } - ContentWriter targetWriter = null; - // construct a reader onto the source file - String targetExtension = mimetypeService.getExtension(targetMimetype); - - // must we test the transformation? - ContentTransformer transformer = getTransformer(sourceMimetype, targetMimetype); - if (transformer == null || transformer.isTransformable(sourceMimetype, targetMimetype, null) == false) - { - // no transformer - continue; - } - - if (isTransformationExcluded(sourceExtension, targetExtension)) - { - continue; - } - - // dump - sb.append(" Target Extension: ").append(targetExtension); - sb.append(" <").append(transformer.getClass().getSimpleName()).append(">"); - - // is there a test file for this conversion?
- File sourceFile = AbstractContentTransformerTest.loadQuickTestFile(sourceExtension); - if (sourceFile == null) - { - sb.append(" \n"); - continue; // no test file available for that extension - } - ContentReader sourceReader = new FileContentReader(sourceFile); - - // perform the transformation several times so that we get a good idea of performance - int count = 0; - long before = System.currentTimeMillis(); - Set transformerClasses = new HashSet(2); - for (int i = 0; i < 5; i++) - { - // get the transformer repeatedly as it might be different each time around - transformer = getTransformer(sourceMimetype, targetMimetype); - // must we report on this class? - if (!transformerClasses.contains(transformer.getClass().getName())) - { - transformerClasses.add(transformer.getClass().getName()); - sb.append(" <").append(transformer.getClass().getSimpleName()).append(">"); - } - - // make a writer for the target file - File targetFile = TempFileProvider.createTempFile( - getClass().getSimpleName() + "_" + getName() + "_" + sourceExtension + "_", - "." 
+ targetExtension); - targetWriter = new FileContentWriter(targetFile); - - // do the transformation - sourceReader.setMimetype(sourceMimetype); - targetWriter.setMimetype(targetMimetype); - transformer.transform(sourceReader.getReader(), targetWriter); - - // if the target format is any type of text, then it must contain the 'quick' phrase - if (isQuickPhraseExpected(targetMimetype)) - { - ContentReader targetReader = targetWriter.getReader(); - String checkContent = targetReader.getContentString(); - assertTrue("Quick phrase not present in document converted to text: \n" + - " transformer: " + transformer + "\n" + - " source: " + sourceReader + "\n" + - " target: " + targetWriter, - checkContent.contains(QUICK_CONTENT)); - - // Let subclasses do extra checks if they want - additionalContentCheck(sourceMimetype, targetMimetype, checkContent); - } - else if (isQuickWordsExpected(targetMimetype)) - { - ContentReader targetReader = targetWriter.getReader(); - String checkContent = targetReader.getContentString(); - // essentially check that FTS indexing can use the conversion properly - for (int word = 0; word < QUICK_WORDS.length; word++) - { - assertTrue("Quick phrase word not present in document converted to text: \n" + - " transformer: " + transformer + "\n" + - " source: " + sourceReader + "\n" + - " target: " + targetWriter + "\n" + - " word: " + word, - checkContent.contains(QUICK_WORDS[word])); - } - } - // increment count - count++; - } - long after = System.currentTimeMillis(); - double average = (double) (after - before) / (double) count; - - // dump - sb.append(String.format(" average %10.0f ms", average)).append("\n"); - - if (logger.isDebugEnabled()) - { - logger.debug("Transformation performed " + count + " time: " + - sourceMimetype + " --> " + targetMimetype + "\n" + - " source: " + sourceReader + "\n" + - " target: " + targetWriter + "\n" + - " transformer: " + getTransformer(sourceMimetype, targetMimetype)); - } + String sourceExtension = 
quickFile.substring(quickFile.lastIndexOf('.')+1); + + // attempt to convert to every other mimetype + for (String targetMimetype : mimetypes) + { + if (sourceMimetype.equals(targetMimetype)) + { + // Don't test like-to-like transformations + continue; + } + ContentWriter targetWriter = null; + // construct a reader onto the source file + String targetExtension = mimetypeService.getExtension(targetMimetype); + + // must we test the transformation? + ContentTransformer transformer = getTransformer(sourceMimetype, targetMimetype); + if (transformer == null || transformer.isTransformable(sourceMimetype, targetMimetype, null) == false) + { + // no transformer + continue; + } + + if (isTransformationExcluded(sourceExtension, targetExtension)) + { + continue; + } + + // dump + sb.append(" Target Extension: ").append(targetExtension); + sb.append(" <").append(transformer.getClass().getSimpleName()).append(">"); + + // is there a test file for this conversion? + File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile(quickFile); + if (sourceFile == null) + { + sb.append(" \n"); + continue; // no test file available for that extension + } + ContentReader sourceReader = new FileContentReader(sourceFile); + + // perform the transformation several times so that we get a good idea of performance + int count = 0; + long before = System.currentTimeMillis(); + Set transformerClasses = new HashSet(2); + for (int i = 0; i < 5; i++) + { + // get the transformer repeatedly as it might be different each time around + transformer = getTransformer(sourceMimetype, targetMimetype); + // must we report on this class? 
+ if (!transformerClasses.contains(transformer.getClass().getName())) + { + transformerClasses.add(transformer.getClass().getName()); + sb.append(" <").append(transformer.getClass().getSimpleName()).append(">"); + } + + // make a writer for the target file + File targetFile = TempFileProvider.createTempFile( + getClass().getSimpleName() + "_" + getName() + "_" + sourceExtension + "_", + "." + targetExtension); + targetWriter = new FileContentWriter(targetFile); + + // do the transformation + sourceReader.setMimetype(sourceMimetype); + targetWriter.setMimetype(targetMimetype); + transformer.transform(sourceReader.getReader(), targetWriter); + + // if the target format is any type of text, then it must contain the 'quick' phrase + if (isQuickPhraseExpected(targetMimetype)) + { + ContentReader targetReader = targetWriter.getReader(); + String checkContent = targetReader.getContentString(); + assertTrue("Quick phrase not present in document converted to text: \n" + + " transformer: " + transformer + "\n" + + " source: " + sourceReader + "\n" + + " target: " + targetWriter, + checkContent.contains(QUICK_CONTENT)); + + // Let subclasses do extra checks if they want + additionalContentCheck(sourceMimetype, targetMimetype, checkContent); + } + else if (isQuickWordsExpected(targetMimetype)) + { + ContentReader targetReader = targetWriter.getReader(); + String checkContent = targetReader.getContentString(); + // essentially check that FTS indexing can use the conversion properly + for (int word = 0; word < QUICK_WORDS.length; word++) + { + assertTrue("Quick phrase word not present in document converted to text: \n" + + " transformer: " + transformer + "\n" + + " source: " + sourceReader + "\n" + + " target: " + targetWriter + "\n" + + " word: " + word, + checkContent.contains(QUICK_WORDS[word])); + } + } + // increment count + count++; + } + long after = System.currentTimeMillis(); + double average = (double) (after - before) / (double) count; + + // dump + 
sb.append(String.format(" average %10.0f ms", average)).append("\n"); + + if (logger.isDebugEnabled()) + { + logger.debug("Transformation performed " + count + " time: " + + sourceMimetype + " --> " + targetMimetype + "\n" + + " source: " + sourceReader + "\n" + + " target: " + targetWriter + "\n" + + " transformer: " + getTransformer(sourceMimetype, targetMimetype)); + } + } } } diff --git a/source/java/org/alfresco/repo/content/transform/PoiContentTransformer.java b/source/java/org/alfresco/repo/content/transform/PoiContentTransformer.java index 3b0c8e7825..ba9aa366bd 100644 --- a/source/java/org/alfresco/repo/content/transform/PoiContentTransformer.java +++ b/source/java/org/alfresco/repo/content/transform/PoiContentTransformer.java @@ -46,7 +46,7 @@ public class PoiContentTransformer extends TikaPoweredContentTransformer public static ArrayList SUPPORTED_MIMETYPES; static { SUPPORTED_MIMETYPES = new ArrayList(); - OfficeParser p = new OfficeParser(); + Parser p = new OfficeParser(); for(MediaType mt : p.getSupportedTypes(null)) { if(mt.toString().equals(MimetypeMap.MIMETYPE_EXCEL)) { diff --git a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java index b62fb77d3e..077906ea5e 100644 --- a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java +++ b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java @@ -24,11 +24,11 @@ import java.util.regex.Pattern; import javax.xml.transform.TransformerConfigurationException; import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.repo.content.TikaOfficeDetectParser; import org.alfresco.service.cmr.repository.TransformationOptions; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.tika.parser.Parser; -import org.apache.tika.parser.microsoft.OfficeParser; import org.apache.tika.sax.BodyContentHandler; import 
org.xml.sax.Attributes; import org.xml.sax.ContentHandler; @@ -56,14 +56,15 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer public PoiHssfContentTransformer() { super(new String[] { - MimetypeMap.MIMETYPE_EXCEL + MimetypeMap.MIMETYPE_EXCEL, + MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET }); } @Override protected Parser getParser() { - return new OfficeParser(); + return new TikaOfficeDetectParser(); } /** diff --git a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java index dcc7d47fb7..d87bc8d96c 100644 --- a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java +++ b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java @@ -46,7 +46,14 @@ public class PoiHssfContentTransformerTest extends TikaPoweredContentTransformer transformer = new PoiHssfContentTransformer(); } - /** + @Override + protected String[] getQuickFilenames(String sourceMimetype) { + return new String[] { + "quick.xls", "quick.xlsx" + }; + } + + /** * @return Returns the same transformer regardless - it is allowed */ protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype) diff --git a/source/java/org/alfresco/repo/content/transform/PoiOOXMLContentTransformer.java b/source/java/org/alfresco/repo/content/transform/PoiOOXMLContentTransformer.java new file mode 100644 index 0000000000..b0e24e5c0c --- /dev/null +++ b/source/java/org/alfresco/repo/content/transform/PoiOOXMLContentTransformer.java @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2005-2010 Alfresco Software Limited. + * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see <http://www.gnu.org/licenses/>. + */ +package org.alfresco.repo.content.transform; + +import java.util.ArrayList; + +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; + +/** + * Uses {@link http://tika.apache.org/ Apache Tika} and + * {@link http://poi.apache.org/ Apache POI} to perform + * conversions from the newer OOXML Office documents. + * + * @author Nick Burch + */ +public class PoiOOXMLContentTransformer extends TikaPoweredContentTransformer +{ + /** + * We support all the office mimetypes that the Tika + * office parser can handle + */ + public static ArrayList<String> SUPPORTED_MIMETYPES; + static { + SUPPORTED_MIMETYPES = new ArrayList<String>(); + Parser p = new OOXMLParser(); + for(MediaType mt : p.getSupportedTypes(null)) { + SUPPORTED_MIMETYPES.add( mt.toString() ); + } + } + + public PoiOOXMLContentTransformer() { + super(SUPPORTED_MIMETYPES); + } + + @Override + protected Parser getParser() { + return new OOXMLParser(); + } +} diff --git a/source/java/org/alfresco/repo/content/transform/PoiOOXMLContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/PoiOOXMLContentTransformerTest.java new file mode 100644 index 0000000000..aea7de61d2 --- /dev/null +++ b/source/java/org/alfresco/repo/content/transform/PoiOOXMLContentTransformerTest.java @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2005-2010 Alfresco Software Limited.
+ * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see <http://www.gnu.org/licenses/>. + */ +package org.alfresco.repo.content.transform; + +import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.service.cmr.repository.TransformationOptions; + +/** + * @see org.alfresco.repo.content.transform.PoiOOXMLContentTransformer + * + * @author Nick Burch + */ +public class PoiOOXMLContentTransformerTest extends AbstractContentTransformerTest +{ + private ContentTransformer transformer; + + @Override + public void setUp() throws Exception + { + super.setUp(); + + transformer = new PoiOOXMLContentTransformer(); + } + + /** + * @return Returns the same transformer regardless - it is allowed + */ + protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype) + { + return transformer; + } + + public void testIsTransformable() throws Exception + { + assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, MimetypeMap.MIMETYPE_HTML, new TransformationOptions())); +
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, MimetypeMap.MIMETYPE_XML, new TransformationOptions())); + + assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OPENXML_PRESENTATION, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION, MimetypeMap.MIMETYPE_HTML, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION, MimetypeMap.MIMETYPE_XML, new TransformationOptions())); + + assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, MimetypeMap.MIMETYPE_HTML, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, MimetypeMap.MIMETYPE_XML, new TransformationOptions())); + } +} diff --git a/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformerTest.java index 5222dd8a99..cf7c967a9f 100644 --- a/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformerTest.java +++ b/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformerTest.java @@ -52,6 +52,13 @@ public class TextMiningContentTransformerTest extends AbstractContentTransformer return transformer; } + @Override + protected String[] getQuickFilenames(String sourceMimetype) { + return new String[] { + "quick.doc", "quick95.doc", "quick6.doc" + }; + } + public 
void testIsTransformable() throws Exception { assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_WORD, new TransformationOptions()));