diff --git a/config/alfresco/content-services-context.xml b/config/alfresco/content-services-context.xml index 06cdb84395..2b8bbb536e 100644 --- a/config/alfresco/content-services-context.xml +++ b/config/alfresco/content-services-context.xml @@ -367,10 +367,16 @@ class="org.alfresco.repo.content.transform.TikaAutoContentTransformer" parent="baseContentTransformer" /> + + + + diff --git a/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java b/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java index 30f44844e2..04b0e26567 100644 --- a/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java +++ b/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java @@ -37,6 +37,7 @@ import org.alfresco.repo.content.transform.MailContentTransformerTest; import org.alfresco.repo.content.transform.MediaWikiContentTransformerTest; import org.alfresco.repo.content.transform.OpenOfficeContentTransformerTest; import org.alfresco.repo.content.transform.PdfBoxContentTransformerTest; +import org.alfresco.repo.content.transform.PoiContentTransformerTest; import org.alfresco.repo.content.transform.PoiHssfContentTransformerTest; import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTest; import org.alfresco.repo.content.transform.StringExtractingContentTransformerTest; @@ -104,6 +105,7 @@ public class ContentMinimalContextTestSuite extends TestSuite suite.addTestSuite(MediaWikiContentTransformerTest.class); suite.addTestSuite(OpenOfficeContentTransformerTest.class); suite.addTestSuite(PdfBoxContentTransformerTest.class); + suite.addTestSuite(PoiContentTransformerTest.class); suite.addTestSuite(PoiHssfContentTransformerTest.class); suite.addTestSuite(RuntimeExecutableContentTransformerTest.class); suite.addTestSuite(StringExtractingContentTransformerTest.class); diff --git a/source/java/org/alfresco/repo/content/transform/OpenOfficeContentTransformerWorker.java 
b/source/java/org/alfresco/repo/content/transform/OpenOfficeContentTransformerWorker.java index c9ae31db64..8bcd358ba6 100644 --- a/source/java/org/alfresco/repo/content/transform/OpenOfficeContentTransformerWorker.java +++ b/source/java/org/alfresco/repo/content/transform/OpenOfficeContentTransformerWorker.java @@ -43,8 +43,10 @@ import org.springframework.beans.factory.InitializingBean; import org.springframework.core.io.DefaultResourceLoader; /** - * Makes use of the {@link http://sourceforge.net/projects/joott/JOOConverter} library to perform OpenOffice-drive - * conversions. + * Makes use of the {@link http://sourceforge.net/projects/joott/JOOConverter} library to perform + * OpenOffice-driven conversions. + * This requires that OpenOffice be running, but delivers a wider range of transformations + * than Tika is able to (Tika just translates into Text, HTML and XML) * * @author Derek Hulley */ diff --git a/source/java/org/alfresco/repo/content/transform/PdfBoxContentTransformer.java b/source/java/org/alfresco/repo/content/transform/PdfBoxContentTransformer.java index 0ca6f122d3..72ecef9308 100644 --- a/source/java/org/alfresco/repo/content/transform/PdfBoxContentTransformer.java +++ b/source/java/org/alfresco/repo/content/transform/PdfBoxContentTransformer.java @@ -18,71 +18,28 @@ */ package org.alfresco.repo.content.transform; -import java.io.InputStream; - import org.alfresco.repo.content.MimetypeMap; -import org.alfresco.service.cmr.repository.ContentReader; -import org.alfresco.service.cmr.repository.ContentWriter; -import org.alfresco.service.cmr.repository.TransformationOptions; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.util.PDFTextStripper; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.pdf.PDFParser; /** - * Makes use of the {@link http://www.pdfbox.org/ PDFBox} library to - * perform conversions from PDF files to text. 
+ * Uses {@link http://tika.apache.org/ Apache Tika} and + * {@link http://pdfbox.apache.org/ Apache PDFBox} to perform + * conversions from PDF documents. * + * @author Nick Burch * @author Derek Hulley */ -public class PdfBoxContentTransformer extends AbstractContentTransformer2 +public class PdfBoxContentTransformer extends TikaPoweredContentTransformer { - /** - * Currently the only transformation performed is that of text extraction from PDF documents. - */ - public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options) - { - // TODO: Expand PDFBox usage to convert images to PDF and investigate other conversions - - if (!MimetypeMap.MIMETYPE_PDF.equals(sourceMimetype) || - !MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype)) - { - // only support PDF -> Text - return false; - } - else - { - return true; - } + public PdfBoxContentTransformer() { + super(new String[] { + MimetypeMap.MIMETYPE_PDF + }); } - protected void transformInternal( - ContentReader reader, - ContentWriter writer, - TransformationOptions options) throws Exception - { - PDDocument pdf = null; - InputStream is = null; - try - { - is = reader.getContentInputStream(); - // stream the document in - pdf = PDDocument.load(is); - // strip the text out - PDFTextStripper stripper = new PDFTextStripper(); - String text = stripper.getText(pdf); - - // dump it all to the writer - writer.putContent(text); - } - finally - { - if (pdf != null) - { - try { pdf.close(); } catch (Throwable e) {e.printStackTrace(); } - } - if (is != null) - { - try { is.close(); } catch (Throwable e) {e.printStackTrace(); } - } - } + @Override + protected Parser getParser() { + return new PDFParser(); } } diff --git a/source/java/org/alfresco/repo/content/transform/PdfBoxContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/PdfBoxContentTransformerTest.java index 3a1f7adde6..bcceb505de 100644 --- 
a/source/java/org/alfresco/repo/content/transform/PdfBoxContentTransformerTest.java +++ b/source/java/org/alfresco/repo/content/transform/PdfBoxContentTransformerTest.java @@ -50,5 +50,7 @@ public class PdfBoxContentTransformerTest extends AbstractContentTransformerTest { assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_PDF, new TransformationOptions())); assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_HTML, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_XML, new TransformationOptions())); } } diff --git a/source/java/org/alfresco/repo/content/transform/PoiContentTransformer.java b/source/java/org/alfresco/repo/content/transform/PoiContentTransformer.java new file mode 100644 index 0000000000..3b0c8e7825 --- /dev/null +++ b/source/java/org/alfresco/repo/content/transform/PoiContentTransformer.java @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2005-2010 Alfresco Software Limited. + * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */ +package org.alfresco.repo.content.transform; + +import java.util.ArrayList; + +import org.alfresco.repo.content.MimetypeMap; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.microsoft.OfficeParser; + +/** + * Uses {@link http://tika.apache.org/ Apache Tika} and + * {@link http://poi.apache.org/ Apache POI} to perform + * conversions from Office documents. + * + * {@link PoiHssfContentTransformer} handles the Excel + * transformations (mostly for compatibility), while + * this does all the other Office file formats. + * + * @author Nick Burch + */ +public class PoiContentTransformer extends TikaPoweredContentTransformer +{ + /** + * We support all the office mimetypes that the Tika + * office parser can handle, except for excel + * (handled by {@link PoiHssfContentTransformer}) + */ + public static ArrayList SUPPORTED_MIMETYPES; + static { + SUPPORTED_MIMETYPES = new ArrayList(); + OfficeParser p = new OfficeParser(); + for(MediaType mt : p.getSupportedTypes(null)) { + if(mt.toString().equals(MimetypeMap.MIMETYPE_EXCEL)) + { + // Skip, handled elsewhere + continue; + } + // Tika can probably do some useful text + SUPPORTED_MIMETYPES.add( mt.toString() ); + } + } + + public PoiContentTransformer() { + super(SUPPORTED_MIMETYPES); + } + + @Override + protected Parser getParser() { + return new OfficeParser(); + } +} diff --git a/source/java/org/alfresco/repo/content/transform/PoiContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/PoiContentTransformerTest.java new file mode 100644 index 0000000000..28e49c261a --- /dev/null +++ b/source/java/org/alfresco/repo/content/transform/PoiContentTransformerTest.java @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2005-2010 Alfresco Software Limited.
+ * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see <http://www.gnu.org/licenses/>. + */ +package org.alfresco.repo.content.transform; + +import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.service.cmr.repository.TransformationOptions; + +/** + * @see org.alfresco.repo.content.transform.PoiContentTransformer + * + * @author Nick Burch + */ +public class PoiContentTransformerTest extends AbstractContentTransformerTest +{ + private ContentTransformer transformer; + + @Override + public void setUp() throws Exception + { + super.setUp(); + + transformer = new PoiContentTransformer(); + } + + /** + * @return Returns the same transformer regardless - it is allowed + */ + protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype) + { + return transformer; + } + + public void testIsTransformable() throws Exception + { + assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_WORD, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_HTML, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_XML, new TransformationOptions())); + +
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_PPT, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PPT, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PPT, MimetypeMap.MIMETYPE_HTML, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PPT, MimetypeMap.MIMETYPE_XML, new TransformationOptions())); + + assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OUTLOOK_MSG, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OUTLOOK_MSG, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OUTLOOK_MSG, MimetypeMap.MIMETYPE_HTML, new TransformationOptions())); + assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OUTLOOK_MSG, MimetypeMap.MIMETYPE_XML, new TransformationOptions())); + + // Doesn't claim excel + assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); + } +} diff --git a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java index 5bc453e5c0..b62fb77d3e 100644 --- a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java +++ b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java @@ -42,8 +42,6 @@ import org.xml.sax.SAXException; * Xml or Text (space or comma separated) *

Handles all sheets in the file. * - * TODO CSV Support - * * @author Nick Burch * @author Derek Hulley */ @@ -103,7 +101,6 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer * A wrapper around the normal Tika BodyContentHandler, * which causes things to be CSV encoded rather than * tab separated - * TODO Get rid of the extra tabs that crop up */ protected static class CsvContentHandler extends BodyContentHandler { private static final char[] comma = new char[]{ ',' }; @@ -116,6 +113,16 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer super(output); } + @Override + public void ignorableWhitespace(char[] ch, int start, int length) + throws SAXException { + if(length == 1 && ch[0] == '\t') { + // Ignore tabs, as they mess up the CSV output + } else { + super.ignorableWhitespace(ch, start, length); + } + } + @Override public void characters(char[] ch, int start, int length) throws SAXException { @@ -150,32 +157,28 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException { if(localName.equals("td")) { - localName = "span"; - name = "span"; - inCell = true; if(needsComma) { super.characters(comma, 0, 1); needsComma = true; } + } else { + super.startElement(uri, localName, name, atts); } - super.startElement(uri, localName, name, atts); } @Override public void endElement(String uri, String localName, String name) throws SAXException { if(localName.equals("td")) { - localName = "span"; - name = "span"; - needsComma = true; inCell = false; + } else { + if(localName.equals("tr")) { + needsComma = false; + } + super.endElement(uri, localName, name); } - if(localName.equals("tr")) { - needsComma = false; - } - super.endElement(uri, localName, name); } } } diff --git a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java 
b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java index 651c5c8bf3..dcc7d47fb7 100644 --- a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java +++ b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java @@ -79,14 +79,26 @@ public class PoiHssfContentTransformerTest extends TikaPoweredContentTransformer ContentReader targetReader = targetWriter.getReader(); String checkContent = targetReader.getContentString(); - System.err.println(checkContent); + + additionalContentCheck( + MimetypeMap.MIMETYPE_EXCEL, + MimetypeMap.MIMETYPE_TEXT_CSV, + checkContent + ); } @Override protected void additionalContentCheck(String sourceMimetype, String targetMimetype, String contents) { if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_CSV)) { - System.err.println(contents); + assertTrue( + "Content not properly CSV'd", + contents.contains("1,2,2") + ); + assertTrue( + "Content not properly CSV'd", + contents.contains("\"The\",\"quick\",\"brown\",\"fox\"") + ); } else { super.additionalContentCheck(sourceMimetype, targetMimetype, contents); } diff --git a/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformer.java b/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformer.java index 98af61b069..0872a85d88 100644 --- a/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformer.java +++ b/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformer.java @@ -32,6 +32,11 @@ import org.textmining.extraction.word.WordTextExtractorFactory; * Makes use of the {@link http://www.textmining.org/ TextMining} library to * perform conversions from MSWord documents to text. * + * Doesn't currently use {@link http://tika.apache.org/ Apache Tika} to + * do this, as Tika can't handle Word 6 or Word 95 documents, only + * Word 97, 2000, 2003, 2007 and 2010. + * Once Tika does support these older formats, we can switch to it. 
+ * * @author Derek Hulley */ public class TextMiningContentTransformer extends AbstractContentTransformer2 diff --git a/source/test-resources/quick/quick6.doc b/source/test-resources/quick/quick6.doc new file mode 100644 index 0000000000..a614a0783f Binary files /dev/null and b/source/test-resources/quick/quick6.doc differ diff --git a/source/test-resources/quick/quick95.doc b/source/test-resources/quick/quick95.doc new file mode 100644 index 0000000000..a614a0783f Binary files /dev/null and b/source/test-resources/quick/quick95.doc differ