More Tika content transform updates

Add a new general-purpose POI converter for the Office formats other than Excel, and convert the PDF converter to use Tika too.
The POI Excel converter now produces CSV properly, and the Text Mining converter now carries notes on the Tika features needed before it can be replaced.


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20780 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-06-23 14:27:10 +00:00
parent 2435a26cab
commit 228d111c56
12 changed files with 202 additions and 75 deletions

View File

@@ -367,10 +367,16 @@
class="org.alfresco.repo.content.transform.TikaAutoContentTransformer" class="org.alfresco.repo.content.transform.TikaAutoContentTransformer"
parent="baseContentTransformer" /> parent="baseContentTransformer" />
<!-- This one does excel only -->
<bean id="transformer.Poi" <bean id="transformer.Poi"
class="org.alfresco.repo.content.transform.PoiHssfContentTransformer" class="org.alfresco.repo.content.transform.PoiHssfContentTransformer"
parent="baseContentTransformer" /> parent="baseContentTransformer" />
<!-- This one does the other office formats, like word and powerpoint -->
<bean id="transformer.Office"
class="org.alfresco.repo.content.transform.PoiContentTransformer"
parent="baseContentTransformer" />
<bean id="transformer.TextMining" <bean id="transformer.TextMining"
class="org.alfresco.repo.content.transform.TextMiningContentTransformer" class="org.alfresco.repo.content.transform.TextMiningContentTransformer"
parent="baseContentTransformer" > parent="baseContentTransformer" >

View File

@@ -37,6 +37,7 @@ import org.alfresco.repo.content.transform.MailContentTransformerTest;
import org.alfresco.repo.content.transform.MediaWikiContentTransformerTest; import org.alfresco.repo.content.transform.MediaWikiContentTransformerTest;
import org.alfresco.repo.content.transform.OpenOfficeContentTransformerTest; import org.alfresco.repo.content.transform.OpenOfficeContentTransformerTest;
import org.alfresco.repo.content.transform.PdfBoxContentTransformerTest; import org.alfresco.repo.content.transform.PdfBoxContentTransformerTest;
import org.alfresco.repo.content.transform.PoiContentTransformerTest;
import org.alfresco.repo.content.transform.PoiHssfContentTransformerTest; import org.alfresco.repo.content.transform.PoiHssfContentTransformerTest;
import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTest; import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTest;
import org.alfresco.repo.content.transform.StringExtractingContentTransformerTest; import org.alfresco.repo.content.transform.StringExtractingContentTransformerTest;
@@ -104,6 +105,7 @@ public class ContentMinimalContextTestSuite extends TestSuite
suite.addTestSuite(MediaWikiContentTransformerTest.class); suite.addTestSuite(MediaWikiContentTransformerTest.class);
suite.addTestSuite(OpenOfficeContentTransformerTest.class); suite.addTestSuite(OpenOfficeContentTransformerTest.class);
suite.addTestSuite(PdfBoxContentTransformerTest.class); suite.addTestSuite(PdfBoxContentTransformerTest.class);
suite.addTestSuite(PoiContentTransformerTest.class);
suite.addTestSuite(PoiHssfContentTransformerTest.class); suite.addTestSuite(PoiHssfContentTransformerTest.class);
suite.addTestSuite(RuntimeExecutableContentTransformerTest.class); suite.addTestSuite(RuntimeExecutableContentTransformerTest.class);
suite.addTestSuite(StringExtractingContentTransformerTest.class); suite.addTestSuite(StringExtractingContentTransformerTest.class);

View File

@@ -43,8 +43,10 @@ import org.springframework.beans.factory.InitializingBean;
import org.springframework.core.io.DefaultResourceLoader; import org.springframework.core.io.DefaultResourceLoader;
/** /**
* Makes use of the {@link http://sourceforge.net/projects/joott/JOOConverter} library to perform OpenOffice-drive * Makes use of the {@link http://sourceforge.net/projects/joott/JOOConverter} library to perform
* conversions. * OpenOffice-driven conversions.
* This requires that OpenOffice be running, but delivers a wider range of transformations
* than Tika is able to (Tika just translates into Text, HTML and XML)
* *
* @author Derek Hulley * @author Derek Hulley
*/ */

View File

@@ -18,71 +18,28 @@
*/ */
package org.alfresco.repo.content.transform; package org.alfresco.repo.content.transform;
import java.io.InputStream;
import org.alfresco.repo.content.MimetypeMap; import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader; import org.apache.tika.parser.Parser;
import org.alfresco.service.cmr.repository.ContentWriter; import org.apache.tika.parser.pdf.PDFParser;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
/** /**
* Makes use of the {@link http://www.pdfbox.org/ PDFBox} library to * Uses {@link http://tika.apache.org/ Apache Tika} and
* perform conversions from PDF files to text. * {@link http://pdfbox.apache.org/ Apache PDFBox} to perform
* conversions from PDF documents.
* *
* @author Nick Burch
* @author Derek Hulley * @author Derek Hulley
*/ */
public class PdfBoxContentTransformer extends AbstractContentTransformer2 public class PdfBoxContentTransformer extends TikaPoweredContentTransformer
{ {
/** public PdfBoxContentTransformer() {
* Currently the only transformation performed is that of text extraction from PDF documents. super(new String[] {
*/ MimetypeMap.MIMETYPE_PDF
public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options) });
{
// TODO: Expand PDFBox usage to convert images to PDF and investigate other conversions
if (!MimetypeMap.MIMETYPE_PDF.equals(sourceMimetype) ||
!MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
{
// only support PDF -> Text
return false;
}
else
{
return true;
}
} }
protected void transformInternal( @Override
ContentReader reader, protected Parser getParser() {
ContentWriter writer, return new PDFParser();
TransformationOptions options) throws Exception
{
PDDocument pdf = null;
InputStream is = null;
try
{
is = reader.getContentInputStream();
// stream the document in
pdf = PDDocument.load(is);
// strip the text out
PDFTextStripper stripper = new PDFTextStripper();
String text = stripper.getText(pdf);
// dump it all to the writer
writer.putContent(text);
}
finally
{
if (pdf != null)
{
try { pdf.close(); } catch (Throwable e) {e.printStackTrace(); }
}
if (is != null)
{
try { is.close(); } catch (Throwable e) {e.printStackTrace(); }
}
}
} }
} }

View File

@@ -50,5 +50,7 @@ public class PdfBoxContentTransformerTest extends AbstractContentTransformerTest
{ {
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_PDF, new TransformationOptions())); assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_PDF, new TransformationOptions()));
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions())); assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
} }
} }

View File

@@ -0,0 +1,69 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.transform;
import java.util.ArrayList;
import org.alfresco.repo.content.MimetypeMap;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
/**
* Uses {@link http://tika.apache.org/ Apache Tika} and
* {@link http://poi.apache.org/ Apache POI} to perform
* conversions from Office documents.
*
* {@link PoiHssfContentTransformer} handles the Excel
* transformations (mostly for compatibility), while
* this does all the other Office file formats.
*
* @author Nick Burch
*/
public class PoiContentTransformer extends TikaPoweredContentTransformer
{
/**
* We support all the office mimetypes that the Tika
* office parser can handle, except for excel
* (handled by {@link PoiHssfContentTransformer}
*/
public static ArrayList<String> SUPPORTED_MIMETYPES;
static {
SUPPORTED_MIMETYPES = new ArrayList<String>();
OfficeParser p = new OfficeParser();
for(MediaType mt : p.getSupportedTypes(null)) {
if(mt.toString().equals(MimetypeMap.MIMETYPE_EXCEL))
{
// Skip, handled elsewhere
continue;
}
// Tika can probably do some useful text
SUPPORTED_MIMETYPES.add( mt.toString() );
}
}
public PoiContentTransformer() {
super(SUPPORTED_MIMETYPES);
}
@Override
protected Parser getParser() {
return new OfficeParser();
}
}

View File

@@ -0,0 +1,69 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.transform;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.TransformationOptions;
/**
 * Checks which source / target mimetype pairs the
 * {@link PoiContentTransformer} claims to be able to transform.
 *
 * @see org.alfresco.repo.content.transform.PoiContentTransformer
 *
 * @author Nick Burch
 */
public class PoiContentTransformerTest extends AbstractContentTransformerTest
{
    private ContentTransformer transformer;

    @Override
    public void setUp() throws Exception
    {
        super.setUp();
        transformer = new PoiContentTransformer();
    }

    /**
     * @return Always the one transformer under test, whatever
     *         mimetypes are asked for - it is allowed
     */
    protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
    {
        return transformer;
    }

    public void testIsTransformable() throws Exception
    {
        final String[] officeSources = new String[] {
            MimetypeMap.MIMETYPE_WORD,
            MimetypeMap.MIMETYPE_PPT,
            MimetypeMap.MIMETYPE_OUTLOOK_MSG
        };
        final String[] textualTargets = new String[] {
            MimetypeMap.MIMETYPE_TEXT_PLAIN,
            MimetypeMap.MIMETYPE_HTML,
            MimetypeMap.MIMETYPE_XML
        };

        for (String officeType : officeSources)
        {
            // Never goes from plain text into an office format
            assertFalse(transformer.isTransformable(
                    MimetypeMap.MIMETYPE_TEXT_PLAIN, officeType, new TransformationOptions()));

            // Does go from the office format to text, html and xml
            for (String target : textualTargets)
            {
                assertTrue(transformer.isTransformable(
                        officeType, target, new TransformationOptions()));
            }
        }

        // Doesn't claim excel
        assertFalse(transformer.isTransformable(
                MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
    }
}

View File

@@ -42,8 +42,6 @@ import org.xml.sax.SAXException;
* Xml or Text (space or comma separated) * Xml or Text (space or comma separated)
* <p>Handles all sheets in the file. * <p>Handles all sheets in the file.
* *
* TODO CSV Support
*
* @author Nick Burch * @author Nick Burch
* @author Derek Hulley * @author Derek Hulley
*/ */
@@ -103,7 +101,6 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
* A wrapper around the normal Tika BodyContentHandler, * A wrapper around the normal Tika BodyContentHandler,
* which causes things to be CSV encoded rather than * which causes things to be CSV encoded rather than
* tab separated * tab separated
* TODO Get rid of the extra tabs that crop up
*/ */
protected static class CsvContentHandler extends BodyContentHandler { protected static class CsvContentHandler extends BodyContentHandler {
private static final char[] comma = new char[]{ ',' }; private static final char[] comma = new char[]{ ',' };
@@ -116,6 +113,16 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
super(output); super(output);
} }
/**
 * Drops any ignorable whitespace event that consists of a single
 * tab, as those mess up the CSV output; every other ignorable
 * whitespace event is passed on to the parent handler unchanged.
 *
 * @param ch     the buffer holding the whitespace characters
 * @param start  the index in {@code ch} of the first reported character
 * @param length the number of characters reported
 * @throws SAXException if the parent handler fails
 */
@Override
public void ignorableWhitespace(char[] ch, int start, int length)
        throws SAXException {
    // The reported characters begin at ch[start], not ch[0]
    if (length == 1 && ch[start] == '\t') {
        // Ignore tabs, as they mess up the CSV output
        return;
    }
    super.ignorableWhitespace(ch, start, length);
}
@Override @Override
public void characters(char[] ch, int start, int length) public void characters(char[] ch, int start, int length)
throws SAXException { throws SAXException {
@@ -150,32 +157,28 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
public void startElement(String uri, String localName, String name, public void startElement(String uri, String localName, String name,
Attributes atts) throws SAXException { Attributes atts) throws SAXException {
if(localName.equals("td")) { if(localName.equals("td")) {
localName = "span";
name = "span";
inCell = true; inCell = true;
if(needsComma) { if(needsComma) {
super.characters(comma, 0, 1); super.characters(comma, 0, 1);
needsComma = true; needsComma = true;
} }
} else {
super.startElement(uri, localName, name, atts);
} }
super.startElement(uri, localName, name, atts);
} }
@Override @Override
public void endElement(String uri, String localName, String name) public void endElement(String uri, String localName, String name)
throws SAXException { throws SAXException {
if(localName.equals("td")) { if(localName.equals("td")) {
localName = "span";
name = "span";
needsComma = true; needsComma = true;
inCell = false; inCell = false;
} else {
if(localName.equals("tr")) {
needsComma = false;
}
super.endElement(uri, localName, name);
} }
if(localName.equals("tr")) {
needsComma = false;
}
super.endElement(uri, localName, name);
} }
} }
} }

View File

@@ -79,14 +79,26 @@ public class PoiHssfContentTransformerTest extends TikaPoweredContentTransformer
ContentReader targetReader = targetWriter.getReader(); ContentReader targetReader = targetWriter.getReader();
String checkContent = targetReader.getContentString(); String checkContent = targetReader.getContentString();
System.err.println(checkContent);
additionalContentCheck(
MimetypeMap.MIMETYPE_EXCEL,
MimetypeMap.MIMETYPE_TEXT_CSV,
checkContent
);
} }
@Override @Override
protected void additionalContentCheck(String sourceMimetype, protected void additionalContentCheck(String sourceMimetype,
String targetMimetype, String contents) { String targetMimetype, String contents) {
if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_CSV)) { if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_CSV)) {
System.err.println(contents); assertTrue(
"Content not properly CSV'd",
contents.contains("1,2,2")
);
assertTrue(
"Content not properly CSV'd",
contents.contains("\"The\",\"quick\",\"brown\",\"fox\"")
);
} else { } else {
super.additionalContentCheck(sourceMimetype, targetMimetype, contents); super.additionalContentCheck(sourceMimetype, targetMimetype, contents);
} }

View File

@@ -32,6 +32,11 @@ import org.textmining.extraction.word.WordTextExtractorFactory;
* Makes use of the {@link http://www.textmining.org/ TextMining} library to * Makes use of the {@link http://www.textmining.org/ TextMining} library to
* perform conversions from MSWord documents to text. * perform conversions from MSWord documents to text.
* *
* Doesn't currently use {@link http://tika.apache.org/ Apache Tika} to
* do this, as Tika can't handle Word 6 or Word 95 documents, only
* Word 97, 2000, 2003, 2007 and 2010.
* Once Tika does support these older formats, we can switch to it.
*
* @author Derek Hulley * @author Derek Hulley
*/ */
public class TextMiningContentTransformer extends AbstractContentTransformer2 public class TextMiningContentTransformer extends AbstractContentTransformer2

Binary file not shown.

Binary file not shown.