mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-08-07 17:49:17 +00:00
More Tika content transform updates
New POI-general converter, for things other than excel, and convert the PDF converter too. The POI-excel converter now does CSV properly, and notes exist for the Text mining converter on the Tika bits needed before it can be replaced. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20780 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -367,10 +367,16 @@
|
||||
class="org.alfresco.repo.content.transform.TikaAutoContentTransformer"
|
||||
parent="baseContentTransformer" />
|
||||
|
||||
<!-- This one does excel only -->
|
||||
<bean id="transformer.Poi"
|
||||
class="org.alfresco.repo.content.transform.PoiHssfContentTransformer"
|
||||
parent="baseContentTransformer" />
|
||||
|
||||
<!-- This one does the other office formats, like word and powerpoint -->
|
||||
<bean id="transformer.Office"
|
||||
class="org.alfresco.repo.content.transform.PoiContentTransformer"
|
||||
parent="baseContentTransformer" />
|
||||
|
||||
<bean id="transformer.TextMining"
|
||||
class="org.alfresco.repo.content.transform.TextMiningContentTransformer"
|
||||
parent="baseContentTransformer" >
|
||||
|
@@ -37,6 +37,7 @@ import org.alfresco.repo.content.transform.MailContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.MediaWikiContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.OpenOfficeContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.PdfBoxContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.PoiContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.PoiHssfContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.StringExtractingContentTransformerTest;
|
||||
@@ -104,6 +105,7 @@ public class ContentMinimalContextTestSuite extends TestSuite
|
||||
suite.addTestSuite(MediaWikiContentTransformerTest.class);
|
||||
suite.addTestSuite(OpenOfficeContentTransformerTest.class);
|
||||
suite.addTestSuite(PdfBoxContentTransformerTest.class);
|
||||
suite.addTestSuite(PoiContentTransformerTest.class);
|
||||
suite.addTestSuite(PoiHssfContentTransformerTest.class);
|
||||
suite.addTestSuite(RuntimeExecutableContentTransformerTest.class);
|
||||
suite.addTestSuite(StringExtractingContentTransformerTest.class);
|
||||
|
@@ -43,8 +43,10 @@ import org.springframework.beans.factory.InitializingBean;
|
||||
import org.springframework.core.io.DefaultResourceLoader;
|
||||
|
||||
/**
|
||||
* Makes use of the {@link http://sourceforge.net/projects/joott/JOOConverter} library to perform OpenOffice-drive
|
||||
* conversions.
|
||||
* Makes use of the <a href="http://sourceforge.net/projects/joott/">JOOConverter</a> library to perform
|
||||
* OpenOffice-driven conversions.
|
||||
* This requires that OpenOffice be running, but delivers a wider range of transformations
|
||||
* than Tika is able to (Tika just translates into Text, HTML and XML)
|
||||
*
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
|
@@ -18,71 +18,28 @@
|
||||
*/
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.util.PDFTextStripper;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.pdf.PDFParser;
|
||||
|
||||
/**
|
||||
* Makes use of the {@link http://www.pdfbox.org/ PDFBox} library to
|
||||
* perform conversions from PDF files to text.
|
||||
* Uses <a href="http://tika.apache.org/">Apache Tika</a> and
|
||||
* <a href="http://pdfbox.apache.org/">Apache PDFBox</a> to perform
|
||||
* conversions from PDF documents.
|
||||
*
|
||||
* @author Nick Burch
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
public class PdfBoxContentTransformer extends AbstractContentTransformer2
|
||||
public class PdfBoxContentTransformer extends TikaPoweredContentTransformer
|
||||
{
|
||||
/**
|
||||
* Currently the only transformation performed is that of text extraction from PDF documents.
|
||||
*/
|
||||
public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options)
|
||||
{
|
||||
// TODO: Expand PDFBox usage to convert images to PDF and investigate other conversions
|
||||
|
||||
if (!MimetypeMap.MIMETYPE_PDF.equals(sourceMimetype) ||
|
||||
!MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
|
||||
{
|
||||
// only support PDF -> Text
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
return true;
|
||||
}
|
||||
public PdfBoxContentTransformer() {
|
||||
super(new String[] {
|
||||
MimetypeMap.MIMETYPE_PDF
|
||||
});
|
||||
}
|
||||
|
||||
protected void transformInternal(
|
||||
ContentReader reader,
|
||||
ContentWriter writer,
|
||||
TransformationOptions options) throws Exception
|
||||
{
|
||||
PDDocument pdf = null;
|
||||
InputStream is = null;
|
||||
try
|
||||
{
|
||||
is = reader.getContentInputStream();
|
||||
// stream the document in
|
||||
pdf = PDDocument.load(is);
|
||||
// strip the text out
|
||||
PDFTextStripper stripper = new PDFTextStripper();
|
||||
String text = stripper.getText(pdf);
|
||||
|
||||
// dump it all to the writer
|
||||
writer.putContent(text);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (pdf != null)
|
||||
{
|
||||
try { pdf.close(); } catch (Throwable e) {e.printStackTrace(); }
|
||||
}
|
||||
if (is != null)
|
||||
{
|
||||
try { is.close(); } catch (Throwable e) {e.printStackTrace(); }
|
||||
}
|
||||
}
|
||||
@Override
|
||||
protected Parser getParser() {
|
||||
return new PDFParser();
|
||||
}
|
||||
}
|
||||
|
@@ -50,5 +50,7 @@ public class PdfBoxContentTransformerTest extends AbstractContentTransformerTest
|
||||
{
|
||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_PDF, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,69 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||
|
||||
/**
|
||||
* Uses {@link http://tika.apache.org/ Apache Tika} and
|
||||
* {@link http://poi.apache.org/ Apache POI} to perform
|
||||
* conversions from Office documents.
|
||||
*
|
||||
* {@link PoiHssfContentTransformer} handles the Excel
|
||||
* transformations (mostly for compatibility), while
|
||||
* this does all the other Office file formats.
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public class PoiContentTransformer extends TikaPoweredContentTransformer
|
||||
{
|
||||
/**
|
||||
* We support all the office mimetypes that the Tika
|
||||
* office parser can handle, except for excel
|
||||
* (handled by {@link PoiHssfContentTransformer}
|
||||
*/
|
||||
public static ArrayList<String> SUPPORTED_MIMETYPES;
|
||||
static {
|
||||
SUPPORTED_MIMETYPES = new ArrayList<String>();
|
||||
OfficeParser p = new OfficeParser();
|
||||
for(MediaType mt : p.getSupportedTypes(null)) {
|
||||
if(mt.toString().equals(MimetypeMap.MIMETYPE_EXCEL))
|
||||
{
|
||||
// Skip, handled elsewhere
|
||||
continue;
|
||||
}
|
||||
// Tika can probably do some useful text
|
||||
SUPPORTED_MIMETYPES.add( mt.toString() );
|
||||
}
|
||||
}
|
||||
|
||||
public PoiContentTransformer() {
|
||||
super(SUPPORTED_MIMETYPES);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Parser getParser() {
|
||||
return new OfficeParser();
|
||||
}
|
||||
}
|
@@ -0,0 +1,69 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
|
||||
/**
|
||||
* @see org.alfresco.repo.content.transform.PoiContentTransformer
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public class PoiContentTransformerTest extends AbstractContentTransformerTest
|
||||
{
|
||||
private ContentTransformer transformer;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception
|
||||
{
|
||||
super.setUp();
|
||||
|
||||
transformer = new PoiContentTransformer();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the same transformer regardless - it is allowed
|
||||
*/
|
||||
protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
|
||||
{
|
||||
return transformer;
|
||||
}
|
||||
|
||||
public void testIsTransformable() throws Exception
|
||||
{
|
||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_WORD, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||
|
||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_PPT, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PPT, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PPT, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PPT, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||
|
||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OUTLOOK_MSG, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OUTLOOK_MSG, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OUTLOOK_MSG, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OUTLOOK_MSG, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||
|
||||
// Doesn't claim excel
|
||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||
}
|
||||
}
|
@@ -42,8 +42,6 @@ import org.xml.sax.SAXException;
|
||||
* Xml or Text (space or comma separated)
|
||||
* <p>Handles all sheets in the file.
|
||||
*
|
||||
* TODO CSV Support
|
||||
*
|
||||
* @author Nick Burch
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
@@ -103,7 +101,6 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
|
||||
* A wrapper around the normal Tika BodyContentHandler,
|
||||
* which causes things to be CSV encoded rather than
|
||||
* tab separated
|
||||
* TODO Get rid of the extra tabs that crop up
|
||||
*/
|
||||
protected static class CsvContentHandler extends BodyContentHandler {
|
||||
private static final char[] comma = new char[]{ ',' };
|
||||
@@ -116,6 +113,16 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
|
||||
super(output);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void ignorableWhitespace(char[] ch, int start, int length)
|
||||
throws SAXException {
|
||||
if(length == 1 && ch[0] == '\t') {
|
||||
// Ignore tabs, as they mess up the CSV output
|
||||
} else {
|
||||
super.ignorableWhitespace(ch, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void characters(char[] ch, int start, int length)
|
||||
throws SAXException {
|
||||
@@ -150,32 +157,28 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
|
||||
public void startElement(String uri, String localName, String name,
|
||||
Attributes atts) throws SAXException {
|
||||
if(localName.equals("td")) {
|
||||
localName = "span";
|
||||
name = "span";
|
||||
|
||||
inCell = true;
|
||||
if(needsComma) {
|
||||
super.characters(comma, 0, 1);
|
||||
needsComma = true;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
super.startElement(uri, localName, name, atts);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void endElement(String uri, String localName, String name)
|
||||
throws SAXException {
|
||||
if(localName.equals("td")) {
|
||||
localName = "span";
|
||||
name = "span";
|
||||
|
||||
needsComma = true;
|
||||
inCell = false;
|
||||
}
|
||||
} else {
|
||||
if(localName.equals("tr")) {
|
||||
needsComma = false;
|
||||
}
|
||||
super.endElement(uri, localName, name);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -79,14 +79,26 @@ public class PoiHssfContentTransformerTest extends TikaPoweredContentTransformer
|
||||
|
||||
ContentReader targetReader = targetWriter.getReader();
|
||||
String checkContent = targetReader.getContentString();
|
||||
System.err.println(checkContent);
|
||||
|
||||
additionalContentCheck(
|
||||
MimetypeMap.MIMETYPE_EXCEL,
|
||||
MimetypeMap.MIMETYPE_TEXT_CSV,
|
||||
checkContent
|
||||
);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void additionalContentCheck(String sourceMimetype,
|
||||
String targetMimetype, String contents) {
|
||||
if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_CSV)) {
|
||||
System.err.println(contents);
|
||||
assertTrue(
|
||||
"Content not properly CSV'd",
|
||||
contents.contains("1,2,2")
|
||||
);
|
||||
assertTrue(
|
||||
"Content not properly CSV'd",
|
||||
contents.contains("\"The\",\"quick\",\"brown\",\"fox\"")
|
||||
);
|
||||
} else {
|
||||
super.additionalContentCheck(sourceMimetype, targetMimetype, contents);
|
||||
}
|
||||
|
@@ -32,6 +32,11 @@ import org.textmining.extraction.word.WordTextExtractorFactory;
|
||||
* Makes use of the <a href="http://www.textmining.org/">TextMining</a> library to
|
||||
* perform conversions from MSWord documents to text.
|
||||
*
|
||||
* Doesn't currently use <a href="http://tika.apache.org/">Apache Tika</a> to
|
||||
* do this, as Tika can't handle Word 6 or Word 95 documents, only
|
||||
* Word 97, 2000, 2003, 2007 and 2010.
|
||||
* Once Tika does support these older formats, we can switch to it.
|
||||
*
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
public class TextMiningContentTransformer extends AbstractContentTransformer2
|
||||
|
BIN
source/test-resources/quick/quick6.doc
Normal file
BIN
source/test-resources/quick/quick6.doc
Normal file
Binary file not shown.
BIN
source/test-resources/quick/quick95.doc
Normal file
BIN
source/test-resources/quick/quick95.doc
Normal file
Binary file not shown.
Reference in New Issue
Block a user