mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-08-07 17:49:17 +00:00
More Tika content transform updates
New POI-general converter, for things other than excel, and convert the PDF converter too. The POI-excel converter now does CSV properly, and notes exist for the Text mining converter on the Tika bits needed before it can be replaced. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20780 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -367,10 +367,16 @@
|
|||||||
class="org.alfresco.repo.content.transform.TikaAutoContentTransformer"
|
class="org.alfresco.repo.content.transform.TikaAutoContentTransformer"
|
||||||
parent="baseContentTransformer" />
|
parent="baseContentTransformer" />
|
||||||
|
|
||||||
|
<!-- This one does excel only -->
|
||||||
<bean id="transformer.Poi"
|
<bean id="transformer.Poi"
|
||||||
class="org.alfresco.repo.content.transform.PoiHssfContentTransformer"
|
class="org.alfresco.repo.content.transform.PoiHssfContentTransformer"
|
||||||
parent="baseContentTransformer" />
|
parent="baseContentTransformer" />
|
||||||
|
|
||||||
|
<!-- This one does the other office formats, like word and powerpoint -->
|
||||||
|
<bean id="transformer.Office"
|
||||||
|
class="org.alfresco.repo.content.transform.PoiContentTransformer"
|
||||||
|
parent="baseContentTransformer" />
|
||||||
|
|
||||||
<bean id="transformer.TextMining"
|
<bean id="transformer.TextMining"
|
||||||
class="org.alfresco.repo.content.transform.TextMiningContentTransformer"
|
class="org.alfresco.repo.content.transform.TextMiningContentTransformer"
|
||||||
parent="baseContentTransformer" >
|
parent="baseContentTransformer" >
|
||||||
|
@@ -37,6 +37,7 @@ import org.alfresco.repo.content.transform.MailContentTransformerTest;
|
|||||||
import org.alfresco.repo.content.transform.MediaWikiContentTransformerTest;
|
import org.alfresco.repo.content.transform.MediaWikiContentTransformerTest;
|
||||||
import org.alfresco.repo.content.transform.OpenOfficeContentTransformerTest;
|
import org.alfresco.repo.content.transform.OpenOfficeContentTransformerTest;
|
||||||
import org.alfresco.repo.content.transform.PdfBoxContentTransformerTest;
|
import org.alfresco.repo.content.transform.PdfBoxContentTransformerTest;
|
||||||
|
import org.alfresco.repo.content.transform.PoiContentTransformerTest;
|
||||||
import org.alfresco.repo.content.transform.PoiHssfContentTransformerTest;
|
import org.alfresco.repo.content.transform.PoiHssfContentTransformerTest;
|
||||||
import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTest;
|
import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTest;
|
||||||
import org.alfresco.repo.content.transform.StringExtractingContentTransformerTest;
|
import org.alfresco.repo.content.transform.StringExtractingContentTransformerTest;
|
||||||
@@ -104,6 +105,7 @@ public class ContentMinimalContextTestSuite extends TestSuite
|
|||||||
suite.addTestSuite(MediaWikiContentTransformerTest.class);
|
suite.addTestSuite(MediaWikiContentTransformerTest.class);
|
||||||
suite.addTestSuite(OpenOfficeContentTransformerTest.class);
|
suite.addTestSuite(OpenOfficeContentTransformerTest.class);
|
||||||
suite.addTestSuite(PdfBoxContentTransformerTest.class);
|
suite.addTestSuite(PdfBoxContentTransformerTest.class);
|
||||||
|
suite.addTestSuite(PoiContentTransformerTest.class);
|
||||||
suite.addTestSuite(PoiHssfContentTransformerTest.class);
|
suite.addTestSuite(PoiHssfContentTransformerTest.class);
|
||||||
suite.addTestSuite(RuntimeExecutableContentTransformerTest.class);
|
suite.addTestSuite(RuntimeExecutableContentTransformerTest.class);
|
||||||
suite.addTestSuite(StringExtractingContentTransformerTest.class);
|
suite.addTestSuite(StringExtractingContentTransformerTest.class);
|
||||||
|
@@ -43,8 +43,10 @@ import org.springframework.beans.factory.InitializingBean;
|
|||||||
import org.springframework.core.io.DefaultResourceLoader;
|
import org.springframework.core.io.DefaultResourceLoader;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Makes use of the {@link http://sourceforge.net/projects/joott/JOOConverter} library to perform OpenOffice-drive
|
* Makes use of the {@link http://sourceforge.net/projects/joott/JOOConverter} library to perform
|
||||||
* conversions.
|
* OpenOffice-driven conversions.
|
||||||
|
* This requires that OpenOffice be running, but delivers a wider range of transformations
|
||||||
|
* than Tika is able to (Tika just translates into Text, HTML and XML)
|
||||||
*
|
*
|
||||||
* @author Derek Hulley
|
* @author Derek Hulley
|
||||||
*/
|
*/
|
||||||
|
@@ -18,71 +18,28 @@
|
|||||||
*/
|
*/
|
||||||
package org.alfresco.repo.content.transform;
|
package org.alfresco.repo.content.transform;
|
||||||
|
|
||||||
import java.io.InputStream;
|
|
||||||
|
|
||||||
import org.alfresco.repo.content.MimetypeMap;
|
import org.alfresco.repo.content.MimetypeMap;
|
||||||
import org.alfresco.service.cmr.repository.ContentReader;
|
import org.apache.tika.parser.Parser;
|
||||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
import org.apache.tika.parser.pdf.PDFParser;
|
||||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
||||||
import org.apache.pdfbox.util.PDFTextStripper;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Makes use of the {@link http://www.pdfbox.org/ PDFBox} library to
|
* Uses {@link http://tika.apache.org/ Apache Tika} and
|
||||||
* perform conversions from PDF files to text.
|
* {@link http://pdfbox.apache.org/ Apache PDFBox} to perform
|
||||||
|
* conversions from PDF documents.
|
||||||
*
|
*
|
||||||
|
* @author Nick Burch
|
||||||
* @author Derek Hulley
|
* @author Derek Hulley
|
||||||
*/
|
*/
|
||||||
public class PdfBoxContentTransformer extends AbstractContentTransformer2
|
public class PdfBoxContentTransformer extends TikaPoweredContentTransformer
|
||||||
{
|
{
|
||||||
/**
|
public PdfBoxContentTransformer() {
|
||||||
* Currently the only transformation performed is that of text extraction from PDF documents.
|
super(new String[] {
|
||||||
*/
|
MimetypeMap.MIMETYPE_PDF
|
||||||
public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options)
|
});
|
||||||
{
|
|
||||||
// TODO: Expand PDFBox usage to convert images to PDF and investigate other conversions
|
|
||||||
|
|
||||||
if (!MimetypeMap.MIMETYPE_PDF.equals(sourceMimetype) ||
|
|
||||||
!MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
|
|
||||||
{
|
|
||||||
// only support PDF -> Text
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void transformInternal(
|
@Override
|
||||||
ContentReader reader,
|
protected Parser getParser() {
|
||||||
ContentWriter writer,
|
return new PDFParser();
|
||||||
TransformationOptions options) throws Exception
|
|
||||||
{
|
|
||||||
PDDocument pdf = null;
|
|
||||||
InputStream is = null;
|
|
||||||
try
|
|
||||||
{
|
|
||||||
is = reader.getContentInputStream();
|
|
||||||
// stream the document in
|
|
||||||
pdf = PDDocument.load(is);
|
|
||||||
// strip the text out
|
|
||||||
PDFTextStripper stripper = new PDFTextStripper();
|
|
||||||
String text = stripper.getText(pdf);
|
|
||||||
|
|
||||||
// dump it all to the writer
|
|
||||||
writer.putContent(text);
|
|
||||||
}
|
|
||||||
finally
|
|
||||||
{
|
|
||||||
if (pdf != null)
|
|
||||||
{
|
|
||||||
try { pdf.close(); } catch (Throwable e) {e.printStackTrace(); }
|
|
||||||
}
|
|
||||||
if (is != null)
|
|
||||||
{
|
|
||||||
try { is.close(); } catch (Throwable e) {e.printStackTrace(); }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -50,5 +50,7 @@ public class PdfBoxContentTransformerTest extends AbstractContentTransformerTest
|
|||||||
{
|
{
|
||||||
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_PDF, new TransformationOptions()));
|
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_PDF, new TransformationOptions()));
|
||||||
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PDF, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -0,0 +1,69 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||||
|
*
|
||||||
|
* This file is part of Alfresco
|
||||||
|
*
|
||||||
|
* Alfresco is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Alfresco is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
package org.alfresco.repo.content.transform;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
import org.alfresco.repo.content.MimetypeMap;
|
||||||
|
import org.apache.tika.mime.MediaType;
|
||||||
|
import org.apache.tika.parser.Parser;
|
||||||
|
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Uses {@link http://tika.apache.org/ Apache Tika} and
|
||||||
|
* {@link http://poi.apache.org/ Apache POI} to perform
|
||||||
|
* conversions from Office documents.
|
||||||
|
*
|
||||||
|
* {@link PoiHssfContentTransformer} handles the Excel
|
||||||
|
* transformations (mostly for compatibility), while
|
||||||
|
* this does all the other Office file formats.
|
||||||
|
*
|
||||||
|
* @author Nick Burch
|
||||||
|
*/
|
||||||
|
public class PoiContentTransformer extends TikaPoweredContentTransformer
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* We support all the office mimetypes that the Tika
|
||||||
|
* office parser can handle, except for excel
|
||||||
|
(handled by {@link PoiHssfContentTransformer})
|
||||||
|
*/
|
||||||
|
public static ArrayList<String> SUPPORTED_MIMETYPES;
|
||||||
|
static {
|
||||||
|
SUPPORTED_MIMETYPES = new ArrayList<String>();
|
||||||
|
OfficeParser p = new OfficeParser();
|
||||||
|
for(MediaType mt : p.getSupportedTypes(null)) {
|
||||||
|
if(mt.toString().equals(MimetypeMap.MIMETYPE_EXCEL))
|
||||||
|
{
|
||||||
|
// Skip, handled elsewhere
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Tika can probably do some useful text
|
||||||
|
SUPPORTED_MIMETYPES.add( mt.toString() );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public PoiContentTransformer() {
|
||||||
|
super(SUPPORTED_MIMETYPES);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Parser getParser() {
|
||||||
|
return new OfficeParser();
|
||||||
|
}
|
||||||
|
}
|
@@ -0,0 +1,69 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||||
|
*
|
||||||
|
* This file is part of Alfresco
|
||||||
|
*
|
||||||
|
* Alfresco is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Alfresco is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
package org.alfresco.repo.content.transform;
|
||||||
|
|
||||||
|
import org.alfresco.repo.content.MimetypeMap;
|
||||||
|
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @see org.alfresco.repo.content.transform.PoiContentTransformer
|
||||||
|
*
|
||||||
|
* @author Nick Burch
|
||||||
|
*/
|
||||||
|
public class PoiContentTransformerTest extends AbstractContentTransformerTest
|
||||||
|
{
|
||||||
|
private ContentTransformer transformer;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setUp() throws Exception
|
||||||
|
{
|
||||||
|
super.setUp();
|
||||||
|
|
||||||
|
transformer = new PoiContentTransformer();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return Returns the same transformer regardless of the source and target mimetypes - it is allowed
|
||||||
|
*/
|
||||||
|
protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
|
||||||
|
{
|
||||||
|
return transformer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testIsTransformable() throws Exception
|
||||||
|
{
|
||||||
|
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_WORD, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||||
|
|
||||||
|
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_PPT, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PPT, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PPT, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PPT, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||||
|
|
||||||
|
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OUTLOOK_MSG, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OUTLOOK_MSG, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OUTLOOK_MSG, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
|
||||||
|
assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OUTLOOK_MSG, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
|
||||||
|
|
||||||
|
// Doesn't claim excel
|
||||||
|
assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
|
||||||
|
}
|
||||||
|
}
|
@@ -42,8 +42,6 @@ import org.xml.sax.SAXException;
|
|||||||
* Xml or Text (space or comma separated)
|
* Xml or Text (space or comma separated)
|
||||||
* <p>Handles all sheets in the file.
|
* <p>Handles all sheets in the file.
|
||||||
*
|
*
|
||||||
* TODO CSV Support
|
|
||||||
*
|
|
||||||
* @author Nick Burch
|
* @author Nick Burch
|
||||||
* @author Derek Hulley
|
* @author Derek Hulley
|
||||||
*/
|
*/
|
||||||
@@ -103,7 +101,6 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
|
|||||||
* A wrapper around the normal Tika BodyContentHandler,
|
* A wrapper around the normal Tika BodyContentHandler,
|
||||||
* which causes things to be CSV encoded rather than
|
* which causes things to be CSV encoded rather than
|
||||||
* tab separated
|
* tab separated
|
||||||
* TODO Get rid of the extra tabs that crop up
|
|
||||||
*/
|
*/
|
||||||
protected static class CsvContentHandler extends BodyContentHandler {
|
protected static class CsvContentHandler extends BodyContentHandler {
|
||||||
private static final char[] comma = new char[]{ ',' };
|
private static final char[] comma = new char[]{ ',' };
|
||||||
@@ -116,6 +113,16 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
|
|||||||
super(output);
|
super(output);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void ignorableWhitespace(char[] ch, int start, int length)
|
||||||
|
throws SAXException {
|
||||||
|
if(length == 1 && ch[0] == '\t') {
|
||||||
|
// Ignore tabs, as they mess up the CSV output
|
||||||
|
} else {
|
||||||
|
super.ignorableWhitespace(ch, start, length);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void characters(char[] ch, int start, int length)
|
public void characters(char[] ch, int start, int length)
|
||||||
throws SAXException {
|
throws SAXException {
|
||||||
@@ -150,32 +157,28 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
|
|||||||
public void startElement(String uri, String localName, String name,
|
public void startElement(String uri, String localName, String name,
|
||||||
Attributes atts) throws SAXException {
|
Attributes atts) throws SAXException {
|
||||||
if(localName.equals("td")) {
|
if(localName.equals("td")) {
|
||||||
localName = "span";
|
|
||||||
name = "span";
|
|
||||||
|
|
||||||
inCell = true;
|
inCell = true;
|
||||||
if(needsComma) {
|
if(needsComma) {
|
||||||
super.characters(comma, 0, 1);
|
super.characters(comma, 0, 1);
|
||||||
needsComma = true;
|
needsComma = true;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
super.startElement(uri, localName, name, atts);
|
||||||
}
|
}
|
||||||
super.startElement(uri, localName, name, atts);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void endElement(String uri, String localName, String name)
|
public void endElement(String uri, String localName, String name)
|
||||||
throws SAXException {
|
throws SAXException {
|
||||||
if(localName.equals("td")) {
|
if(localName.equals("td")) {
|
||||||
localName = "span";
|
|
||||||
name = "span";
|
|
||||||
|
|
||||||
needsComma = true;
|
needsComma = true;
|
||||||
inCell = false;
|
inCell = false;
|
||||||
|
} else {
|
||||||
|
if(localName.equals("tr")) {
|
||||||
|
needsComma = false;
|
||||||
|
}
|
||||||
|
super.endElement(uri, localName, name);
|
||||||
}
|
}
|
||||||
if(localName.equals("tr")) {
|
|
||||||
needsComma = false;
|
|
||||||
}
|
|
||||||
super.endElement(uri, localName, name);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -79,14 +79,26 @@ public class PoiHssfContentTransformerTest extends TikaPoweredContentTransformer
|
|||||||
|
|
||||||
ContentReader targetReader = targetWriter.getReader();
|
ContentReader targetReader = targetWriter.getReader();
|
||||||
String checkContent = targetReader.getContentString();
|
String checkContent = targetReader.getContentString();
|
||||||
System.err.println(checkContent);
|
|
||||||
|
additionalContentCheck(
|
||||||
|
MimetypeMap.MIMETYPE_EXCEL,
|
||||||
|
MimetypeMap.MIMETYPE_TEXT_CSV,
|
||||||
|
checkContent
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void additionalContentCheck(String sourceMimetype,
|
protected void additionalContentCheck(String sourceMimetype,
|
||||||
String targetMimetype, String contents) {
|
String targetMimetype, String contents) {
|
||||||
if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_CSV)) {
|
if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_CSV)) {
|
||||||
System.err.println(contents);
|
assertTrue(
|
||||||
|
"Content not properly CSV'd",
|
||||||
|
contents.contains("1,2,2")
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
"Content not properly CSV'd",
|
||||||
|
contents.contains("\"The\",\"quick\",\"brown\",\"fox\"")
|
||||||
|
);
|
||||||
} else {
|
} else {
|
||||||
super.additionalContentCheck(sourceMimetype, targetMimetype, contents);
|
super.additionalContentCheck(sourceMimetype, targetMimetype, contents);
|
||||||
}
|
}
|
||||||
|
@@ -32,6 +32,11 @@ import org.textmining.extraction.word.WordTextExtractorFactory;
|
|||||||
* Makes use of the {@link http://www.textmining.org/ TextMining} library to
|
* Makes use of the {@link http://www.textmining.org/ TextMining} library to
|
||||||
* perform conversions from MSWord documents to text.
|
* perform conversions from MSWord documents to text.
|
||||||
*
|
*
|
||||||
|
* Doesn't currently use {@link http://tika.apache.org/ Apache Tika} to
|
||||||
|
* do this, as Tika can't handle Word 6 or Word 95 documents, only
|
||||||
|
* Word 97, 2000, 2003, 2007 and 2010.
|
||||||
|
* Once Tika does support these older formats, we can switch to it.
|
||||||
|
*
|
||||||
* @author Derek Hulley
|
* @author Derek Hulley
|
||||||
*/
|
*/
|
||||||
public class TextMiningContentTransformer extends AbstractContentTransformer2
|
public class TextMiningContentTransformer extends AbstractContentTransformer2
|
||||||
|
BIN
source/test-resources/quick/quick6.doc
Normal file
BIN
source/test-resources/quick/quick6.doc
Normal file
Binary file not shown.
BIN
source/test-resources/quick/quick95.doc
Normal file
BIN
source/test-resources/quick/quick95.doc
Normal file
Binary file not shown.
Reference in New Issue
Block a user