Remove the unsupported Text Mining library

Word -> Text conversion is now handled by POI, which has been upgraded to include Word 6 and Word 95 support. (In a month or so we can switch to Tika, but that requires another formal POI beta release to occur first.)


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20928 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-07-02 22:20:57 +00:00
parent d2c1cc78e5
commit 8f681ca8d3
2 changed files with 22 additions and 13 deletions

View File

@@ -25,27 +25,28 @@ import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions; import org.alfresco.service.cmr.repository.TransformationOptions;
import org.textmining.extraction.TextExtractor; import org.apache.poi.POIOLE2TextExtractor;
import org.textmining.extraction.word.WordTextExtractorFactory; import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/** /**
* Makes use of the {@link http://www.textmining.org/ TextMining} library to * This badly named transformer turns Microsoft Word documents
* perform conversions from MSWord documents to text. * (Word 6, 95, 97, 2000, 2003) into plain text.
* *
* Doesn't currently use {@link http://tika.apache.org/ Apache Tika} to * Doesn't currently use {@link http://tika.apache.org/ Apache Tika} to
* do this, as Tika can't handle Word 6 or Word 95 documents, only * do this, pending TIKA-408. When Apache POI 3.7 beta 2 has been
* Word 97, 2000, 2003, 2007 and 2010. * released, we can switch to Tika and then handle Word 6,
* Once Tika does support these older formats, we can switch to it. * Word 95, Word 97, 2000, 2003, 2007 and 2010 formats.
* TODO Switch to Tika in August 2010
* *
* @author Derek Hulley * @author Nick Burch
*/ */
public class TextMiningContentTransformer extends AbstractContentTransformer2 public class TextMiningContentTransformer extends AbstractContentTransformer2
{ {
private WordTextExtractorFactory wordExtractorFactory;
public TextMiningContentTransformer() public TextMiningContentTransformer()
{ {
this.wordExtractorFactory = new WordTextExtractorFactory();
} }
/** /**
@@ -68,13 +69,19 @@ public class TextMiningContentTransformer extends AbstractContentTransformer2
public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options) public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options)
throws Exception throws Exception
{ {
POIOLE2TextExtractor extractor = null;
InputStream is = null; InputStream is = null;
String text = null; String text = null;
try try
{ {
is = reader.getContentInputStream(); is = reader.getContentInputStream();
TextExtractor te = wordExtractorFactory.textExtractor(is); POIFSFileSystem fs = new POIFSFileSystem(is);
text = te.getText(); try {
extractor = new WordExtractor(fs);
} catch(OldWordFileFormatException e) {
extractor = new Word6Extractor(fs);
}
text = extractor.getText();
} }
catch (IOException e) catch (IOException e)
{ {

View File

@@ -29,6 +29,8 @@ import org.alfresco.util.TempFileProvider;
/** /**
* @see org.alfresco.repo.content.transform.TextMiningContentTransformer * @see org.alfresco.repo.content.transform.TextMiningContentTransformer
* Note - Is actually POI (soon to be Tika), and not the
* old and unsupported Text Mining library!
* *
* @author Derek Hulley * @author Derek Hulley
*/ */