Remove the unsupported Text Mining library

Word -> Text conversion is now handled by POI, which has been upgraded to include Word 6 and Word 95 support. (In a month or so we can switch to Tika, but that requires another formal POI beta release first.)


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20928 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-07-02 22:20:57 +00:00
parent d2c1cc78e5
commit 8f681ca8d3
2 changed files with 22 additions and 13 deletions

View File

@@ -25,27 +25,28 @@ import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.textmining.extraction.TextExtractor;
import org.textmining.extraction.word.WordTextExtractorFactory;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
* Makes use of the {@link http://www.textmining.org/ TextMining} library to
* perform conversions from MSWord documents to text.
* This badly named transformer turns Microsoft Word documents
* (Word 6, 95, 97, 2000, 2003) into plain text.
*
* Doesn't currently use {@link http://tika.apache.org/ Apache Tika} to
* do this, as Tika can't handle Word 6 or Word 95 documents, only
* Word 97, 2000, 2003, 2007 and 2010.
* Once Tika does support these older formats, we can switch to it.
* do this, pending TIKA-408. When Apache POI 3.7 beta 2 has been
* released, we can switch to Tika and then handle Word 6,
* Word 95, Word 97, 2000, 2003, 2007 and 2010 formats.
* TODO Switch to Tika in August 2010
*
* @author Derek Hulley
* @author Nick Burch
*/
public class TextMiningContentTransformer extends AbstractContentTransformer2
{
private WordTextExtractorFactory wordExtractorFactory;
public TextMiningContentTransformer()
{
this.wordExtractorFactory = new WordTextExtractorFactory();
}
/**
@@ -68,13 +69,19 @@ public class TextMiningContentTransformer extends AbstractContentTransformer2
public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options)
throws Exception
{
POIOLE2TextExtractor extractor = null;
InputStream is = null;
String text = null;
try
{
is = reader.getContentInputStream();
TextExtractor te = wordExtractorFactory.textExtractor(is);
text = te.getText();
POIFSFileSystem fs = new POIFSFileSystem(is);
try {
extractor = new WordExtractor(fs);
} catch(OldWordFileFormatException e) {
extractor = new Word6Extractor(fs);
}
text = extractor.getText();
}
catch (IOException e)
{

View File

@@ -29,6 +29,8 @@ import org.alfresco.util.TempFileProvider;
/**
* @see org.alfresco.repo.content.transform.TextMiningContentTransformer
* Note - Is actually POI (soon to be Tika), and not the
* old and unsupported Text Mining library!
*
* @author Derek Hulley
*/