mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-08-07 17:49:17 +00:00
Remove the unsupported Text Mining library
Word -> Text is now handled by POI, which has been upgraded to include Word 6 and Word 95 support. (In a month or so we can switch to Tika, but that needs another formal POI beta release to occur first.)
git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20928 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -25,27 +25,28 @@ import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
import org.textmining.extraction.TextExtractor;
|
||||
import org.textmining.extraction.word.WordTextExtractorFactory;
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.hwpf.OldWordFileFormatException;
|
||||
import org.apache.poi.hwpf.extractor.Word6Extractor;
|
||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
* Makes use of the {@link http://www.textmining.org/ TextMining} library to
|
||||
* perform conversions from MSWord documents to text.
|
||||
* This badly named transformer turns Microsoft Word documents
|
||||
* (Word 6, 95, 97, 2000, 2003) into plain text.
|
||||
*
|
||||
* Doesn't currently use <a href="http://tika.apache.org/">Apache Tika</a> to
|
||||
* do this, as Tika can't handle Word 6 or Word 95 documents, only
|
||||
* Word 97, 2000, 2003, 2007 and 2010.
|
||||
* Once Tika does support these older formats, we can switch to it.
|
||||
* do this, pending TIKA-408. When Apache POI 3.7 beta 2 has been
|
||||
* released, we can switch to Tika and then handle Word 6,
|
||||
* Word 95, Word 97, 2000, 2003, 2007 and 2010 formats.
|
||||
* TODO Switch to Tika in August 2010
|
||||
*
|
||||
* @author Derek Hulley
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public class TextMiningContentTransformer extends AbstractContentTransformer2
|
||||
{
|
||||
private WordTextExtractorFactory wordExtractorFactory;
|
||||
|
||||
public TextMiningContentTransformer()
|
||||
{
|
||||
this.wordExtractorFactory = new WordTextExtractorFactory();
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -68,13 +69,19 @@ public class TextMiningContentTransformer extends AbstractContentTransformer2
|
||||
public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options)
|
||||
throws Exception
|
||||
{
|
||||
POIOLE2TextExtractor extractor = null;
|
||||
InputStream is = null;
|
||||
String text = null;
|
||||
try
|
||||
{
|
||||
is = reader.getContentInputStream();
|
||||
TextExtractor te = wordExtractorFactory.textExtractor(is);
|
||||
text = te.getText();
|
||||
POIFSFileSystem fs = new POIFSFileSystem(is);
|
||||
try {
|
||||
extractor = new WordExtractor(fs);
|
||||
} catch(OldWordFileFormatException e) {
|
||||
extractor = new Word6Extractor(fs);
|
||||
}
|
||||
text = extractor.getText();
|
||||
}
|
||||
catch (IOException e)
|
||||
{
|
||||
|
@@ -29,6 +29,8 @@ import org.alfresco.util.TempFileProvider;
|
||||
|
||||
/**
|
||||
* @see org.alfresco.repo.content.transform.TextMiningContentTransformer
|
||||
* Note - Is actually POI (soon to be Tika), and not the
|
||||
* old and unsupported Text Mining library!
|
||||
*
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
|
Reference in New Issue
Block a user