mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-08-07 17:49:17 +00:00
Remove the unsupported Text Mining library
Word -> Text is now handled by POI, which has been upgraded to include Word 6 and Word 95 support. (In a month or so we can switch to Tika, but that needs another formal POI beta release to occur first.)
git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20928 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -25,27 +25,28 @@ import org.alfresco.repo.content.MimetypeMap;
|
|||||||
import org.alfresco.service.cmr.repository.ContentReader;
|
import org.alfresco.service.cmr.repository.ContentReader;
|
||||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||||
import org.textmining.extraction.TextExtractor;
|
import org.apache.poi.POIOLE2TextExtractor;
|
||||||
import org.textmining.extraction.word.WordTextExtractorFactory;
|
import org.apache.poi.hwpf.OldWordFileFormatException;
|
||||||
|
import org.apache.poi.hwpf.extractor.Word6Extractor;
|
||||||
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||||
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Makes use of the {@link http://www.textmining.org/ TextMining} library to
|
* This badly named transformer turns Microsoft Word documents
|
||||||
* perform conversions from MSWord documents to text.
|
* (Word 6, 95, 97, 2000, 2003) into plain text.
|
||||||
*
|
*
|
||||||
* Doesn't currently use {@link http://tika.apache.org/ Apache Tika} to
|
* Doesn't currently use {@link http://tika.apache.org/ Apache Tika} to
|
||||||
* do this, as Tika can't handle Word 6 or Word 95 documents, only
|
* do this, pending TIKA-408. When Apache POI 3.7 beta 2 has been
|
||||||
* Word 97, 2000, 2003, 2007 and 2010.
|
* released, we can switch to Tika and then handle Word 6,
|
||||||
* Once Tika does support these older formats, we can switch to it.
|
* Word 95, Word 97, 2000, 2003, 2007 and 2010 formats.
|
||||||
|
* TODO Switch to Tika in August 2010
|
||||||
*
|
*
|
||||||
* @author Derek Hulley
|
* @author Nick Burch
|
||||||
*/
|
*/
|
||||||
public class TextMiningContentTransformer extends AbstractContentTransformer2
|
public class TextMiningContentTransformer extends AbstractContentTransformer2
|
||||||
{
|
{
|
||||||
private WordTextExtractorFactory wordExtractorFactory;
|
|
||||||
|
|
||||||
public TextMiningContentTransformer()
|
public TextMiningContentTransformer()
|
||||||
{
|
{
|
||||||
this.wordExtractorFactory = new WordTextExtractorFactory();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -68,13 +69,19 @@ public class TextMiningContentTransformer extends AbstractContentTransformer2
|
|||||||
public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options)
|
public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options)
|
||||||
throws Exception
|
throws Exception
|
||||||
{
|
{
|
||||||
|
POIOLE2TextExtractor extractor = null;
|
||||||
InputStream is = null;
|
InputStream is = null;
|
||||||
String text = null;
|
String text = null;
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
is = reader.getContentInputStream();
|
is = reader.getContentInputStream();
|
||||||
TextExtractor te = wordExtractorFactory.textExtractor(is);
|
POIFSFileSystem fs = new POIFSFileSystem(is);
|
||||||
text = te.getText();
|
try {
|
||||||
|
extractor = new WordExtractor(fs);
|
||||||
|
} catch(OldWordFileFormatException e) {
|
||||||
|
extractor = new Word6Extractor(fs);
|
||||||
|
}
|
||||||
|
text = extractor.getText();
|
||||||
}
|
}
|
||||||
catch (IOException e)
|
catch (IOException e)
|
||||||
{
|
{
|
||||||
|
@@ -29,6 +29,8 @@ import org.alfresco.util.TempFileProvider;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* @see org.alfresco.repo.content.transform.TextMiningContentTransformer
|
* @see org.alfresco.repo.content.transform.TextMiningContentTransformer
|
||||||
|
* Note - Is actually POI (soon to be Tika), and not the
|
||||||
|
* old and unsupported Text Mining library!
|
||||||
*
|
*
|
||||||
* @author Derek Hulley
|
* @author Derek Hulley
|
||||||
*/
|
*/
|
||||||
|
Reference in New Issue
Block a user