Remove the unsupported Text Mining library

Word-to-text conversion is now handled by POI, which has been upgraded to include Word 6 and Word 95 support. (In a month or so we can switch to Tika, but that requires another formal POI beta release first.) git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20928 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
2025-08-07 17:49:17 +00:00 · 2010-07-02 22:20:57 +00:00
parent d2c1cc78e5
commit 8f681ca8d3
2 changed files with 22 additions and 13 deletions
--- a/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformer.java
@@ -25,27 +25,28 @@ import org.alfresco.repo.content.MimetypeMap;
 import org.alfresco.service.cmr.repository.ContentReader;
 import org.alfresco.service.cmr.repository.ContentWriter;
 import org.alfresco.service.cmr.repository.TransformationOptions;
-import org.textmining.extraction.TextExtractor;
-import org.textmining.extraction.word.WordTextExtractorFactory;
+import org.apache.poi.POIOLE2TextExtractor;
+import org.apache.poi.hwpf.OldWordFileFormatException;
+import org.apache.poi.hwpf.extractor.Word6Extractor;
+import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;

 /**
- * Makes use of the {@link http://www.textmining.org/ TextMining} library to
- * perform conversions from MSWord documents to text.
+ * This badly named transformer turns Microsoft Word documents
+ *  (Word 6, 95, 97, 2000, 2003) into plain text.
 * 
 * Doesn't currently use {@link http://tika.apache.org/ Apache Tika} to
- *  do this, as Tika can't handle Word 6 or Word 95 documents, only
- *  Word 97, 2000, 2003, 2007 and 2010.
- * Once Tika does support these older formats, we can switch to it.
+ *  do this, pending TIKA-408. When Apache POI 3.7 beta 2 has been
+ *  released, we can switch to Tika and then handle Word 6,
+ *  Word 95, Word 97, 2000, 2003, 2007 and 2010 formats.
+ * TODO Switch to Tika in August 2010
 * 
- * @author Derek Hulley
+ * @author Nick Burch
 */
 public class TextMiningContentTransformer extends AbstractContentTransformer2
 {
-    private WordTextExtractorFactory wordExtractorFactory;
-    
    public TextMiningContentTransformer()
    {
-        this.wordExtractorFactory = new WordTextExtractorFactory();
    }
    
    /**
@@ -68,13 +69,19 @@ public class TextMiningContentTransformer extends AbstractContentTransformer2
    public void transformInternal(ContentReader reader, ContentWriter writer,  TransformationOptions options)
            throws Exception
    {
+       POIOLE2TextExtractor extractor = null;
        InputStream is = null;
        String text = null;
        try
        {
            is = reader.getContentInputStream();
-            TextExtractor te = wordExtractorFactory.textExtractor(is);
-            text = te.getText();
+            POIFSFileSystem fs = new POIFSFileSystem(is);
+            try {
+               extractor = new WordExtractor(fs);
+            } catch(OldWordFileFormatException e) {
+               extractor = new Word6Extractor(fs);
+            }
+            text = extractor.getText();
        }
        catch (IOException e)
        {
--- a/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformerTest.java
+++ b/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformerTest.java
@@ -29,6 +29,8 @@ import org.alfresco.util.TempFileProvider;

 /**
 * @see org.alfresco.repo.content.transform.TextMiningContentTransformer
+ * Note - Is actually POI (soon to be Tika), and not the
+ *  old and unsupported Text Mining library!
 * 
 * @author Derek Hulley
 */