Merged DEV/SWIFT to HEAD (FTP tests, Tika and POI)

   26059: ALF-5900 - IMAP creates winmail.dat in attachment folder (Add support for Microsoft Transport Neutral Encapsulation Format.)
          - added attachment extraction for TNEF documents - goodbye winmail.dat!
   26063: Javadoc for IMAP.
   26088: ALF-7408 - addition of commons-net as the FTP client library.
          First end-to-end FTP test. Just a simple connection test for now; more detailed tests will follow.
   26176: ALF-7408 - FTP tests + disabled failing test case for ALF-7618
   26180: ALF-7618 - correction of unit test error.
   26188: ALF-7618 - added a test of paths
   26229: Added back simple '~.*' pattern
   26288: ALF-7676 - Test to stress different user rights.
          - FTPServerTest.testTwoUserUpdate added for the FTP server.
   26304: Corrected misspelled name in private class.
   26408: adding minimal package-info files.
   26416: ALF-5082 / ALF-2183 / ALF-4448 - When guessing the mimetype for a file, add the option to supply a ContentReader to enhance the accuracy.
          Enable this for a few key places that do mimetype guessing,
          which should avoid issues for files with the wrong extension (either renamed accidentally, or .TMP files)
   26433: Re-order the mimetype guess step to ensure that the Content Reader is always valid
   26440: Added another test for Word 2003 'Save As'.
   26441: Test resource for ContentDiskDriver
   26446: ALF-5082 - Back out a FileFolderService change to mimetype guessing, which had broken things, pending a better way to do it with ContentWriter
   26490: Small change for ContentDiskDriverTest.fileExists. A leaky transaction was causing problems in the automated build.
   26497: ContentDiskDriver - commented out two of the problematic leaky transaction tests.
   26503: Add new interface methods + documentation for asking a ContentWriter to guess the mimetype and encoding for you (hedged usage sketch after this revision list).
          (Code will be migrated later from the places that currently do this themselves)
   26504: Add an extension interface in the DataModel project for some of the extra ContentReader methods that FileContentReader provides
   26505: When ContentWriter.putContent(String) is called with no encoding specified, record which system default encoding was used.
          (Prevents issues if the system default is ever changed)
   26509: When calling Tika to do file detection, if we have a file-based reader then give Tika the File rather than an InputStream (see the detection sketch after this revision list)
   26522: More debug logging while debugging ALF-5260
   26546: Have one copy of the Tika Config in Spring, rather than several places fetching their own copy of the default one (either explicitly or implicitly).
   26548: Add another mimetype check - ensures that truncated/corrupt container files which can't be fully processed can still get the container type without failure
   26549: Implement the mimetype and encoding guessers on ContentWriter (either immediately or as a listener, as required), and update FileFolderService to make use of this (+ test this)
   26553: Replace explicit mimetype and encoding guess calls with ContentWriter requests to have the work done
   26554: Replace explicit mimetype and encoding guess calls with ContentWriter requests to have the work done
   26579: Switch the transformer to use Tika
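
For revisions 26416/26509, a minimal standalone sketch of file-backed detection with Tika, assuming a Tika 0.x-era API (TikaConfig.getMimeRepository() as the Detector, and TikaInputStream wrapping the File so container formats can be read with random access). The class and method names are illustrative, not code from this merge.

import java.io.File;
import java.io.IOException;
import java.io.InputStream;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;

public class TikaFileDetectionSketch
{
    /**
     * Detect a file's mimetype by handing Tika the File itself (wrapped in a
     * TikaInputStream) rather than a plain InputStream, so container formats
     * can be inspected with random access.
     */
    public static String detectMimetype(File file, String filename) throws IOException
    {
        TikaConfig config = TikaConfig.getDefaultConfig();
        Detector detector = config.getMimeRepository();

        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, filename);

        InputStream input = TikaInputStream.get(file);
        try
        {
            MediaType type = detector.detect(input, metadata);
            return type.toString();
        }
        finally
        {
            input.close();
        }
    }
}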
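
For revisions 26503/26549, a hypothetical usage sketch of the new ContentWriter guessing methods; the names guessMimetype(String) and guessEncoding() are assumed from the revision notes above, and the surrounding service calls are illustrative only.

import org.alfresco.model.ContentModel;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentService;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.NodeRef;

public class GuessingWriterSketch
{
    /**
     * Hypothetical usage of the ContentWriter guessing methods described in
     * revisions 26503/26549 -- method names assumed, not taken from the diff.
     */
    public static void writeWithGuessing(ContentService contentService, NodeRef nodeRef,
            String fileName, ContentReader sourceReader)
    {
        ContentWriter writer = contentService.getWriter(nodeRef, ContentModel.PROP_CONTENT, true);
        writer.guessMimetype(fileName);   // derive the mimetype from the filename and, where possible, the content
        writer.guessEncoding();           // have the writer sniff the character encoding as content is written
        writer.putContent(sourceReader.getContentInputStream());
    }
}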

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@28224 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
Author: Derek Hulley
Date:   2011-06-07 07:36:37 +00:00
Parent: 04aef409a8
Commit: e118211bd3
50 changed files with 2269 additions and 365 deletions


@@ -23,6 +23,7 @@ import java.util.ArrayList;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
@@ -50,22 +51,11 @@ public class ArchiveContentTransformer extends TikaPoweredContentTransformer
private static Log logger = LogFactory.getLog(ArchiveContentTransformer.class);
private boolean includeContents = false;
public void setIncludeContents(String includeContents)
{
// Spring really ought to be able to handle
// setting a boolean that might still be
// ${foo} (i.e. not overridden in a property).
// As we can't do that with spring, we do it...
this.includeContents = false;
if(includeContents != null && includeContents.length() > 0)
{
this.includeContents = TransformationOptions.relaxedBooleanTypeConverter.convert(includeContents).booleanValue();
}
}
private TikaConfig tikaConfig;
/**
* We support all the archive mimetypes that the Tika
* office parser can handle
* package parser can handle
*/
public static ArrayList<String> SUPPORTED_MIMETYPES;
static {
@@ -81,6 +71,29 @@ public class ArchiveContentTransformer extends TikaPoweredContentTransformer
super(SUPPORTED_MIMETYPES);
}
/**
* Injects the TikaConfig to use
*
* @param tikaConfig The Tika Config to use
*/
public void setTikaConfig(TikaConfig tikaConfig)
{
this.tikaConfig = tikaConfig;
}
public void setIncludeContents(String includeContents)
{
// Spring really ought to be able to handle
// setting a boolean that might still be
// ${foo} (i.e. not overridden in a property).
// As we can't do that with spring, we do it...
this.includeContents = false;
if(includeContents != null && includeContents.length() > 0)
{
this.includeContents = TransformationOptions.relaxedBooleanTypeConverter.convert(includeContents).booleanValue();
}
}
@Override
protected Parser getParser() {
return new PackageParser();
@@ -96,9 +109,15 @@ public class ArchiveContentTransformer extends TikaPoweredContentTransformer
{
recurse = options.getIncludeEmbedded();
}
if(recurse)
{
context.set(Parser.class, new AutoDetectParser());
// Use an auto detect parser to handle the contents
if(tikaConfig == null)
{
tikaConfig = TikaConfig.getDefaultConfig();
}
context.set(Parser.class, new AutoDetectParser(tikaConfig));
}
return context;
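
The change above injects a shared TikaConfig and only falls back to the default config when none is wired in. A minimal standalone sketch of the same recursion idea, assuming Tika's PackageParser plus an AutoDetectParser registered in the ParseContext; this is illustrative, not the transformer's actual code.

import java.io.InputStream;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pkg.PackageParser;
import org.apache.tika.sax.BodyContentHandler;

public class ArchiveTextSketch
{
    /**
     * Extract text from an archive stream. When includeContents is true, a
     * recursive AutoDetectParser (built from the shared TikaConfig) is put in
     * the ParseContext so the entries are parsed rather than merely listed.
     */
    public static String extractText(InputStream archive, TikaConfig tikaConfig,
            boolean includeContents) throws Exception
    {
        Parser packageParser = new PackageParser();
        ParseContext context = new ParseContext();
        if (includeContents)
        {
            context.set(Parser.class, new AutoDetectParser(tikaConfig));
        }

        BodyContentHandler handler = new BodyContentHandler();
        packageParser.parse(archive, handler, new Metadata(), context);
        return handler.toString();
    }
}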


@@ -18,94 +18,27 @@
*/
package org.alfresco.repo.content.transform;
import java.io.IOException;
import java.io.InputStream;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
/**
* This badly named transformer turns Microsoft Word documents
* (Word 6, 95, 97, 2000, 2003) into plain text.
*
* Doesn't currently use {@link http://tika.apache.org/ Apache Tika} to
* do this, pending TIKA-408. When Apache POI 3.7 beta 2 has been
* released, we can switch to Tika and then handle Word 6,
* Word 95, Word 97, 2000, 2003, 2007 and 2010 formats.
*
* TODO Switch to Tika in November 2010 once 3.4 is out
* (Word 6, 95, 97, 2000, 2003) into plain text, using Apache Tika.
*
* @author Nick Burch
*/
public class TextMiningContentTransformer extends AbstractContentTransformer2
public class TextMiningContentTransformer extends TikaPoweredContentTransformer
{
public TextMiningContentTransformer()
{
}
/**
* Currently the only transformation performed is that of text extraction from Word documents.
*/
public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options)
{
if (!MimetypeMap.MIMETYPE_WORD.equals(sourceMimetype) ||
!MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
{
// only support DOC -> Text
return false;
}
else
{
return true;
}
{
super(new String[] {
MimetypeMap.MIMETYPE_WORD
});
}
public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options)
throws Exception
{
POIOLE2TextExtractor extractor = null;
InputStream is = null;
String text = null;
try
{
is = reader.getContentInputStream();
POIFSFileSystem fs = new POIFSFileSystem(is);
try {
extractor = new WordExtractor(fs);
} catch(OldWordFileFormatException e) {
extractor = new Word6Extractor(fs);
}
text = extractor.getText();
}
catch (IOException e)
{
// check if this is an error caused by the fact that the .doc is in fact
// one of Word's temp non-documents
if (e.getMessage().contains("Unable to read entire header"))
{
// just assign an empty string
text = "";
}
else
{
throw e;
}
}
finally
{
if (is != null)
{
is.close();
}
}
// dump the text out. This will close the writer automatically.
writer.putContent(text);
@Override
protected Parser getParser() {
return new OfficeParser();
}
}
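
The hand-rolled POI extraction above is replaced by delegating to Tika's OfficeParser through TikaPoweredContentTransformer. A rough standalone sketch of what that delegation amounts to (a plain Tika parse collecting body text); this is illustrative, not the superclass implementation.

import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.sax.BodyContentHandler;

public class WordTextExtractionSketch
{
    /** Extract plain text from a Word (.doc) stream using Tika's OfficeParser. */
    public static String extractText(InputStream wordStream) throws Exception
    {
        Parser parser = new OfficeParser();
        BodyContentHandler handler = new BodyContentHandler();  // accumulates body text
        parser.parse(wordStream, handler, new Metadata(), new ParseContext());
        return handler.toString();
    }
}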


@@ -20,6 +20,7 @@ package org.alfresco.repo.content.transform;
import java.util.ArrayList;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
@@ -37,6 +38,9 @@ import org.apache.tika.parser.Parser;
*/
public class TikaAutoContentTransformer extends TikaPoweredContentTransformer
{
private static AutoDetectParser parser;
private static TikaConfig config;
/**
* We support all the mimetypes that the Tika
* auto-detect parser can handle, except for
@@ -44,10 +48,13 @@ public class TikaAutoContentTransformer extends TikaPoweredContentTransformer
* make much sense
*/
public static ArrayList<String> SUPPORTED_MIMETYPES;
static {
private static ArrayList<String> buildMimeTypes(TikaConfig tikaConfig)
{
config = tikaConfig;
parser = new AutoDetectParser(config);
SUPPORTED_MIMETYPES = new ArrayList<String>();
AutoDetectParser p = new AutoDetectParser();
for(MediaType mt : p.getParsers().keySet()) {
for(MediaType mt : parser.getParsers().keySet()) {
if(mt.toString().startsWith("application/vnd.oasis.opendocument.formula")) {
// TODO Tika support for quick.odf, mimetype=application/vnd.oasis.opendocument.formula
// TODO Tika support for quick.otf, mimetype=application/vnd.oasis.opendocument.formula-template
@@ -85,11 +92,12 @@ public class TikaAutoContentTransformer extends TikaPoweredContentTransformer
SUPPORTED_MIMETYPES.add( mt.toString() );
}
}
return SUPPORTED_MIMETYPES;
}
public TikaAutoContentTransformer()
public TikaAutoContentTransformer(TikaConfig tikaConfig)
{
super(SUPPORTED_MIMETYPES);
super( buildMimeTypes(tikaConfig) );
}
/**
@@ -100,6 +108,6 @@ public class TikaAutoContentTransformer extends TikaPoweredContentTransformer
*/
protected Parser getParser()
{
return new AutoDetectParser();
return parser;
}
}
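
With the constructor change above, the supported mimetype list is derived from the injected TikaConfig. A small sketch of that enumeration in isolation, assuming the standard AutoDetectParser(TikaConfig) constructor and CompositeParser.getParsers(); illustrative only.

import java.util.ArrayList;
import java.util.List;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;

public class SupportedTypesSketch
{
    /** List the media types an auto-detect parser built from the given TikaConfig can handle. */
    public static List<String> listSupportedTypes(TikaConfig tikaConfig)
    {
        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
        List<String> mimetypes = new ArrayList<String>();
        for (MediaType type : parser.getParsers().keySet())
        {
            mimetypes.add(type.toString());
        }
        return mimetypes;
    }
}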


@@ -20,6 +20,7 @@ package org.alfresco.repo.content.transform;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.tika.config.TikaConfig;
/**
* Most of the work for testing the Tika Auto-Detect transformer
@@ -38,7 +39,8 @@ public class TikaAutoContentTransformerTest extends TikaPoweredContentTransforme
{
super.setUp();
transformer = new TikaAutoContentTransformer();
TikaConfig config = (TikaConfig)ctx.getBean("tikaConfig");
transformer = new TikaAutoContentTransformer( config );
}
/**


@@ -79,18 +79,10 @@ public class TikaPoweredContainerExtractor
private NodeService nodeService;
private ContentService contentService;
private TikaConfig config;
private AutoDetectParser parser;
private Detector detector;
public TikaPoweredContainerExtractor()
{
TikaConfig config = TikaConfig.getDefaultConfig();
detector = new ContainerAwareDetector(
config.getMimeRepository()
);
parser = new AutoDetectParser(detector);
}
/**
* Injects the nodeService bean.
*
@@ -110,6 +102,22 @@ public class TikaPoweredContainerExtractor
{
this.contentService = contentService;
}
/**
* Injects the TikaConfig to use
*
* @param tikaConfig The Tika Config to use
*/
public void setTikaConfig(TikaConfig tikaConfig)
{
this.config = tikaConfig;
// Setup the detector and parser
detector = new ContainerAwareDetector(
config.getMimeRepository()
);
parser = new AutoDetectParser(detector);
}
/**
* Extracts out all the entries from the container
@@ -277,6 +285,9 @@ public class TikaPoweredContainerExtractor
<property name="contentService">
<ref bean="ContentService" />
</property>
<property name="tikaConfig">
<bean class="org.apache.tika.config.TikaConfig" factory-method="getDefaultConfig" />
</property>
</bean>
<bean id="extractEmbeddedResources" class="org.alfresco.repo.content.transform.TikaPoweredContainerExtractor$ExtractorActionExecutor" parent="action-executer">
<property name="tikaPoweredContainerExtractor">