Merged DEV/SWIFT to HEAD (Tika and POI)

   26013: (RECORD ONLY) Upgrade POI to get initial TNEF support
   26037: (RECORD ONLY) Bump the POI version for ALF-5900, so we get almost correct RTF body decoding in TNEF files
   26193: (RECORD ONLY) Upgrade POI and Tika for ALF-5900
   26415: (RECORD ONLY) Upgrade Tika to the latest nightly version, to get a BMP fix
   27609: (RECORD ONLY) Upgrade Tika and POI for ALF-7874
   27611: (RECORD ONLY) Upgrade Tika for ALF-7978
   27612: (RECORD ONLY) Another Outlook-related Tika update
   27865: (RECORD ONLY FOR JARS) Update Tika, and change the auto-detect extractor to register the aliases of each mime type along with the canonical one
Notes:
 - There is no way to verify which of the SWIFT or HEAD jars is the 'latest snapshot'
 - HEAD jars were all preserved; re-apply latest snapshots to HEAD, if required

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@28223 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Derek Hulley
2011-06-07 02:42:40 +00:00
parent 1387dcaf2a
commit 04aef409a8
2 changed files with 39 additions and 38 deletions

View File

@@ -22,6 +22,7 @@ import java.util.ArrayList;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser; import org.apache.tika.parser.Parser;
@@ -49,19 +50,35 @@ import org.apache.tika.parser.Parser;
public class TikaAutoMetadataExtracter extends TikaPoweredMetadataExtracter public class TikaAutoMetadataExtracter extends TikaPoweredMetadataExtracter
{ {
protected static Log logger = LogFactory.getLog(TikaAutoMetadataExtracter.class); protected static Log logger = LogFactory.getLog(TikaAutoMetadataExtracter.class);
private static AutoDetectParser parser;
private static TikaConfig config;
public static ArrayList<String> SUPPORTED_MIMETYPES; public static ArrayList<String> SUPPORTED_MIMETYPES;
static { private static ArrayList<String> buildMimeTypes(TikaConfig tikaConfig)
{
config = tikaConfig;
parser = new AutoDetectParser(config);
SUPPORTED_MIMETYPES = new ArrayList<String>(); SUPPORTED_MIMETYPES = new ArrayList<String>();
AutoDetectParser p = new AutoDetectParser(); parser = new AutoDetectParser();
for(MediaType mt : p.getParsers().keySet()) { for(MediaType mt : parser.getParsers().keySet())
{
// Add the canonical mime type
SUPPORTED_MIMETYPES.add( mt.toString() ); SUPPORTED_MIMETYPES.add( mt.toString() );
// And add any aliases of the mime type too - Alfresco uses some
// non canonical forms of various mimetypes, so we need all of them
for(MediaType alias : config.getMediaTypeRegistry().getAliases(mt))
{
SUPPORTED_MIMETYPES.add( alias.toString() );
}
} }
return SUPPORTED_MIMETYPES;
} }
public TikaAutoMetadataExtracter() public TikaAutoMetadataExtracter(TikaConfig tikaConfig)
{ {
super(SUPPORTED_MIMETYPES); super( buildMimeTypes(tikaConfig) );
} }
/** /**
@@ -70,6 +87,6 @@ public class TikaAutoMetadataExtracter extends TikaPoweredMetadataExtracter
*/ */
@Override @Override
protected Parser getParser() { protected Parser getParser() {
return new AutoDetectParser(); return parser;
} }
} }

View File

@@ -63,7 +63,9 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
public void setUp() throws Exception public void setUp() throws Exception
{ {
super.setUp(); super.setUp();
extracter = new TikaAutoMetadataExtracter();
TikaConfig config = (TikaConfig)ctx.getBean("tikaConfig");
extracter = new TikaAutoMetadataExtracter(config);
extracter.setDictionaryService(dictionaryService); extracter.setDictionaryService(dictionaryService);
extracter.register(); extracter.register();
@@ -91,37 +93,17 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
public void testSupports() throws Exception public void testSupports() throws Exception
{ {
TikaConfig config = TikaConfig.getDefaultConfig();
ArrayList<String> mimeTypes = new ArrayList<String>(); ArrayList<String> mimeTypes = new ArrayList<String>();
for (Parser p : new Parser[] { for (Parser p : new Parser[] {
new OfficeParser(), new OpenDocumentParser(), new OfficeParser(), new OpenDocumentParser(),
new Mp3Parser(), new OOXMLParser() new Mp3Parser(), new OOXMLParser()
}) { }) {
Set<MediaType> mts = p.getSupportedTypes(new ParseContext()); Set<MediaType> mts = p.getSupportedTypes(new ParseContext());
for (MediaType mt : mts) for (MediaType mt : mts) {
{ mimeTypes.add(mt.toString());
MediaType canonical = config.getMediaTypeRegistry().normalize(mt);
mimeTypes.add( canonical.toString() );
} }
} }
// Check Tika handles it properly
AutoDetectParser p = new AutoDetectParser();
Set<String> amts = new HashSet<String>();
for (MediaType mt : p.getSupportedTypes(new ParseContext()))
{
amts.add(mt.toString());
}
for (String mimetype : mimeTypes)
{
assertTrue(
"Tika doesn't support expected mimetype: " + mimetype,
amts.contains(mimetype)
);
}
// Now check the extractor does too
for (String mimetype : mimeTypes) for (String mimetype : mimeTypes)
{ {
boolean supports = extracter.isSupported(mimetype); boolean supports = extracter.isSupported(mimetype);
@@ -228,8 +210,7 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
assertEquals("8 8 8", p.get("Data BitsPerSample")); assertEquals("8 8 8", p.get("Data BitsPerSample"));
assertEquals("none", p.get("Transparency Alpha")); assertEquals("none", p.get("Transparency Alpha"));
//p = openAndCheck(".bmp", "image/bmp"); // TODO Fixed in Swift, p = openAndCheck(".bmp", "image/bmp");
p = openAndCheck(".bmp", "image/x-ms-bmp"); // TODO Pre-swift workaround
assertEquals("409", p.get("width")); assertEquals("409", p.get("width"));
assertEquals("92", p.get("height")); assertEquals("92", p.get("height"));
assertEquals("8 8 8", p.get("Data BitsPerSample")); assertEquals("8 8 8", p.get("Data BitsPerSample"));
@@ -284,19 +265,22 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
return file; return file;
} }
private Map<String, Serializable> openAndCheck(String fileBase, String expMimeType) throws Throwable { private Map<String, Serializable> openAndCheck(String fileBase, String expMimeType) throws Throwable {
// Cheat and ask Tika for the mime type! // Get the mimetype via the MimeTypeMap
// (Uses Tika internally for the detection)
File file = open(fileBase); File file = open(fileBase);
AutoDetectParser ap = new AutoDetectParser(); ContentReader detectReader = new FileContentReader(file);
Metadata metadata = new Metadata(); String mimetype = mimetypeMap.guessMimetype(fileBase, detectReader);
metadata.set(Metadata.RESOURCE_NAME_KEY, "quick"+fileBase);
MediaType mt = ap.getDetector().detect(
new BufferedInputStream(new FileInputStream(file)), metadata);
String mimetype = mt.toString();
assertEquals("Wrong mimetype for " + fileBase, mimetype, expMimeType); assertEquals("Wrong mimetype for " + fileBase, mimetype, expMimeType);
// Ensure the Tika Auto parser actually handles this
assertTrue("Mimetype should be supported but isn't: " + mimetype, extracter.isSupported(mimetype));
// Now create our proper reader
ContentReader sourceReader = new FileContentReader(file); ContentReader sourceReader = new FileContentReader(file);
sourceReader.setMimetype(mimetype); sourceReader.setMimetype(mimetype);
// And finally do the properties extraction
return extracter.extractRaw(sourceReader); return extracter.extractRaw(sourceReader);
} }
} }