From 04aef409a8a39965a106f19c40dde0aa1a28da10 Mon Sep 17 00:00:00 2001 From: Derek Hulley Date: Tue, 7 Jun 2011 02:42:40 +0000 Subject: [PATCH] Merged DEV/SWIFT to HEAD (Tika and Poi) 26013: (RECORD ONLY) Upgrade POI to get initial TNEF support 26037: (RECORD ONLY) Bump the POI version for ALF-5900, so we get almost correct RTF body decoding in TNEF files 26193: (RECORD ONLY) Upgrade POI and Tika for ALF-5900 26415: (RECORD ONLY) Upgrade Tika to the latest nightly version, to get a BMP fix 27609: (RECORD ONLY) Upgrade Tika and POI for ALF-7874 27611: (RECORD ONLY) Upgrade Tika for ALF-7978 27612: (RECORD ONLY) Another outlook related tika update 27865: (RECORD ONLY FOR JARS) Update Tika, and change the auto detect extractor to register aliases of the mime types along with the canonical one Notes: - There is no way to verify which of the SWIFT or HEAD jars is the 'latest snapshot' - HEAD jars were all preserved; re-apply latest snapshots to HEAD, if required git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@28223 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261 --- .../metadata/TikaAutoMetadataExtracter.java | 29 ++++++++--- .../TikaAutoMetadataExtracterTest.java | 48 +++++++------------ 2 files changed, 39 insertions(+), 38 deletions(-) diff --git a/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.java index 7dd75d8c54..7eac6eac28 100644 --- a/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.java @@ -22,6 +22,7 @@ import java.util.ArrayList; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.tika.config.TikaConfig; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.Parser; @@ -49,19 +50,35 @@ import org.apache.tika.parser.Parser; public class TikaAutoMetadataExtracter extends TikaPoweredMetadataExtracter { protected static Log logger = LogFactory.getLog(TikaAutoMetadataExtracter.class); + private static AutoDetectParser parser; + private static TikaConfig config; public static ArrayList SUPPORTED_MIMETYPES; - static { + private static ArrayList buildMimeTypes(TikaConfig tikaConfig) + { + config = tikaConfig; + parser = new AutoDetectParser(config); + SUPPORTED_MIMETYPES = new ArrayList(); - AutoDetectParser p = new AutoDetectParser(); - for(MediaType mt : p.getParsers().keySet()) { + parser = new AutoDetectParser(); + for(MediaType mt : parser.getParsers().keySet()) + { + // Add the canonical mime type SUPPORTED_MIMETYPES.add( mt.toString() ); + + // And add any aliases of the mime type too - Alfresco uses some + // non canonical forms of various mimetypes, so we need all of them + for(MediaType alias : config.getMediaTypeRegistry().getAliases(mt)) + { + SUPPORTED_MIMETYPES.add( alias.toString() ); + } } + return SUPPORTED_MIMETYPES; } - public TikaAutoMetadataExtracter() + public TikaAutoMetadataExtracter(TikaConfig tikaConfig) { - super(SUPPORTED_MIMETYPES); + super( buildMimeTypes(tikaConfig) ); } /** @@ -70,6 +87,6 @@ public class TikaAutoMetadataExtracter extends TikaPoweredMetadataExtracter */ @Override protected Parser getParser() { - return new AutoDetectParser(); + return parser; } } diff --git a/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java index eb37a0c539..fed3db396c 100644 --- a/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java @@ -63,7 +63,9 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest public void setUp() throws Exception { super.setUp(); - extracter = new TikaAutoMetadataExtracter(); + + TikaConfig config = (TikaConfig)ctx.getBean("tikaConfig"); + extracter = new TikaAutoMetadataExtracter(config); extracter.setDictionaryService(dictionaryService); extracter.register(); @@ -91,37 +93,17 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest public void testSupports() throws Exception { - TikaConfig config = TikaConfig.getDefaultConfig(); - ArrayList mimeTypes = new ArrayList(); for (Parser p : new Parser[] { new OfficeParser(), new OpenDocumentParser(), new Mp3Parser(), new OOXMLParser() }) { Set mts = p.getSupportedTypes(new ParseContext()); - for (MediaType mt : mts) - { - MediaType canonical = config.getMediaTypeRegistry().normalize(mt); - mimeTypes.add( canonical.toString() ); + for (MediaType mt : mts) { + mimeTypes.add(mt.toString()); } } - // Check Tika handles it properly - AutoDetectParser p = new AutoDetectParser(); - Set amts = new HashSet(); - for (MediaType mt : p.getSupportedTypes(new ParseContext())) - { - amts.add(mt.toString()); - } - for (String mimetype : mimeTypes) - { - assertTrue( - "Tika doesn't support expected mimetype: " + mimetype, - amts.contains(mimetype) - ); - } - - // Now check the extractor does too for (String mimetype : mimeTypes) { boolean supports = extracter.isSupported(mimetype); @@ -228,8 +210,7 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest assertEquals("8 8 8", p.get("Data BitsPerSample")); assertEquals("none", p.get("Transparency Alpha")); - //p = openAndCheck(".bmp", "image/bmp"); // TODO Fixed in Swift, - p = openAndCheck(".bmp", "image/x-ms-bmp"); // TODO Pre-swift workaround + p = openAndCheck(".bmp", "image/bmp"); assertEquals("409", p.get("width")); assertEquals("92", p.get("height")); assertEquals("8 8 8", p.get("Data BitsPerSample")); @@ -284,19 +265,22 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest return file; } private Map openAndCheck(String fileBase, String expMimeType) throws Throwable { - // Cheat and ask Tika for the mime type! + // Get the mimetype via the MimeTypeMap + // (Uses Tika internally for the detection) File file = open(fileBase); - AutoDetectParser ap = new AutoDetectParser(); - Metadata metadata = new Metadata(); - metadata.set(Metadata.RESOURCE_NAME_KEY, "quick"+fileBase); - MediaType mt = ap.getDetector().detect( - new BufferedInputStream(new FileInputStream(file)), metadata); - String mimetype = mt.toString(); + ContentReader detectReader = new FileContentReader(file); + String mimetype = mimetypeMap.guessMimetype(fileBase, detectReader); assertEquals("Wrong mimetype for " + fileBase, mimetype, expMimeType); + // Ensure the Tika Auto parser actually handles this + assertTrue("Mimetype should be supported but isn't: " + mimetype, extracter.isSupported(mimetype)); + + // Now create our proper reader ContentReader sourceReader = new FileContentReader(file); sourceReader.setMimetype(mimetype); + + // And finally do the properties extraction return extracter.extractRaw(sourceReader); } }