diff --git a/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.java index 7dd75d8c54..7eac6eac28 100644 --- a/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.java @@ -22,6 +22,7 @@ import java.util.ArrayList; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.tika.config.TikaConfig; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.Parser; @@ -49,19 +50,35 @@ import org.apache.tika.parser.Parser; public class TikaAutoMetadataExtracter extends TikaPoweredMetadataExtracter { protected static Log logger = LogFactory.getLog(TikaAutoMetadataExtracter.class); + private static AutoDetectParser parser; + private static TikaConfig config; public static ArrayList SUPPORTED_MIMETYPES; - static { + private static ArrayList buildMimeTypes(TikaConfig tikaConfig) + { + config = tikaConfig; + parser = new AutoDetectParser(config); + SUPPORTED_MIMETYPES = new ArrayList(); - AutoDetectParser p = new AutoDetectParser(); - for(MediaType mt : p.getParsers().keySet()) { + parser = new AutoDetectParser(); + for(MediaType mt : parser.getParsers().keySet()) + { + // Add the canonical mime type SUPPORTED_MIMETYPES.add( mt.toString() ); + + // And add any aliases of the mime type too - Alfresco uses some + // non canonical forms of various mimetypes, so we need all of them + for(MediaType alias : config.getMediaTypeRegistry().getAliases(mt)) + { + SUPPORTED_MIMETYPES.add( alias.toString() ); + } } + return SUPPORTED_MIMETYPES; } - public TikaAutoMetadataExtracter() + public TikaAutoMetadataExtracter(TikaConfig tikaConfig) { - super(SUPPORTED_MIMETYPES); + super( buildMimeTypes(tikaConfig) ); } /** @@ -70,6 +87,6 @@ public class TikaAutoMetadataExtracter extends TikaPoweredMetadataExtracter */ @Override protected Parser getParser() { - return new AutoDetectParser(); + return parser; } } diff --git a/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java index eb37a0c539..fed3db396c 100644 --- a/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java @@ -63,7 +63,9 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest public void setUp() throws Exception { super.setUp(); - extracter = new TikaAutoMetadataExtracter(); + + TikaConfig config = (TikaConfig)ctx.getBean("tikaConfig"); + extracter = new TikaAutoMetadataExtracter(config); extracter.setDictionaryService(dictionaryService); extracter.register(); @@ -91,37 +93,17 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest public void testSupports() throws Exception { - TikaConfig config = TikaConfig.getDefaultConfig(); - ArrayList mimeTypes = new ArrayList(); for (Parser p : new Parser[] { new OfficeParser(), new OpenDocumentParser(), new Mp3Parser(), new OOXMLParser() }) { Set mts = p.getSupportedTypes(new ParseContext()); - for (MediaType mt : mts) - { - MediaType canonical = config.getMediaTypeRegistry().normalize(mt); - mimeTypes.add( canonical.toString() ); + for (MediaType mt : mts) { + mimeTypes.add(mt.toString()); } } - // Check Tika handles it properly - AutoDetectParser p = new AutoDetectParser(); - Set amts = new HashSet(); - for (MediaType mt : p.getSupportedTypes(new ParseContext())) - { - amts.add(mt.toString()); - } - for (String mimetype : mimeTypes) - { - assertTrue( - "Tika doesn't support expected mimetype: " + mimetype, - amts.contains(mimetype) - ); - } - - // Now check the extractor does too for (String mimetype : mimeTypes) { boolean supports = extracter.isSupported(mimetype); @@ -228,8 +210,7 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest assertEquals("8 8 8", p.get("Data BitsPerSample")); assertEquals("none", p.get("Transparency Alpha")); - //p = openAndCheck(".bmp", "image/bmp"); // TODO Fixed in Swift, - p = openAndCheck(".bmp", "image/x-ms-bmp"); // TODO Pre-swift workaround + p = openAndCheck(".bmp", "image/bmp"); assertEquals("409", p.get("width")); assertEquals("92", p.get("height")); assertEquals("8 8 8", p.get("Data BitsPerSample")); @@ -284,19 +265,22 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest return file; } private Map openAndCheck(String fileBase, String expMimeType) throws Throwable { - // Cheat and ask Tika for the mime type! + // Get the mimetype via the MimeTypeMap + // (Uses Tika internally for the detection) File file = open(fileBase); - AutoDetectParser ap = new AutoDetectParser(); - Metadata metadata = new Metadata(); - metadata.set(Metadata.RESOURCE_NAME_KEY, "quick"+fileBase); - MediaType mt = ap.getDetector().detect( - new BufferedInputStream(new FileInputStream(file)), metadata); - String mimetype = mt.toString(); + ContentReader detectReader = new FileContentReader(file); + String mimetype = mimetypeMap.guessMimetype(fileBase, detectReader); assertEquals("Wrong mimetype for " + fileBase, mimetype, expMimeType); + // Ensure the Tika Auto parser actually handles this + assertTrue("Mimetype should be supported but isn't: " + mimetype, extracter.isSupported(mimetype)); + + // Now create our proper reader ContentReader sourceReader = new FileContentReader(file); sourceReader.setMimetype(mimetype); + + // And finally do the properties extraction return extracter.extractRaw(sourceReader); } }