Tika for metadata extraction

Convert some more metadata extractors to using Tika, and enable the use of the Tika auto-detection parser on any documents without an explicitly defined extractor. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20667 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
2025-07-31 17:39:05 +00:00 · 2010-06-16 14:09:46 +00:00
parent b08d9ff412
commit 0e19812dbc
11 changed files with 354 additions and 184 deletions
--- a/config/alfresco/content-services-context.xml
+++ b/config/alfresco/content-services-context.xml
@@ -200,13 +200,15 @@
   </bean>

   <!-- Content Metadata Extractors -->
+   <!-- The last one listed for any mimetype will be used if available -->
+   <!-- As such, the Tika auto-detect fallback should be listed first -->
+   <bean id="extracter.TikaAuto"       class="org.alfresco.repo.content.metadata.TikaAutoMetadataExtracter"     parent="baseMetadataExtracter" />
   <bean id="extracter.PDFBox"        class="org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter"        parent="baseMetadataExtracter" />
   <bean id="extracter.Poi"           class="org.alfresco.repo.content.metadata.PoiMetadataExtracter"           parent="baseMetadataExtracter" />
   <bean id="extracter.Office"        class="org.alfresco.repo.content.metadata.OfficeMetadataExtracter"        parent="baseMetadataExtracter" />
   <bean id="extracter.Mail"          class="org.alfresco.repo.content.metadata.MailMetadataExtracter"          parent="baseMetadataExtracter" />
   <bean id="extracter.Html"          class="org.alfresco.repo.content.metadata.HtmlMetadataExtracter"          parent="baseMetadataExtracter" />
-   <!--  Unsupported experimental extractor commented out -->
-   <!-- <bean id="extracter.MP3"           class="org.alfresco.repo.content.metadata.MP3MetadataExtracter"           parent="baseMetadataExtracter" /> -->
+   <bean id="extracter.MP3"           class="org.alfresco.repo.content.metadata.MP3MetadataExtracter"           parent="baseMetadataExtracter" />
   <bean id="extracter.OpenDocument"  class="org.alfresco.repo.content.metadata.OpenDocumentMetadataExtracter"  parent="baseMetadataExtracter" />
   <bean id="extracter.DWG"           class="org.alfresco.repo.content.metadata.DWGMetadataExtracter"           parent="baseMetadataExtracter" />
   <bean id="extracter.RFC822"        class="org.alfresco.repo.content.metadata.RFC822MetadataExtracter"        parent="baseMetadataExtracter" >
--- a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java
@@ -199,6 +199,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
    }
    
    /**
+     * TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
     * @return      Returns <code>1.0</code> if the mimetype is supported, otherwise <tt>0.0</tt>
     * 
     * @see #isSupported(String)
@@ -209,10 +210,11 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
    }

    /**
-     * Set the policy to use when existing values are encountered.  Depending on how the extracer
+     * Set the policy to use when existing values are encountered.  Depending on how the extractor
     * is called, this may not be relevant, i.e an empty map of existing properties may be passed
     * in by the client code, which may follow its own overwrite strategy.
     * 
+     * TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
     * @param overwritePolicy       the policy to apply when there are existing system properties
     */
    public void setOverwritePolicy(OverwritePolicy overwritePolicy)
@@ -221,10 +223,11 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
    }

    /**
-     * Set the policy to use when existing values are encountered.  Depending on how the extracer
+     * Set the policy to use when existing values are encountered.  Depending on how the extractor
     * is called, this may not be relevant, i.e an empty map of existing properties may be passed
     * in by the client code, which may follow its own overwrite strategy.
     * 
+     * TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
     * @param overwritePolicyStr    the policy to apply when there are existing system properties
     */
    public void setOverwritePolicy(String overwritePolicyStr)
--- a/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java
@@ -18,23 +18,15 @@
 */
 package org.alfresco.repo.content.metadata;

-import java.io.File;
 import java.io.Serializable;
-import java.util.Arrays;
-import java.util.HashSet;
+import java.util.ArrayList;
 import java.util.Map;

 import org.alfresco.repo.content.MimetypeMap;
-import org.alfresco.service.cmr.repository.ContentReader;
-import org.alfresco.util.TempFileProvider;
-import org.farng.mp3.AbstractMP3FragmentBody;
-import org.farng.mp3.MP3File;
-import org.farng.mp3.id3.AbstractID3v2;
-import org.farng.mp3.id3.AbstractID3v2Frame;
-import org.farng.mp3.id3.ID3v1;
-import org.farng.mp3.lyrics3.AbstractLyrics3;
-import org.farng.mp3.lyrics3.Lyrics3v2;
-import org.farng.mp3.lyrics3.Lyrics3v2Field;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.mp3.Mp3Parser;

 /**
 * Extracts the following values from MP3 files:
@@ -51,13 +43,15 @@ import org.farng.mp3.lyrics3.Lyrics3v2Field;
 *   <b>lyrics:</b>                 --      {music}lyrics
 * </pre>
 * 
- * TIKA Note - title and author go in metadata, but much of the
- *  rest is only in the text. Some of the ID3v2 parts 
- *  (composer, lyrics) are not yet implemented.
+ * TODO Get hold of an MP3 file with some lyrics in it, so we
+ *  can contribute the patch to Tika
 * 
+ * Uses Apache Tika
+ * 
+ * @author Nick Burch
 * @author Roy Wetherall
 */
-public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
+public class MP3MetadataExtracter extends TikaPoweredMetadataExtracter
 {
    private static final String KEY_SONG_TITLE = "songTitle";
    private static final String KEY_ALBUM_TITLE = "albumTitle";
@@ -70,110 +64,39 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
    private static final String KEY_COMPOSER = "composer";
    private static final String KEY_LYRICS = "lyrics";

-    public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_MP3 };
+    public static ArrayList<String> SUPPORTED_MIMETYPES = buildSupportedMimetypes(
+          new String[] { MimetypeMap.MIMETYPE_MP3 },
+          new Mp3Parser()
+    );
    
    public MP3MetadataExtracter()
    {
-        super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
+        super(SUPPORTED_MIMETYPES);
    }
    
    @Override
-    public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
-    {
-        Map<String, Serializable> rawProperties = newRawMap();
-        
-        // Create a temp file
-        File tempFile = TempFileProvider.createTempFile("MP3MetadataExtracter_", ".tmp");
-        try
-        {
-            reader.getContent(tempFile);
-            
-            // Create the MP3 object from the file
-            // Open it read only as we won't make any changes
-            MP3File mp3File = new MP3File(tempFile, false);
-            
-            ID3v1 id3v1 = mp3File.getID3v1Tag();
-            if (id3v1 != null)
-            {
-                putRawValue(KEY_ALBUM_TITLE, id3v1.getAlbum(), rawProperties);
-                putRawValue(KEY_SONG_TITLE, id3v1.getTitle(), rawProperties);
-                putRawValue(KEY_ARTIST, id3v1.getArtist(), rawProperties);
-                putRawValue(KEY_COMMENT, id3v1.getComment(), rawProperties);
-                putRawValue(KEY_YEAR_RELEASED, id3v1.getYear(), rawProperties);
-                
-                // TODO sort out the genre
-                //putRawValue(MusicModel.KEY_GENRE, id3v1.getGenre());
-                
-                // TODO sort out the size
-                //putRawValue(MusicModel.KEY_SIZE, id3v1.getSize());            
+    protected Parser getParser() {
+       return new Mp3Parser();
    }

-            AbstractID3v2 id3v2 = mp3File.getID3v2Tag();
-            if (id3v2 != null)
-            {
-                putRawValue(KEY_SONG_TITLE, getID3V2Value(id3v2, "TIT2"), rawProperties);
-                putRawValue(KEY_ARTIST, getID3V2Value(id3v2, "TPE1"), rawProperties);
-                putRawValue(KEY_ALBUM_TITLE, getID3V2Value(id3v2, "TALB"), rawProperties);
-                putRawValue(KEY_YEAR_RELEASED, getID3V2Value(id3v2, "TDRC"), rawProperties);
-                putRawValue(KEY_COMMENT, getID3V2Value(id3v2, "COMM"), rawProperties);
-                putRawValue(KEY_TRACK_NUMBER, getID3V2Value(id3v2, "TRCK"), rawProperties);
-                putRawValue(KEY_GENRE, getID3V2Value(id3v2, "TCON"), rawProperties);
-                putRawValue(KEY_COMPOSER, getID3V2Value(id3v2, "TCOM"), rawProperties);
+    @Override
+    protected Map<String, Serializable> extractSpecific(Metadata metadata,
+         Map<String, Serializable> properties) {
+       putRawValue(KEY_ALBUM_TITLE, metadata.get(XMPDM.ALBUM), properties);
+       putRawValue(KEY_SONG_TITLE, metadata.get(Metadata.TITLE), properties);
+       putRawValue(KEY_ARTIST, metadata.get(XMPDM.ARTIST), properties);
+       putRawValue(KEY_COMMENT, metadata.get(XMPDM.LOG_COMMENT), properties);
+       putRawValue(KEY_TRACK_NUMBER, metadata.get(XMPDM.TRACK_NUMBER), properties);
+       putRawValue(KEY_GENRE, metadata.get(XMPDM.GENRE), properties);
+       putRawValue(KEY_YEAR_RELEASED, metadata.get(XMPDM.RELEASE_DATE), properties);
+       putRawValue(KEY_COMPOSER, metadata.get(XMPDM.COMPOSER), properties);
+       // TODO lyrics
+       //putRawValue(KEY_LYRICS, getLyrics(), properties);
       
-                // TODO sort out the lyrics
-                //System.out.println("Lyrics: " + getID3V2Value(id3v2, "SYLT"));
-                //System.out.println("Lyrics: " + getID3V2Value(id3v2, "USLT"));
-            }
+       putRawValue(KEY_DESCRIPTION, generateDescription(metadata), properties);
       
-            AbstractLyrics3 lyrics3Tag = mp3File.getLyrics3Tag();
-            if (lyrics3Tag != null)
-            {
-                System.out.println("Lyrics3 tag found.");
-                if (lyrics3Tag instanceof Lyrics3v2)
-                {
-                    putRawValue(KEY_SONG_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TIT2"), rawProperties);
-                    putRawValue(KEY_ARTIST, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TPE1"), rawProperties);
-                    putRawValue(KEY_ALBUM_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TALB"), rawProperties);
-                    putRawValue(KEY_COMMENT, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "COMM"), rawProperties);
-                    putRawValue(KEY_LYRICS, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "SYLT"), rawProperties);
-                    putRawValue(KEY_COMPOSER, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TCOM"), rawProperties);
+       return properties;
    }
-            }
-            
-        }
-        catch(Exception e)
-        {
-           if (logger.isDebugEnabled())
-           {
-               logger.debug(
-                       "MP3 Metadata extraction failed: \n" +
-                       "   Content:   " + reader,
-                       e);
-           }
-           else
-           {
-               logger.warn(
-                       "MP3 Metadata extraction failed (turn on DEBUG for full error): \n" +
-                       "   Content:   " + reader + "\n" +
-                       "   Failure:   " + e.getMessage());
-           }
-
-        }
-        finally
-        {
-            tempFile.delete();
-        }
-        
-        String description = getDescription(rawProperties);
-        if (description != null)
-        {
-            putRawValue(KEY_DESCRIPTION, description, rawProperties);
-        }
-        
-        // Done
-        return rawProperties;
-    }
-    
    
    /**
     * Generate the description
@@ -181,62 +104,27 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
     * @param props     the properties extracted from the file
     * @return          the description
     */
-    private String getDescription(Map<String, Serializable> props)
+    private String generateDescription(Metadata metadata)
    {
        StringBuilder result = new StringBuilder();
-        if (props.get(KEY_SONG_TITLE) != null)
+        if (metadata.get(Metadata.TITLE) != null)
        {
-            result.append(props.get(KEY_SONG_TITLE));
-            if (props.get(KEY_ALBUM_TITLE) != null)
+            result.append(metadata.get(Metadata.TITLE));
+            if (metadata.get(XMPDM.ALBUM) != null)
            {
               result
                .append(" - ")
-                .append(props.get(KEY_ALBUM_TITLE));
+                .append(metadata.get(XMPDM.ALBUM));
            }
-            if (props.get(KEY_ARTIST) != null)
+            if (metadata.get(XMPDM.ARTIST) != null)
            {
               result
                .append(" (")
-                .append(props.get(KEY_ARTIST))
+                .append(metadata.get(XMPDM.ARTIST))
                .append(")");
            }
        }
        
        return result.toString();
    }
-
-    private String getLyrics3v2Value(Lyrics3v2 lyrics3Tag, String name) 
-    {
-        String result = "";
-        Lyrics3v2Field field = lyrics3Tag.getField(name);
-        if (field != null)
-        {
-            AbstractMP3FragmentBody body = field.getBody();
-            if (body != null)
-            {
-                result = (String)body.getObject("Text");                
-            }
-        }
-        return result;
-    }
-
-    /**
-     * Get the ID3V2 tag value in a safe way
-     */
-    private String getID3V2Value(AbstractID3v2 id3v2, String name)
-    {
-        String result = "";
-        
-        AbstractID3v2Frame frame = id3v2.getFrame(name);
-        if (frame != null)
-        {
-            AbstractMP3FragmentBody body = frame.getBody();
-            if (body != null)
-            {
-                result = (String)body.getObject("Text");                
-            }
-        }
-        
-        return result;
-    }
 }
--- a/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java
@@ -29,6 +29,9 @@ import java.util.Map;
 import org.alfresco.repo.content.MimetypeMap;
 import org.alfresco.service.cmr.repository.ContentReader;
 import org.apache.poi.hsmf.MAPIMessage;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+//import org.apache.tika.parser.microsoft.OutlookExtractor; // TODO fix import

 /**
 * Outlook MAPI format email meta-data extractor extracting the following values:
@@ -64,9 +67,23 @@ public class MailMetadataExtracter extends TikaPoweredMetadataExtracter
        super(SUPPORTED_MIMETYPES);
    }
    
+    @Override
+    protected Parser getParser() {
+       //return new OutlookExtractor(); // TODO fix import
+       return null;
+    }
+    
+    @Override
+    protected Map<String, Serializable> extractSpecific(Metadata metadata,
+         Map<String, Serializable> properties) {
+       // TODO move things from extractRaw to here
+       return properties;
+    }
+
    @Override
    public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
    {
+        // TODO remove this in favour of extractSpecific
        final Map<String, Serializable> rawProperties = newRawMap();
        
        InputStream is = null;
--- a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java
@@ -18,12 +18,8 @@
 */
 package org.alfresco.repo.content.metadata;

-import java.io.IOException;
-import java.io.InputStream;
 import java.io.Serializable;
 import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
 import java.util.Map;

 import org.alfresco.repo.content.MimetypeMap;
--- a/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.java
@@ -39,6 +39,7 @@ import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
 * 
 * Uses Apache Tika
 * 
+ * @author Nick Burch
 * @author Neil McErlean
 */
 public class PoiMetadataExtracter extends TikaPoweredMetadataExtracter
--- a/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.java
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.metadata;
+
+import java.util.ArrayList;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+
+/**
+ * A Metadata Extractor which makes use of the Apache
+ *  Tika auto-detection to select the best parser
+ *  to extract the metadata from your document.
+ * This will be used for all files which Tika can
+ *  handle, but where no other more explicit
+ *  extractor is defined. 
+ *
+ * <pre>
+ *   <b>author:</b>                 --      cm:author
+ *   <b>title:</b>                  --      cm:title
+ *   <b>subject:</b>                --      cm:description
+ *   <b>created:</b>                --      cm:created
+ *   <b>comments:</b>
+ * </pre>
+ * 
+ * @author Nick Burch
+ */
+public class TikaAutoMetadataExtracter extends TikaPoweredMetadataExtracter
+{
+    protected static Log logger = LogFactory.getLog(TikaAutoMetadataExtracter.class);
+
+    public static ArrayList<String> SUPPORTED_MIMETYPES;
+    static {
+       SUPPORTED_MIMETYPES = new ArrayList<String>();
+       AutoDetectParser p = new AutoDetectParser();
+       for(MediaType mt : p.getParsers().keySet()) {
+          SUPPORTED_MIMETYPES.add( mt.toString() );
+       }
+    }
+    
+    public TikaAutoMetadataExtracter()
+    {
+       super(SUPPORTED_MIMETYPES);
+    }
+    
+    /**
+     * Does auto-detection to select the best Tika
+     *  Parser.
+     */
+    @Override
+    protected Parser getParser() {
+       return new AutoDetectParser();
+    }
+}
--- a/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.properties
+++ b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.properties
@@ -0,0 +1,18 @@
+#
+# TikaAutoMetadataExtracter - default mapping
+#
+# This is used to map from the Tika and standard namespaces
+#  onto your content model. This will be used for any
+#  content for which an explicit extractor isn't defined,
+#  by using Tika's auto-selection facilities.
+#
+# author: Nick Burch
+
+# Namespaces
+namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
+
+# Mappings
+author=cm:author
+title=cm:title
+description=cm:description
+created=cm:created
--- a/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java
+++ b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.metadata;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.Serializable;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.alfresco.model.ContentModel;
+import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
+import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
+import org.alfresco.service.namespace.QName;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.dwg.DWGParser;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
+import org.apache.tika.parser.mp3.Mp3Parser;
+import org.apache.tika.parser.odf.OpenDocumentParser;
+
+
+/**
+ * @see TikaAutoMetadataExtracter
+ * 
+ * @author Nick Burch
+ */
+public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
+{
+    private TikaAutoMetadataExtracter extracter;
+    private static final QName TIKA_MIMETYPE_TEST_PROPERTY =
+       QName.createQName("TikaMimeTypeTestProp");
+
+    @Override
+    public void setUp() throws Exception
+    {
+        super.setUp();
+        extracter = new TikaAutoMetadataExtracter();
+        extracter.setDictionaryService(dictionaryService);
+        extracter.register();
+        
+        // Attach some extra mappings, using the Tika
+        //  metadata keys namespace
+        // These will be tested later
+        HashMap<String, Set<QName>> newMap = new HashMap<String, Set<QName>>(
+              extracter.getMapping()
+        );
+        
+        Set<QName> tlaSet = new HashSet<QName>();
+        tlaSet.add(TIKA_MIMETYPE_TEST_PROPERTY);
+        newMap.put( Metadata.CONTENT_TYPE, tlaSet );
+        
+        extracter.setMapping(newMap);
+    }
+
+    /**
+     * @return Returns the same extracter regardless - it is allowed
+     */
+    protected MetadataExtracter getExtracter()
+    {
+        return extracter;
+    }
+
+    public void testSupports() throws Exception
+    {
+        ArrayList<String> mimeTypes = new ArrayList<String>();
+        for (Parser p : new Parser[] {
+                 new OfficeParser(), new OpenDocumentParser(),
+                 new Mp3Parser(), new OOXMLParser()
+        }) {
+           Set<MediaType> mts = p.getSupportedTypes(new ParseContext());
+           for (MediaType mt : mts) {
+              mimeTypes.add(mt.toString());
+           }
+        }
+        
+        for (String mimetype : mimeTypes)
+        {
+            boolean supports = extracter.isSupported(mimetype);
+            assertTrue("Mimetype should be supported: " + mimetype, supports);
+        }
+    }
+
+    /**
+     * Test several different files
+     * Note - doesn't use extractFromMimetype
+     */
+    public void testSupportedMimetypes() throws Exception
+    {
+        String[] testFiles = new String[] {
+              ".doc", ".docx", ".xls", ".xlsx",
+              ".ppt", ".pptx", 
+              //".vsd", // Not auto-detected properly yet
+              //"2010.dwg", // Not auto-detected properly yet
+              ".pdf",
+              ".odt"
+        };
+           
+        for (String fileBase : testFiles)
+        {
+           String filename = "quick" + fileBase;
+           URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename);
+           File file = new File(url.getFile());
+           
+           // Cheat and ask Tika for the mime type!
+           AutoDetectParser ap = new AutoDetectParser();
+           Metadata metadata = new Metadata();
+           metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+           MediaType mt = ap.getDetector().detect(
+                 new BufferedInputStream(new FileInputStream(file)), metadata);
+           String mimetype = mt.toString();
+
+           // Have it processed
+           Map<QName, Serializable> properties = extractFromFile(file, mimetype);
+           
+           // check we got something
+           assertFalse("extractFromMimetype should return at least some properties, " +
+           		"none found for " + mimetype + " - " + filename,
+              properties.isEmpty());
+           
+           // check common metadata
+           testCommonMetadata(mimetype, properties);
+           // check file-type specific metadata
+           testFileSpecificMetadata(mimetype, properties);
+        }
+    }
+    
+    @Override
+    protected boolean skipAuthorCheck(String mimetype) { return true; }
+
+   /**
+    * Check the author (except for OpenDocument files) and our extra Tika-keyed test property
+    */
+   protected void testFileSpecificMetadata(String mimetype,
+         Map<QName, Serializable> properties) {
+      
+      // Check for extra fields
+      // Author isn't there for the OpenDocument ones
+      if(mimetype.indexOf(".oasis.") == -1) {
+         assertEquals(
+               "Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
+               "Nevin Nollop",
+               DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
+      }
+      
+      // Ensure that we can also get things which are standard
+      //  Tika metadata properties, if we so choose to
+      assertTrue( 
+            "Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " not found for mimetype " + mimetype,
+            properties.containsKey(TIKA_MIMETYPE_TEST_PROPERTY)
+      );
+      // TODO - uncomment this when TIKA-391 is properly fixed
+//      assertEquals(
+//            "Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
+//            mimetype,
+//            DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_MIMETYPE_TEST_PROPERTY)));
+   }
+    
+}
--- a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java
@@ -35,6 +35,7 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
@@ -136,14 +137,13 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
    }
    
    /**
-     * Does auto-detection to select the best Tika
-     *  Parser.
-     * Implementations can override this if they
-     *  know their specific implementations.
+     * Returns the correct Tika Parser to process
+     *  the document.
+     * If you don't know which you want, use
+     *  {@link TikaAutoMetadataExtracter} which
+     *  makes use of the Tika auto-detection.
     */
-    protected Parser getParser() {
-       return null;
-    }
+    protected abstract Parser getParser();
    
    /**
     * Allows implementation specific mappings
--- a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.properties
+++ b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.properties
@@ -1,13 +0,0 @@
-#
-# TikaPoweredMetadataExtracter - default mapping
-#
-# author: Nick Burch
-
-# Namespaces
-namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
-
-# Mappings
-author=cm:author
-title=cm:title
-description=cm:description
-created=cm:created