Tika for metadata extraction

Convert some more metadata extractors to use Tika, and enable the use of
 the Tika auto-detection parser on any documents without an explicitly
 defined extractor.


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20667 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-06-16 14:09:46 +00:00
parent b08d9ff412
commit 0e19812dbc
11 changed files with 354 additions and 184 deletions

View File

@@ -200,13 +200,15 @@
</bean> </bean>
<!-- Content Metadata Extractors --> <!-- Content Metadata Extractors -->
<!-- The last one listed for any mimetype will be used if available -->
<!-- As such, the Tika auto-detect fallback should be listed first -->
<bean id="extracter.TikaAuto" class="org.alfresco.repo.content.metadata.TikaAutoMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.PDFBox" class="org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.PDFBox" class="org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.Poi" class="org.alfresco.repo.content.metadata.PoiMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.Poi" class="org.alfresco.repo.content.metadata.PoiMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.Office" class="org.alfresco.repo.content.metadata.OfficeMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.Office" class="org.alfresco.repo.content.metadata.OfficeMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.Mail" class="org.alfresco.repo.content.metadata.MailMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.Mail" class="org.alfresco.repo.content.metadata.MailMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.Html" class="org.alfresco.repo.content.metadata.HtmlMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.Html" class="org.alfresco.repo.content.metadata.HtmlMetadataExtracter" parent="baseMetadataExtracter" />
<!-- Unsupported experimental extractor commented out --> <bean id="extracter.MP3" class="org.alfresco.repo.content.metadata.MP3MetadataExtracter" parent="baseMetadataExtracter" />
<!-- <bean id="extracter.MP3" class="org.alfresco.repo.content.metadata.MP3MetadataExtracter" parent="baseMetadataExtracter" /> -->
<bean id="extracter.OpenDocument" class="org.alfresco.repo.content.metadata.OpenDocumentMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.OpenDocument" class="org.alfresco.repo.content.metadata.OpenDocumentMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.DWG" class="org.alfresco.repo.content.metadata.DWGMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.DWG" class="org.alfresco.repo.content.metadata.DWGMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.RFC822" class="org.alfresco.repo.content.metadata.RFC822MetadataExtracter" parent="baseMetadataExtracter" > <bean id="extracter.RFC822" class="org.alfresco.repo.content.metadata.RFC822MetadataExtracter" parent="baseMetadataExtracter" >

View File

@@ -199,6 +199,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
} }
/** /**
* TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
* @return Returns <code>1.0</code> if the mimetype is supported, otherwise <tt>0.0</tt> * @return Returns <code>1.0</code> if the mimetype is supported, otherwise <tt>0.0</tt>
* *
* @see #isSupported(String) * @see #isSupported(String)
@@ -209,10 +210,11 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
} }
/** /**
* Set the policy to use when existing values are encountered. Depending on how the extracer * Set the policy to use when existing values are encountered. Depending on how the extractor
* is called, this may not be relevant, i.e an empty map of existing properties may be passed * is called, this may not be relevant, i.e an empty map of existing properties may be passed
* in by the client code, which may follow its own overwrite strategy. * in by the client code, which may follow its own overwrite strategy.
* *
* TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
* @param overwritePolicy the policy to apply when there are existing system properties * @param overwritePolicy the policy to apply when there are existing system properties
*/ */
public void setOverwritePolicy(OverwritePolicy overwritePolicy) public void setOverwritePolicy(OverwritePolicy overwritePolicy)
@@ -221,10 +223,11 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
} }
/** /**
* Set the policy to use when existing values are encountered. Depending on how the extracer * Set the policy to use when existing values are encountered. Depending on how the extractor
* is called, this may not be relevant, i.e an empty map of existing properties may be passed * is called, this may not be relevant, i.e an empty map of existing properties may be passed
* in by the client code, which may follow its own overwrite strategy. * in by the client code, which may follow its own overwrite strategy.
* *
* TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
* @param overwritePolicyStr the policy to apply when there are existing system properties * @param overwritePolicyStr the policy to apply when there are existing system properties
*/ */
public void setOverwritePolicy(String overwritePolicyStr) public void setOverwritePolicy(String overwritePolicyStr)

View File

@@ -18,23 +18,15 @@
*/ */
package org.alfresco.repo.content.metadata; package org.alfresco.repo.content.metadata;
import java.io.File;
import java.io.Serializable; import java.io.Serializable;
import java.util.Arrays; import java.util.ArrayList;
import java.util.HashSet;
import java.util.Map; import java.util.Map;
import org.alfresco.repo.content.MimetypeMap; import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader; import org.apache.tika.metadata.Metadata;
import org.alfresco.util.TempFileProvider; import org.apache.tika.metadata.XMPDM;
import org.farng.mp3.AbstractMP3FragmentBody; import org.apache.tika.parser.Parser;
import org.farng.mp3.MP3File; import org.apache.tika.parser.mp3.Mp3Parser;
import org.farng.mp3.id3.AbstractID3v2;
import org.farng.mp3.id3.AbstractID3v2Frame;
import org.farng.mp3.id3.ID3v1;
import org.farng.mp3.lyrics3.AbstractLyrics3;
import org.farng.mp3.lyrics3.Lyrics3v2;
import org.farng.mp3.lyrics3.Lyrics3v2Field;
/** /**
* Extracts the following values from MP3 files: * Extracts the following values from MP3 files:
@@ -51,13 +43,15 @@ import org.farng.mp3.lyrics3.Lyrics3v2Field;
* <b>lyrics:</b> -- {music}lyrics * <b>lyrics:</b> -- {music}lyrics
* </pre> * </pre>
* *
* TIKA Note - title and author go in metadata, but much of the * TODO Get hold of a mp3 file with some lyrics in it, so we
* rest is only in the text. Some of the ID3v2 parts * can contribute the patch to Tika
* (composer, lyrics) are not yet implemented.
* *
* Uses Apache Tika
*
* @author Nick Burch
* @author Roy Wetherall * @author Roy Wetherall
*/ */
public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter public class MP3MetadataExtracter extends TikaPoweredMetadataExtracter
{ {
private static final String KEY_SONG_TITLE = "songTitle"; private static final String KEY_SONG_TITLE = "songTitle";
private static final String KEY_ALBUM_TITLE = "albumTitle"; private static final String KEY_ALBUM_TITLE = "albumTitle";
@@ -70,173 +64,67 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
private static final String KEY_COMPOSER = "composer"; private static final String KEY_COMPOSER = "composer";
private static final String KEY_LYRICS = "lyrics"; private static final String KEY_LYRICS = "lyrics";
public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_MP3 }; public static ArrayList<String> SUPPORTED_MIMETYPES = buildSupportedMimetypes(
new String[] { MimetypeMap.MIMETYPE_MP3 },
new Mp3Parser()
);
public MP3MetadataExtracter() public MP3MetadataExtracter()
{ {
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES))); super(SUPPORTED_MIMETYPES);
}
@Override
protected Parser getParser() {
return new Mp3Parser();
} }
@Override @Override
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable protected Map<String, Serializable> extractSpecific(Metadata metadata,
{ Map<String, Serializable> properties) {
Map<String, Serializable> rawProperties = newRawMap(); putRawValue(KEY_ALBUM_TITLE, metadata.get(XMPDM.ALBUM), properties);
putRawValue(KEY_SONG_TITLE, metadata.get(Metadata.TITLE), properties);
// Create a temp file putRawValue(KEY_ARTIST, metadata.get(XMPDM.ARTIST), properties);
File tempFile = TempFileProvider.createTempFile("MP3MetadataExtracter_", ".tmp"); putRawValue(KEY_COMMENT, metadata.get(XMPDM.LOG_COMMENT), properties);
try putRawValue(KEY_TRACK_NUMBER, metadata.get(XMPDM.TRACK_NUMBER), properties);
{ putRawValue(KEY_GENRE, metadata.get(XMPDM.GENRE), properties);
reader.getContent(tempFile); putRawValue(KEY_YEAR_RELEASED, metadata.get(XMPDM.RELEASE_DATE), properties);
putRawValue(KEY_COMPOSER, metadata.get(XMPDM.COMPOSER), properties);
// Create the MP3 object from the file // TODO lyrics
// Open it read only as we won't make any changes //putRawValue(KEY_LYRICS, getLyrics(), properties);
MP3File mp3File = new MP3File(tempFile, false);
putRawValue(KEY_DESCRIPTION, generateDescription(metadata), properties);
ID3v1 id3v1 = mp3File.getID3v1Tag();
if (id3v1 != null) return properties;
{
putRawValue(KEY_ALBUM_TITLE, id3v1.getAlbum(), rawProperties);
putRawValue(KEY_SONG_TITLE, id3v1.getTitle(), rawProperties);
putRawValue(KEY_ARTIST, id3v1.getArtist(), rawProperties);
putRawValue(KEY_COMMENT, id3v1.getComment(), rawProperties);
putRawValue(KEY_YEAR_RELEASED, id3v1.getYear(), rawProperties);
// TODO sort out the genre
//putRawValue(MusicModel.KEY_GENRE, id3v1.getGenre());
// TODO sort out the size
//putRawValue(MusicModel.KEY_SIZE, id3v1.getSize());
}
AbstractID3v2 id3v2 = mp3File.getID3v2Tag();
if (id3v2 != null)
{
putRawValue(KEY_SONG_TITLE, getID3V2Value(id3v2, "TIT2"), rawProperties);
putRawValue(KEY_ARTIST, getID3V2Value(id3v2, "TPE1"), rawProperties);
putRawValue(KEY_ALBUM_TITLE, getID3V2Value(id3v2, "TALB"), rawProperties);
putRawValue(KEY_YEAR_RELEASED, getID3V2Value(id3v2, "TDRC"), rawProperties);
putRawValue(KEY_COMMENT, getID3V2Value(id3v2, "COMM"), rawProperties);
putRawValue(KEY_TRACK_NUMBER, getID3V2Value(id3v2, "TRCK"), rawProperties);
putRawValue(KEY_GENRE, getID3V2Value(id3v2, "TCON"), rawProperties);
putRawValue(KEY_COMPOSER, getID3V2Value(id3v2, "TCOM"), rawProperties);
// TODO sort out the lyrics
//System.out.println("Lyrics: " + getID3V2Value(id3v2, "SYLT"));
//System.out.println("Lyrics: " + getID3V2Value(id3v2, "USLT"));
}
AbstractLyrics3 lyrics3Tag = mp3File.getLyrics3Tag();
if (lyrics3Tag != null)
{
System.out.println("Lyrics3 tag found.");
if (lyrics3Tag instanceof Lyrics3v2)
{
putRawValue(KEY_SONG_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TIT2"), rawProperties);
putRawValue(KEY_ARTIST, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TPE1"), rawProperties);
putRawValue(KEY_ALBUM_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TALB"), rawProperties);
putRawValue(KEY_COMMENT, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "COMM"), rawProperties);
putRawValue(KEY_LYRICS, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "SYLT"), rawProperties);
putRawValue(KEY_COMPOSER, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TCOM"), rawProperties);
}
}
}
catch(Exception e)
{
if (logger.isDebugEnabled())
{
logger.debug(
"MP3 Metadata extraction failed: \n" +
" Content: " + reader,
e);
}
else
{
logger.warn(
"MP3 Metadata extraction failed (turn on DEBUG for full error): \n" +
" Content: " + reader + "\n" +
" Failure: " + e.getMessage());
}
}
finally
{
tempFile.delete();
}
String description = getDescription(rawProperties);
if (description != null)
{
putRawValue(KEY_DESCRIPTION, description, rawProperties);
}
// Done
return rawProperties;
} }
/** /**
* Generate the description * Generate the description
* *
* @param props the properties extracted from the file * @param props the properties extracted from the file
* @return the description * @return the description
*/ */
private String getDescription(Map<String, Serializable> props) private String generateDescription(Metadata metadata)
{ {
StringBuilder result = new StringBuilder(); StringBuilder result = new StringBuilder();
if (props.get(KEY_SONG_TITLE) != null) if (metadata.get(Metadata.TITLE) != null)
{ {
result.append(props.get(KEY_SONG_TITLE)); result.append(metadata.get(Metadata.TITLE));
if (props.get(KEY_ALBUM_TITLE) != null) if (metadata.get(XMPDM.ALBUM) != null)
{ {
result result
.append(" - ") .append(" - ")
.append(props.get(KEY_ALBUM_TITLE)); .append(metadata.get(XMPDM.ALBUM));
} }
if (props.get(KEY_ARTIST) != null) if (metadata.get(XMPDM.ARTIST) != null)
{ {
result result
.append(" (") .append(" (")
.append(props.get(KEY_ARTIST)) .append(metadata.get(XMPDM.ARTIST))
.append(")"); .append(")");
} }
} }
return result.toString(); return result.toString();
} }
private String getLyrics3v2Value(Lyrics3v2 lyrics3Tag, String name)
{
String result = "";
Lyrics3v2Field field = lyrics3Tag.getField(name);
if (field != null)
{
AbstractMP3FragmentBody body = field.getBody();
if (body != null)
{
result = (String)body.getObject("Text");
}
}
return result;
}
/**
* Get the ID3V2 tag value in a safe way
*/
private String getID3V2Value(AbstractID3v2 id3v2, String name)
{
String result = "";
AbstractID3v2Frame frame = id3v2.getFrame(name);
if (frame != null)
{
AbstractMP3FragmentBody body = frame.getBody();
if (body != null)
{
result = (String)body.getObject("Text");
}
}
return result;
}
} }

View File

@@ -29,6 +29,9 @@ import java.util.Map;
import org.alfresco.repo.content.MimetypeMap; import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.ContentReader;
import org.apache.poi.hsmf.MAPIMessage; import org.apache.poi.hsmf.MAPIMessage;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
//import org.apache.tika.parser.microsoft.OutlookExtractor; // TODO fix import
/** /**
* Outlook MAPI format email meta-data extractor extracting the following values: * Outlook MAPI format email meta-data extractor extracting the following values:
@@ -63,10 +66,24 @@ public class MailMetadataExtracter extends TikaPoweredMetadataExtracter
{ {
super(SUPPORTED_MIMETYPES); super(SUPPORTED_MIMETYPES);
} }
@Override
protected Parser getParser() {
//return new OutlookExtractor(); // TODO fix import
return null;
}
@Override
protected Map<String, Serializable> extractSpecific(Metadata metadata,
Map<String, Serializable> properties) {
// TODO move things from extractRaw to here
return properties;
}
@Override @Override
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
{ {
// TODO remove this in favour of extractSpecific
final Map<String, Serializable> rawProperties = newRawMap(); final Map<String, Serializable> rawProperties = newRawMap();
InputStream is = null; InputStream is = null;

View File

@@ -18,12 +18,8 @@
*/ */
package org.alfresco.repo.content.metadata; package org.alfresco.repo.content.metadata;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map; import java.util.Map;
import org.alfresco.repo.content.MimetypeMap; import org.alfresco.repo.content.MimetypeMap;

View File

@@ -39,6 +39,7 @@ import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
* *
* Uses Apache Tika * Uses Apache Tika
* *
* @author Nick Burch
* @author Neil McErlean * @author Neil McErlean
*/ */
public class PoiMetadataExtracter extends TikaPoweredMetadataExtracter public class PoiMetadataExtracter extends TikaPoweredMetadataExtracter

View File

@@ -0,0 +1,73 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.metadata;
import java.util.ArrayList;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
/**
* A Metadata Extractor which makes use of the Apache
* Tika auto-detection to select the best parser
* to extract the metadata from your document.
* This will be used for all files which Tika can
* handle, but where no other more explicit
* extractor is defined.
* <pre>
* <b>author:</b> -- cm:author
* <b>title:</b> -- cm:title
* <b>subject:</b> -- cm:description
* <b>created:</b> -- cm:created
* <b>comments:</b>
* </pre>
*
* @author Nick Burch
*/
public class TikaAutoMetadataExtracter extends TikaPoweredMetadataExtracter
{
    protected static Log logger = LogFactory.getLog(TikaAutoMetadataExtracter.class);

    /**
     * A single, shared auto-detect parser. The Tika {@code Parser} contract
     * requires implementations to be thread-safe, and constructing an
     * {@link AutoDetectParser} is comparatively expensive (it discovers and
     * instantiates every available parser), so we build it once and reuse it
     * both for building the supported-mimetype list and for extraction.
     */
    private static final AutoDetectParser AUTO_DETECT_PARSER = new AutoDetectParser();

    /** Every mimetype that the Tika auto-detection parser can handle. */
    public static ArrayList<String> SUPPORTED_MIMETYPES;
    static {
        SUPPORTED_MIMETYPES = new ArrayList<String>();
        for(MediaType mt : AUTO_DETECT_PARSER.getParsers().keySet()) {
            SUPPORTED_MIMETYPES.add( mt.toString() );
        }
    }

    public TikaAutoMetadataExtracter()
    {
        super(SUPPORTED_MIMETYPES);
    }

    /**
     * Does auto-detection to select the best Tika
     * Parser for the supplied document.
     */
    @Override
    protected Parser getParser() {
        return AUTO_DETECT_PARSER;
    }
}

View File

@@ -0,0 +1,18 @@
#
# TikaAutoMetadataExtracter - default mapping
#
# This is used to map from the Tika and standard namespaces
# onto your content model. This will be used for any
# content for which an explicit extractor isn't defined,
# by using Tika's auto-selection facilities.
#
# author: Nick Burch
# Namespaces
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
# Mappings
author=cm:author
title=cm:title
description=cm:description
created=cm:created

View File

@@ -0,0 +1,185 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.metadata;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.Serializable;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.namespace.QName;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.dwg.DWGParser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.apache.tika.parser.mp3.Mp3Parser;
import org.apache.tika.parser.odf.OpenDocumentParser;
/**
* @see TikaAutoMetadataExtracter
*
* @author Nick Burch
*/
/**
 * Tests the Tika auto-detecting metadata extractor against a range of
 * sample documents, and checks that raw Tika metadata keys can also be
 * mapped onto content-model properties.
 *
 * @see TikaAutoMetadataExtracter
 *
 * @author Nick Burch
 */
public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
{
    private TikaAutoMetadataExtracter extracter;

    // A property mapped from the raw Tika "Content-Type" metadata key, used
    // to prove that non-standard (Tika namespace) keys can be mapped too
    private static final QName TIKA_MIMETYPE_TEST_PROPERTY =
        QName.createQName("TikaMimeTypeTestProp");

    @Override
    public void setUp() throws Exception
    {
        super.setUp();
        extracter = new TikaAutoMetadataExtracter();
        extracter.setDictionaryService(dictionaryService);
        extracter.register();

        // Attach some extra mappings, using the Tika
        //  metadata keys namespace
        // These will be tested later
        HashMap<String, Set<QName>> newMap = new HashMap<String, Set<QName>>(
                extracter.getMapping()
        );

        Set<QName> tlaSet = new HashSet<QName>();
        tlaSet.add(TIKA_MIMETYPE_TEST_PROPERTY);
        newMap.put( Metadata.CONTENT_TYPE, tlaSet );

        extracter.setMapping(newMap);
    }

    /**
     * @return Returns the same transformer regardless - it is allowed
     */
    protected MetadataExtracter getExtracter()
    {
        return extracter;
    }

    /**
     * Checks that every mimetype handled by a sample of well-known Tika
     * parsers is reported as supported by the auto-detect extractor.
     */
    public void testSupports() throws Exception
    {
        ArrayList<String> mimeTypes = new ArrayList<String>();
        for (Parser p : new Parser[] {
                 new OfficeParser(), new OpenDocumentParser(),
                 new Mp3Parser(), new OOXMLParser()
        }) {
            Set<MediaType> mts = p.getSupportedTypes(new ParseContext());
            for (MediaType mt : mts)
            {
                mimeTypes.add(mt.toString());
            }
        }

        for (String mimetype : mimeTypes)
        {
            boolean supports = extracter.isSupported(mimetype);
            assertTrue("Mimetype should be supported: " + mimetype, supports);
        }
    }

    /**
     * Test several different files
     * Note - doesn't use extractFromMimetype
     */
    public void testSupportedMimetypes() throws Exception
    {
        String[] testFiles = new String[] {
              ".doc", ".docx", ".xls", ".xlsx",
              ".ppt", ".pptx",
              //".vsd", // Not auto-detected properly yet
              //"2010.dwg", // Not auto-detected properly yet
              ".pdf",
              ".odt"
        };

        for (String fileBase : testFiles)
        {
            String filename = "quick" + fileBase;
            URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename);
            File file = new File(url.getFile());

            // Cheat and ask Tika for the mime type!
            AutoDetectParser ap = new AutoDetectParser();
            Metadata metadata = new Metadata();
            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
            // Make sure the stream is always closed, even if detection fails
            BufferedInputStream is = new BufferedInputStream(new FileInputStream(file));
            MediaType mt;
            try
            {
                mt = ap.getDetector().detect(is, metadata);
            }
            finally
            {
                is.close();
            }
            String mimetype = mt.toString();

            // Have it processed
            Map<QName, Serializable> properties = extractFromFile(file, mimetype);

            // check we got something
            assertFalse("extractFromMimetype should return at least some properties, " +
               "none found for " + mimetype + " - " + filename,
               properties.isEmpty());

            // check common metadata
            testCommonMetadata(mimetype, properties);
            // check file-type specific metadata
            testFileSpecificMetadata(mimetype, properties);
        }
    }

    @Override
    protected boolean skipAuthorCheck(String mimetype) { return true; }

    /**
     * We also provide the creation date - check that
     */
    protected void testFileSpecificMetadata(String mimetype,
         Map<QName, Serializable> properties) {

        // Check for extra fields
        // Author isn't there for the OpenDocument ones
        if(mimetype.indexOf(".oasis.") == -1) {
            assertEquals(
                 "Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
                 "Nevin Nollop",
                 DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
        }

        // Ensure that we can also get things which are standard
        //  Tika metadata properties, if we so choose to
        assertTrue(
              "Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " not found for mimetype " + mimetype,
              properties.containsKey(TIKA_MIMETYPE_TEST_PROPERTY)
        );

        // TODO - uncomment this when TIKA-391 is properly fixed
        //        (detection should yield the exact mimetype we asked for)
        //  assertEquals(
        //        "Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
        //        mimetype,
        //        DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_MIMETYPE_TEST_PROPERTY)));
    }
}

View File

@@ -35,6 +35,7 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser; import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.BodyContentHandler;
@@ -136,14 +137,13 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
} }
/** /**
* Does auto-detection to select the best Tika * Returns the correct Tika Parser to process
* Parser. * the document.
* Implementations can override this if they * If you don't know which you want, use
* know their specific implementations. * {@link TikaAutoMetadataExtracter} which
* makes use of the Tika auto-detection.
*/ */
protected Parser getParser() { protected abstract Parser getParser();
return null;
}
/** /**
* Allows implementation specific mappings * Allows implementation specific mappings

View File

@@ -1,13 +0,0 @@
#
# TikaPoweredMetadataExtracter - default mapping
#
# author: Nick Burch
# Namespaces
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
# Mappings
author=cm:author
title=cm:title
description=cm:description
created=cm:created