diff --git a/config/alfresco/content-services-context.xml b/config/alfresco/content-services-context.xml index 9aead1462d..3cc790d736 100644 --- a/config/alfresco/content-services-context.xml +++ b/config/alfresco/content-services-context.xml @@ -200,13 +200,15 @@ + + + - - + diff --git a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java index 9e521623bd..118c94132f 100644 --- a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java @@ -199,6 +199,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac } /** + * TODO - This doesn't appear to be used, so should be removed / deprecated / replaced * @return Returns 1.0 if the mimetype is supported, otherwise 0.0 * * @see #isSupported(String) @@ -209,10 +210,11 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac } /** - * Set the policy to use when existing values are encountered. Depending on how the extracer + * Set the policy to use when existing values are encountered. Depending on how the extractor * is called, this may not be relevant, i.e an empty map of existing properties may be passed * in by the client code, which may follow its own overwrite strategy. * + * TODO - This doesn't appear to be used, so should be removed / deprecated / replaced * @param overwritePolicy the policy to apply when there are existing system properties */ public void setOverwritePolicy(OverwritePolicy overwritePolicy) @@ -221,10 +223,11 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac } /** - * Set the policy to use when existing values are encountered. Depending on how the extracer + * Set the policy to use when existing values are encountered. 
Depending on how the extractor * is called, this may not be relevant, i.e an empty map of existing properties may be passed * in by the client code, which may follow its own overwrite strategy. * + * TODO - This doesn't appear to be used, so should be removed / deprecated / replaced * @param overwritePolicyStr the policy to apply when there are existing system properties */ public void setOverwritePolicy(String overwritePolicyStr) diff --git a/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java index 3e6708252c..b9921e6244 100644 --- a/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java @@ -18,23 +18,15 @@ */ package org.alfresco.repo.content.metadata; -import java.io.File; import java.io.Serializable; -import java.util.Arrays; -import java.util.HashSet; +import java.util.ArrayList; import java.util.Map; import org.alfresco.repo.content.MimetypeMap; -import org.alfresco.service.cmr.repository.ContentReader; -import org.alfresco.util.TempFileProvider; -import org.farng.mp3.AbstractMP3FragmentBody; -import org.farng.mp3.MP3File; -import org.farng.mp3.id3.AbstractID3v2; -import org.farng.mp3.id3.AbstractID3v2Frame; -import org.farng.mp3.id3.ID3v1; -import org.farng.mp3.lyrics3.AbstractLyrics3; -import org.farng.mp3.lyrics3.Lyrics3v2; -import org.farng.mp3.lyrics3.Lyrics3v2Field; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.XMPDM; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.mp3.Mp3Parser; /** * Extracts the following values from MP3 files: @@ -51,13 +43,15 @@ import org.farng.mp3.lyrics3.Lyrics3v2Field; * lyrics: -- {music}lyrics * * - * TIKA Note - title and author go in metadata, but much of the - * rest is only in the text. Some of the ID3v2 parts - * (composer, lyrics) are not yet implemented. 
+ * TODO Get hold of a mp3 file with some lyrics in it, so we + * can contribute the patch to Tika * + * Uses Apache Tika + * + * @author Nick Burch * @author Roy Wetherall */ -public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter +public class MP3MetadataExtracter extends TikaPoweredMetadataExtracter { private static final String KEY_SONG_TITLE = "songTitle"; private static final String KEY_ALBUM_TITLE = "albumTitle"; @@ -70,173 +64,67 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter private static final String KEY_COMPOSER = "composer"; private static final String KEY_LYRICS = "lyrics"; - public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_MP3 }; + public static ArrayList SUPPORTED_MIMETYPES = buildSupportedMimetypes( + new String[] { MimetypeMap.MIMETYPE_MP3 }, + new Mp3Parser() + ); public MP3MetadataExtracter() { - super(new HashSet(Arrays.asList(SUPPORTED_MIMETYPES))); + super(SUPPORTED_MIMETYPES); + } + + @Override + protected Parser getParser() { + return new Mp3Parser(); } @Override - public Map extractRaw(ContentReader reader) throws Throwable - { - Map rawProperties = newRawMap(); - - // Create a temp file - File tempFile = TempFileProvider.createTempFile("MP3MetadataExtracter_", ".tmp"); - try - { - reader.getContent(tempFile); - - // Create the MP3 object from the file - // Open it read only as we won't make any changes - MP3File mp3File = new MP3File(tempFile, false); - - ID3v1 id3v1 = mp3File.getID3v1Tag(); - if (id3v1 != null) - { - putRawValue(KEY_ALBUM_TITLE, id3v1.getAlbum(), rawProperties); - putRawValue(KEY_SONG_TITLE, id3v1.getTitle(), rawProperties); - putRawValue(KEY_ARTIST, id3v1.getArtist(), rawProperties); - putRawValue(KEY_COMMENT, id3v1.getComment(), rawProperties); - putRawValue(KEY_YEAR_RELEASED, id3v1.getYear(), rawProperties); - - // TODO sort out the genre - //putRawValue(MusicModel.KEY_GENRE, id3v1.getGenre()); - - // TODO sort out the size - 
//putRawValue(MusicModel.KEY_SIZE, id3v1.getSize()); - } - - AbstractID3v2 id3v2 = mp3File.getID3v2Tag(); - if (id3v2 != null) - { - putRawValue(KEY_SONG_TITLE, getID3V2Value(id3v2, "TIT2"), rawProperties); - putRawValue(KEY_ARTIST, getID3V2Value(id3v2, "TPE1"), rawProperties); - putRawValue(KEY_ALBUM_TITLE, getID3V2Value(id3v2, "TALB"), rawProperties); - putRawValue(KEY_YEAR_RELEASED, getID3V2Value(id3v2, "TDRC"), rawProperties); - putRawValue(KEY_COMMENT, getID3V2Value(id3v2, "COMM"), rawProperties); - putRawValue(KEY_TRACK_NUMBER, getID3V2Value(id3v2, "TRCK"), rawProperties); - putRawValue(KEY_GENRE, getID3V2Value(id3v2, "TCON"), rawProperties); - putRawValue(KEY_COMPOSER, getID3V2Value(id3v2, "TCOM"), rawProperties); - - // TODO sort out the lyrics - //System.out.println("Lyrics: " + getID3V2Value(id3v2, "SYLT")); - //System.out.println("Lyrics: " + getID3V2Value(id3v2, "USLT")); - } - - AbstractLyrics3 lyrics3Tag = mp3File.getLyrics3Tag(); - if (lyrics3Tag != null) - { - System.out.println("Lyrics3 tag found."); - if (lyrics3Tag instanceof Lyrics3v2) - { - putRawValue(KEY_SONG_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TIT2"), rawProperties); - putRawValue(KEY_ARTIST, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TPE1"), rawProperties); - putRawValue(KEY_ALBUM_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TALB"), rawProperties); - putRawValue(KEY_COMMENT, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "COMM"), rawProperties); - putRawValue(KEY_LYRICS, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "SYLT"), rawProperties); - putRawValue(KEY_COMPOSER, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TCOM"), rawProperties); - } - } - - } - catch(Exception e) - { - if (logger.isDebugEnabled()) - { - logger.debug( - "MP3 Metadata extraction failed: \n" + - " Content: " + reader, - e); - } - else - { - logger.warn( - "MP3 Metadata extraction failed (turn on DEBUG for full error): \n" + - " Content: " + reader + "\n" + - " Failure: " + e.getMessage()); - } - - } - finally - { - 
tempFile.delete(); - } - - String description = getDescription(rawProperties); - if (description != null) - { - putRawValue(KEY_DESCRIPTION, description, rawProperties); - } - - // Done - return rawProperties; + protected Map extractSpecific(Metadata metadata, + Map properties) { + putRawValue(KEY_ALBUM_TITLE, metadata.get(XMPDM.ALBUM), properties); + putRawValue(KEY_SONG_TITLE, metadata.get(Metadata.TITLE), properties); + putRawValue(KEY_ARTIST, metadata.get(XMPDM.ARTIST), properties); + putRawValue(KEY_COMMENT, metadata.get(XMPDM.LOG_COMMENT), properties); + putRawValue(KEY_TRACK_NUMBER, metadata.get(XMPDM.TRACK_NUMBER), properties); + putRawValue(KEY_GENRE, metadata.get(XMPDM.GENRE), properties); + putRawValue(KEY_YEAR_RELEASED, metadata.get(XMPDM.RELEASE_DATE), properties); + putRawValue(KEY_COMPOSER, metadata.get(XMPDM.COMPOSER), properties); + // TODO lyrics + //putRawValue(KEY_LYRICS, getLyrics(), properties); + + putRawValue(KEY_DESCRIPTION, generateDescription(metadata), properties); + + return properties; } - /** * Generate the description * * @param props the properties extracted from the file * @return the description */ - private String getDescription(Map props) + private String generateDescription(Metadata metadata) { StringBuilder result = new StringBuilder(); - if (props.get(KEY_SONG_TITLE) != null) + if (metadata.get(Metadata.TITLE) != null) { - result.append(props.get(KEY_SONG_TITLE)); - if (props.get(KEY_ALBUM_TITLE) != null) + result.append(metadata.get(Metadata.TITLE)); + if (metadata.get(XMPDM.ALBUM) != null) { result .append(" - ") - .append(props.get(KEY_ALBUM_TITLE)); + .append(metadata.get(XMPDM.ALBUM)); } - if (props.get(KEY_ARTIST) != null) + if (metadata.get(XMPDM.ARTIST) != null) { result .append(" (") - .append(props.get(KEY_ARTIST)) + .append(metadata.get(XMPDM.ARTIST)) .append(")"); } } return result.toString(); } - - private String getLyrics3v2Value(Lyrics3v2 lyrics3Tag, String name) - { - String result = ""; - Lyrics3v2Field field 
= lyrics3Tag.getField(name); - if (field != null) - { - AbstractMP3FragmentBody body = field.getBody(); - if (body != null) - { - result = (String)body.getObject("Text"); - } - } - return result; - } - - /** - * Get the ID3V2 tag value in a safe way - */ - private String getID3V2Value(AbstractID3v2 id3v2, String name) - { - String result = ""; - - AbstractID3v2Frame frame = id3v2.getFrame(name); - if (frame != null) - { - AbstractMP3FragmentBody body = frame.getBody(); - if (body != null) - { - result = (String)body.getObject("Text"); - } - } - - return result; - } } diff --git a/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java index bc972220cd..a8f9700da4 100644 --- a/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java @@ -29,6 +29,9 @@ import java.util.Map; import org.alfresco.repo.content.MimetypeMap; import org.alfresco.service.cmr.repository.ContentReader; import org.apache.poi.hsmf.MAPIMessage; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.Parser; +//import org.apache.tika.parser.microsoft.OutlookExtractor; // TODO fix import /** * Outlook MAPI format email meta-data extractor extracting the following values: @@ -63,10 +66,24 @@ public class MailMetadataExtracter extends TikaPoweredMetadataExtracter { super(SUPPORTED_MIMETYPES); } + + @Override + protected Parser getParser() { + //return new OutlookExtractor(); // TODO fix import + return null; + } + + @Override + protected Map extractSpecific(Metadata metadata, + Map properties) { + // TODO move things from extractRaw to here + return properties; + } @Override public Map extractRaw(ContentReader reader) throws Throwable { + // TODO remove this in favour of extractSpecific final Map rawProperties = newRawMap(); InputStream is = null; diff --git 
a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java index a85657ac6c..bb88c51376 100644 --- a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java @@ -18,12 +18,8 @@ */ package org.alfresco.repo.content.metadata; -import java.io.IOException; -import java.io.InputStream; import java.io.Serializable; import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; import java.util.Map; import org.alfresco.repo.content.MimetypeMap; diff --git a/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.java index 965bd9d09c..656000b60e 100644 --- a/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.java @@ -39,6 +39,7 @@ import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; * * Uses Apache Tika * + * @author Nick Burch * @author Neil McErlean */ public class PoiMetadataExtracter extends TikaPoweredMetadataExtracter diff --git a/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.java new file mode 100644 index 0000000000..021889b730 --- /dev/null +++ b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.java @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2005-2010 Alfresco Software Limited. + * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see <http://www.gnu.org/licenses/>. + */ +package org.alfresco.repo.content.metadata; + +import java.util.ArrayList; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; + +/** + * A Metadata Extractor which makes use of the Apache + * Tika auto-detection to select the best parser + * to extract the metadata from your document. + * This will be used for all files which Tika can + * handle, but where no other more explicit + * extractor is defined. + + *
+ *   author:                 --      cm:author
+ *   title:                  --      cm:title
+ *   subject:                --      cm:description
+ *   created:                --      cm:created
+ *   comments:
+ * 
+ * + * @author Nick Burch + */ +public class TikaAutoMetadataExtracter extends TikaPoweredMetadataExtracter +{ + protected static Log logger = LogFactory.getLog(TikaAutoMetadataExtracter.class); + + public static ArrayList SUPPORTED_MIMETYPES; + static { + SUPPORTED_MIMETYPES = new ArrayList(); + AutoDetectParser p = new AutoDetectParser(); + for(MediaType mt : p.getParsers().keySet()) { + SUPPORTED_MIMETYPES.add( mt.toString() ); + } + } + + public TikaAutoMetadataExtracter() + { + super(SUPPORTED_MIMETYPES); + } + + /** + * Does auto-detection to select the best Tika + * Parser. + */ + @Override + protected Parser getParser() { + return new AutoDetectParser(); + } +} diff --git a/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.properties b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.properties new file mode 100644 index 0000000000..b0d67029d8 --- /dev/null +++ b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracter.properties @@ -0,0 +1,18 @@ +# +# TikaAutoMetadataExtracter - default mapping +# +# This is used to map from the Tika and standard namespaces +# onto your content model. This will be used for any +# content for which an explicit extractor isn't defined, +# by using Tika's auto-selection facilities. +# +# author: Nick Burch + +# Namespaces +namespace.prefix.cm=http://www.alfresco.org/model/content/1.0 + +# Mappings +author=cm:author +title=cm:title +description=cm:description +created=cm:created diff --git a/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java new file mode 100644 index 0000000000..70a19482f5 --- /dev/null +++ b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java @@ -0,0 +1,185 @@ +/* + * Copyright (C) 2005-2010 Alfresco Software Limited. 
+ * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see <http://www.gnu.org/licenses/>. + */ +package org.alfresco.repo.content.metadata; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.Serializable; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.alfresco.model.ContentModel; +import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.repo.content.transform.AbstractContentTransformerTest; +import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; +import org.alfresco.service.namespace.QName; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.dwg.DWGParser; +import org.apache.tika.parser.microsoft.OfficeParser; +import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; +import org.apache.tika.parser.mp3.Mp3Parser; +import org.apache.tika.parser.odf.OpenDocumentParser; + + +/** + * @see TikaAutoMetadataExtracter + * + * @author Nick Burch + */ +public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest +{ + private TikaAutoMetadataExtracter extracter; + private static final
QName TIKA_MIMETYPE_TEST_PROPERTY = + QName.createQName("TikaMimeTypeTestProp"); + + @Override + public void setUp() throws Exception + { + super.setUp(); + extracter = new TikaAutoMetadataExtracter(); + extracter.setDictionaryService(dictionaryService); + extracter.register(); + + // Attach some extra mappings, using the Tika + // metadata keys namespace + // These will be tested later + HashMap> newMap = new HashMap>( + extracter.getMapping() + ); + + Set tlaSet = new HashSet(); + tlaSet.add(TIKA_MIMETYPE_TEST_PROPERTY); + newMap.put( Metadata.CONTENT_TYPE, tlaSet ); + + extracter.setMapping(newMap); + } + + /** + * @return Returns the same transformer regardless - it is allowed + */ + protected MetadataExtracter getExtracter() + { + return extracter; + } + + public void testSupports() throws Exception + { + ArrayList mimeTypes = new ArrayList(); + for (Parser p : new Parser[] { + new OfficeParser(), new OpenDocumentParser(), + new Mp3Parser(), new OOXMLParser() + }) { + Set mts = p.getSupportedTypes(new ParseContext()); + for (MediaType mt : mts) { + mimeTypes.add(mt.toString()); + } + } + + for (String mimetype : mimeTypes) + { + boolean supports = extracter.isSupported(mimetype); + assertTrue("Mimetype should be supported: " + mimetype, supports); + } + } + + /** + * Test several different files + * Note - doesn't use extractFromMimetype + */ + public void testSupportedMimetypes() throws Exception + { + String[] testFiles = new String[] { + ".doc", ".docx", ".xls", ".xlsx", + ".ppt", ".pptx", + //".vsd", // Not auto-detected properly yet + //"2010.dwg", // Not auto-detected properly yet + ".pdf", + ".odt" + }; + + for (String fileBase : testFiles) + { + String filename = "quick" + fileBase; + URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename); + File file = new File(url.getFile()); + + // Cheat and ask Tika for the mime type! 
+ AutoDetectParser ap = new AutoDetectParser(); + Metadata metadata = new Metadata(); + metadata.set(Metadata.RESOURCE_NAME_KEY, filename); + MediaType mt = ap.getDetector().detect( + new BufferedInputStream(new FileInputStream(file)), metadata); + String mimetype = mt.toString(); + + // Have it processed + Map properties = extractFromFile(file, mimetype); + + // check we got something + assertFalse("extractFromMimetype should return at least some properties, " + + "none found for " + mimetype + " - " + filename, + properties.isEmpty()); + + // check common metadata + testCommonMetadata(mimetype, properties); + // check file-type specific metadata + testFileSpecificMetadata(mimetype, properties); + } + } + + @Override + protected boolean skipAuthorCheck(String mimetype) { return true; } + + /** + * We also provide the creation date - check that + */ + protected void testFileSpecificMetadata(String mimetype, + Map properties) { + + // Check for extra fields + // Author isn't there for the OpenDocument ones + if(mimetype.indexOf(".oasis.") == -1) { + assertEquals( + "Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype, + "Nevin Nollop", + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR))); + } + + // Ensure that we can also get things which are standard + // Tika metadata properties, if we so choose to + assertTrue( + "Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " not found for mimetype " + mimetype, + properties.containsKey(TIKA_MIMETYPE_TEST_PROPERTY) + ); + // TODO - uncomment this when TIKA-391 is properly fixed +// assertEquals( +// "Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " incorrect for mimetype " + mimetype, +// mimetype, +// DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_MIMETYPE_TEST_PROPERTY))); + } + +} diff --git a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java 
b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java index 04b805ebc6..e412f33abb 100644 --- a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java @@ -35,6 +35,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; @@ -136,14 +137,13 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada } /** - * Does auto-detection to select the best Tika - * Parser. - * Implementations can override this if they - * know their specific implementations. + * Returns the correct Tika Parser to process + * the document. + * If you don't know which you want, use + * {@link TikaAutoMetadataExtracter} which + * makes use of the Tika auto-detection. */ - protected Parser getParser() { - return null; - } + protected abstract Parser getParser(); /** * Allows implementation specific mappings diff --git a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.properties b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.properties deleted file mode 100644 index b0cdc22aa5..0000000000 --- a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.properties +++ /dev/null @@ -1,13 +0,0 @@ -# -# TikaPoweredMetadataExtracter - default mapping -# -# author: Nick Burch - -# Namespaces -namespace.prefix.cm=http://www.alfresco.org/model/content/1.0 - -# Mappings -author=cm:author -title=cm:title -description=cm:description -created=cm:created