Tika for metadata extraction

Convert some more metadata extractors to use Tika, and enable the use of
 the Tika auto-detection parser on any documents without an explicitly
 defined extractor.


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20667 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-06-16 14:09:46 +00:00
parent b08d9ff412
commit 0e19812dbc
11 changed files with 354 additions and 184 deletions

View File

@@ -200,13 +200,15 @@
</bean> </bean>
<!-- Content Metadata Extractors --> <!-- Content Metadata Extractors -->
<!-- The last one listed for any mimetype will be used if available -->
<!-- As such, the Tika auto-detect fallback should be listed first -->
<bean id="extracter.TikaAuto" class="org.alfresco.repo.content.metadata.TikaAutoMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.PDFBox" class="org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.PDFBox" class="org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.Poi" class="org.alfresco.repo.content.metadata.PoiMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.Poi" class="org.alfresco.repo.content.metadata.PoiMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.Office" class="org.alfresco.repo.content.metadata.OfficeMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.Office" class="org.alfresco.repo.content.metadata.OfficeMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.Mail" class="org.alfresco.repo.content.metadata.MailMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.Mail" class="org.alfresco.repo.content.metadata.MailMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.Html" class="org.alfresco.repo.content.metadata.HtmlMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.Html" class="org.alfresco.repo.content.metadata.HtmlMetadataExtracter" parent="baseMetadataExtracter" />
<!-- Unsupported experimental extractor commented out --> <bean id="extracter.MP3" class="org.alfresco.repo.content.metadata.MP3MetadataExtracter" parent="baseMetadataExtracter" />
<!-- <bean id="extracter.MP3" class="org.alfresco.repo.content.metadata.MP3MetadataExtracter" parent="baseMetadataExtracter" /> -->
<bean id="extracter.OpenDocument" class="org.alfresco.repo.content.metadata.OpenDocumentMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.OpenDocument" class="org.alfresco.repo.content.metadata.OpenDocumentMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.DWG" class="org.alfresco.repo.content.metadata.DWGMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.DWG" class="org.alfresco.repo.content.metadata.DWGMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.RFC822" class="org.alfresco.repo.content.metadata.RFC822MetadataExtracter" parent="baseMetadataExtracter" > <bean id="extracter.RFC822" class="org.alfresco.repo.content.metadata.RFC822MetadataExtracter" parent="baseMetadataExtracter" >

View File

@@ -199,6 +199,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
} }
/** /**
* TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
* @return Returns <code>1.0</code> if the mimetype is supported, otherwise <tt>0.0</tt> * @return Returns <code>1.0</code> if the mimetype is supported, otherwise <tt>0.0</tt>
* *
* @see #isSupported(String) * @see #isSupported(String)
@@ -209,10 +210,11 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
} }
/** /**
* Set the policy to use when existing values are encountered. Depending on how the extracer * Set the policy to use when existing values are encountered. Depending on how the extractor
* is called, this may not be relevant, i.e an empty map of existing properties may be passed * is called, this may not be relevant, i.e an empty map of existing properties may be passed
* in by the client code, which may follow its own overwrite strategy. * in by the client code, which may follow its own overwrite strategy.
* *
* TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
* @param overwritePolicy the policy to apply when there are existing system properties * @param overwritePolicy the policy to apply when there are existing system properties
*/ */
public void setOverwritePolicy(OverwritePolicy overwritePolicy) public void setOverwritePolicy(OverwritePolicy overwritePolicy)
@@ -221,10 +223,11 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
} }
/** /**
* Set the policy to use when existing values are encountered. Depending on how the extracer * Set the policy to use when existing values are encountered. Depending on how the extractor
* is called, this may not be relevant, i.e an empty map of existing properties may be passed * is called, this may not be relevant, i.e an empty map of existing properties may be passed
* in by the client code, which may follow its own overwrite strategy. * in by the client code, which may follow its own overwrite strategy.
* *
* TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
* @param overwritePolicyStr the policy to apply when there are existing system properties * @param overwritePolicyStr the policy to apply when there are existing system properties
*/ */
public void setOverwritePolicy(String overwritePolicyStr) public void setOverwritePolicy(String overwritePolicyStr)

View File

@@ -18,23 +18,15 @@
*/ */
package org.alfresco.repo.content.metadata; package org.alfresco.repo.content.metadata;
import java.io.File;
import java.io.Serializable; import java.io.Serializable;
import java.util.Arrays; import java.util.ArrayList;
import java.util.HashSet;
import java.util.Map; import java.util.Map;
import org.alfresco.repo.content.MimetypeMap; import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader; import org.apache.tika.metadata.Metadata;
import org.alfresco.util.TempFileProvider; import org.apache.tika.metadata.XMPDM;
import org.farng.mp3.AbstractMP3FragmentBody; import org.apache.tika.parser.Parser;
import org.farng.mp3.MP3File; import org.apache.tika.parser.mp3.Mp3Parser;
import org.farng.mp3.id3.AbstractID3v2;
import org.farng.mp3.id3.AbstractID3v2Frame;
import org.farng.mp3.id3.ID3v1;
import org.farng.mp3.lyrics3.AbstractLyrics3;
import org.farng.mp3.lyrics3.Lyrics3v2;
import org.farng.mp3.lyrics3.Lyrics3v2Field;
/** /**
* Extracts the following values from MP3 files: * Extracts the following values from MP3 files:
@@ -51,13 +43,15 @@ import org.farng.mp3.lyrics3.Lyrics3v2Field;
* <b>lyrics:</b> -- {music}lyrics * <b>lyrics:</b> -- {music}lyrics
* </pre> * </pre>
* *
* TIKA Note - title and author go in metadata, but much of the * TODO Get hold of a mp3 file with some lyrics in it, so we
* rest is only in the text. Some of the ID3v2 parts * can contribute the patch to Tika
* (composer, lyrics) are not yet implemented.
* *
* Uses Apache Tika
*
* @author Nick Burch
* @author Roy Wetherall * @author Roy Wetherall
*/ */
public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter public class MP3MetadataExtracter extends TikaPoweredMetadataExtracter
{ {
private static final String KEY_SONG_TITLE = "songTitle"; private static final String KEY_SONG_TITLE = "songTitle";
private static final String KEY_ALBUM_TITLE = "albumTitle"; private static final String KEY_ALBUM_TITLE = "albumTitle";
@@ -70,173 +64,67 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
private static final String KEY_COMPOSER = "composer"; private static final String KEY_COMPOSER = "composer";
private static final String KEY_LYRICS = "lyrics"; private static final String KEY_LYRICS = "lyrics";
public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_MP3 }; public static ArrayList<String> SUPPORTED_MIMETYPES = buildSupportedMimetypes(
new String[] { MimetypeMap.MIMETYPE_MP3 },
new Mp3Parser()
);
public MP3MetadataExtracter() public MP3MetadataExtracter()
{ {
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES))); super(SUPPORTED_MIMETYPES);
}
@Override
protected Parser getParser() {
return new Mp3Parser();
} }
@Override @Override
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable protected Map<String, Serializable> extractSpecific(Metadata metadata,
{ Map<String, Serializable> properties) {
Map<String, Serializable> rawProperties = newRawMap(); putRawValue(KEY_ALBUM_TITLE, metadata.get(XMPDM.ALBUM), properties);
putRawValue(KEY_SONG_TITLE, metadata.get(Metadata.TITLE), properties);
// Create a temp file putRawValue(KEY_ARTIST, metadata.get(XMPDM.ARTIST), properties);
File tempFile = TempFileProvider.createTempFile("MP3MetadataExtracter_", ".tmp"); putRawValue(KEY_COMMENT, metadata.get(XMPDM.LOG_COMMENT), properties);
try putRawValue(KEY_TRACK_NUMBER, metadata.get(XMPDM.TRACK_NUMBER), properties);
{ putRawValue(KEY_GENRE, metadata.get(XMPDM.GENRE), properties);
reader.getContent(tempFile); putRawValue(KEY_YEAR_RELEASED, metadata.get(XMPDM.RELEASE_DATE), properties);
putRawValue(KEY_COMPOSER, metadata.get(XMPDM.COMPOSER), properties);
// Create the MP3 object from the file // TODO lyrics
// Open it read only as we won't make any changes //putRawValue(KEY_LYRICS, getLyrics(), properties);
MP3File mp3File = new MP3File(tempFile, false);
putRawValue(KEY_DESCRIPTION, generateDescription(metadata), properties);
ID3v1 id3v1 = mp3File.getID3v1Tag();
if (id3v1 != null) return properties;
{
putRawValue(KEY_ALBUM_TITLE, id3v1.getAlbum(), rawProperties);
putRawValue(KEY_SONG_TITLE, id3v1.getTitle(), rawProperties);
putRawValue(KEY_ARTIST, id3v1.getArtist(), rawProperties);
putRawValue(KEY_COMMENT, id3v1.getComment(), rawProperties);
putRawValue(KEY_YEAR_RELEASED, id3v1.getYear(), rawProperties);
// TODO sort out the genre
//putRawValue(MusicModel.KEY_GENRE, id3v1.getGenre());
// TODO sort out the size
//putRawValue(MusicModel.KEY_SIZE, id3v1.getSize());
}
AbstractID3v2 id3v2 = mp3File.getID3v2Tag();
if (id3v2 != null)
{
putRawValue(KEY_SONG_TITLE, getID3V2Value(id3v2, "TIT2"), rawProperties);
putRawValue(KEY_ARTIST, getID3V2Value(id3v2, "TPE1"), rawProperties);
putRawValue(KEY_ALBUM_TITLE, getID3V2Value(id3v2, "TALB"), rawProperties);
putRawValue(KEY_YEAR_RELEASED, getID3V2Value(id3v2, "TDRC"), rawProperties);
putRawValue(KEY_COMMENT, getID3V2Value(id3v2, "COMM"), rawProperties);
putRawValue(KEY_TRACK_NUMBER, getID3V2Value(id3v2, "TRCK"), rawProperties);
putRawValue(KEY_GENRE, getID3V2Value(id3v2, "TCON"), rawProperties);
putRawValue(KEY_COMPOSER, getID3V2Value(id3v2, "TCOM"), rawProperties);
// TODO sort out the lyrics
//System.out.println("Lyrics: " + getID3V2Value(id3v2, "SYLT"));
//System.out.println("Lyrics: " + getID3V2Value(id3v2, "USLT"));
}
AbstractLyrics3 lyrics3Tag = mp3File.getLyrics3Tag();
if (lyrics3Tag != null)
{
System.out.println("Lyrics3 tag found.");
if (lyrics3Tag instanceof Lyrics3v2)
{
putRawValue(KEY_SONG_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TIT2"), rawProperties);
putRawValue(KEY_ARTIST, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TPE1"), rawProperties);
putRawValue(KEY_ALBUM_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TALB"), rawProperties);
putRawValue(KEY_COMMENT, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "COMM"), rawProperties);
putRawValue(KEY_LYRICS, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "SYLT"), rawProperties);
putRawValue(KEY_COMPOSER, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TCOM"), rawProperties);
}
}
}
catch(Exception e)
{
if (logger.isDebugEnabled())
{
logger.debug(
"MP3 Metadata extraction failed: \n" +
" Content: " + reader,
e);
}
else
{
logger.warn(
"MP3 Metadata extraction failed (turn on DEBUG for full error): \n" +
" Content: " + reader + "\n" +
" Failure: " + e.getMessage());
}
}
finally
{
tempFile.delete();
}
String description = getDescription(rawProperties);
if (description != null)
{
putRawValue(KEY_DESCRIPTION, description, rawProperties);
}
// Done
return rawProperties;
} }
/** /**
* Generate the description * Generate the description
* *
* @param props the properties extracted from the file * @param props the properties extracted from the file
* @return the description * @return the description
*/ */
private String getDescription(Map<String, Serializable> props) private String generateDescription(Metadata metadata)
{ {
StringBuilder result = new StringBuilder(); StringBuilder result = new StringBuilder();
if (props.get(KEY_SONG_TITLE) != null) if (metadata.get(Metadata.TITLE) != null)
{ {
result.append(props.get(KEY_SONG_TITLE)); result.append(metadata.get(Metadata.TITLE));
if (props.get(KEY_ALBUM_TITLE) != null) if (metadata.get(XMPDM.ALBUM) != null)
{ {
result result
.append(" - ") .append(" - ")
.append(props.get(KEY_ALBUM_TITLE)); .append(metadata.get(XMPDM.ALBUM));
} }
if (props.get(KEY_ARTIST) != null) if (metadata.get(XMPDM.ARTIST) != null)
{ {
result result
.append(" (") .append(" (")
.append(props.get(KEY_ARTIST)) .append(metadata.get(XMPDM.ARTIST))
.append(")"); .append(")");
} }
} }
return result.toString(); return result.toString();
} }
private String getLyrics3v2Value(Lyrics3v2 lyrics3Tag, String name)
{
String result = "";
Lyrics3v2Field field = lyrics3Tag.getField(name);
if (field != null)
{
AbstractMP3FragmentBody body = field.getBody();
if (body != null)
{
result = (String)body.getObject("Text");
}
}
return result;
}
/**
* Get the ID3V2 tag value in a safe way
*/
private String getID3V2Value(AbstractID3v2 id3v2, String name)
{
String result = "";
AbstractID3v2Frame frame = id3v2.getFrame(name);
if (frame != null)
{
AbstractMP3FragmentBody body = frame.getBody();
if (body != null)
{
result = (String)body.getObject("Text");
}
}
return result;
}
} }

View File

@@ -29,6 +29,9 @@ import java.util.Map;
import org.alfresco.repo.content.MimetypeMap; import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.ContentReader;
import org.apache.poi.hsmf.MAPIMessage; import org.apache.poi.hsmf.MAPIMessage;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
//import org.apache.tika.parser.microsoft.OutlookExtractor; // TODO fix import
/** /**
* Outlook MAPI format email meta-data extractor extracting the following values: * Outlook MAPI format email meta-data extractor extracting the following values:
@@ -63,10 +66,24 @@ public class MailMetadataExtracter extends TikaPoweredMetadataExtracter
{ {
super(SUPPORTED_MIMETYPES); super(SUPPORTED_MIMETYPES);
} }
@Override
protected Parser getParser() {
//return new OutlookExtractor(); // TODO fix import
return null;
}
@Override
protected Map<String, Serializable> extractSpecific(Metadata metadata,
Map<String, Serializable> properties) {
// TODO move things from extractRaw to here
return properties;
}
@Override @Override
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
{ {
// TODO remove this in favour of extractSpecific
final Map<String, Serializable> rawProperties = newRawMap(); final Map<String, Serializable> rawProperties = newRawMap();
InputStream is = null; InputStream is = null;

View File

@@ -18,12 +18,8 @@
*/ */
package org.alfresco.repo.content.metadata; package org.alfresco.repo.content.metadata;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map; import java.util.Map;
import org.alfresco.repo.content.MimetypeMap; import org.alfresco.repo.content.MimetypeMap;

View File

@@ -39,6 +39,7 @@ import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
* *
* Uses Apache Tika * Uses Apache Tika
* *
* @author Nick Burch
* @author Neil McErlean * @author Neil McErlean
*/ */
public class PoiMetadataExtracter extends TikaPoweredMetadataExtracter public class PoiMetadataExtracter extends TikaPoweredMetadataExtracter

View File

@@ -0,0 +1,73 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.metadata;
import java.util.ArrayList;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
/**
* A Metadata Extractor which makes use of the Apache
* Tika auto-detection to select the best parser
* to extract the metadata from your document.
* This will be used for all files which Tika can
* handle, but where no other more explicit
* extractor is defined.
* <pre>
* <b>author:</b> -- cm:author
* <b>title:</b> -- cm:title
* <b>subject:</b> -- cm:description
* <b>created:</b> -- cm:created
* <b>comments:</b>
* </pre>
*
* @author Nick Burch
*/
public class TikaAutoMetadataExtracter extends TikaPoweredMetadataExtracter
{
    protected static Log logger = LogFactory.getLog(TikaAutoMetadataExtracter.class);

    /**
     * A single, shared auto-detect parser. The Tika {@code Parser} contract
     * requires implementations to be thread-safe, and constructing an
     * {@link AutoDetectParser} is comparatively expensive (it discovers and
     * instantiates every available parser), so we build it once and reuse it
     * both for building the supported-mimetype list and for extraction.
     */
    private static final AutoDetectParser AUTO_DETECT_PARSER = new AutoDetectParser();

    /** Every mimetype that the Tika auto-detection parser can handle. */
    public static ArrayList<String> SUPPORTED_MIMETYPES;
    static {
        SUPPORTED_MIMETYPES = new ArrayList<String>();
        for(MediaType mt : AUTO_DETECT_PARSER.getParsers().keySet()) {
            SUPPORTED_MIMETYPES.add( mt.toString() );
        }
    }

    public TikaAutoMetadataExtracter()
    {
        super(SUPPORTED_MIMETYPES);
    }

    /**
     * Does auto-detection to select the best Tika
     * Parser for the supplied document.
     */
    @Override
    protected Parser getParser() {
        return AUTO_DETECT_PARSER;
    }
}

View File

@@ -0,0 +1,18 @@
#
# TikaAutoMetadataExtracter - default mapping
#
# This is used to map from the Tika and standard namespaces
# onto your content model. This will be used for any
# content for which an explicit extractor isn't defined,
# by using Tika's auto-selection facilities.
#
# author: Nick Burch
# Namespaces
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
# Mappings
author=cm:author
title=cm:title
description=cm:description
created=cm:created

View File

@@ -0,0 +1,185 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.metadata;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.Serializable;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.namespace.QName;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.dwg.DWGParser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.apache.tika.parser.mp3.Mp3Parser;
import org.apache.tika.parser.odf.OpenDocumentParser;
/**
* @see TikaAutoMetadataExtracter
*
* @author Nick Burch
*/
/**
 * Tests the Tika auto-detecting metadata extractor against a range of
 * sample documents, and checks that raw Tika metadata keys can also be
 * mapped onto content-model properties.
 *
 * @see TikaAutoMetadataExtracter
 *
 * @author Nick Burch
 */
public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
{
    private TikaAutoMetadataExtracter extracter;

    // A property mapped from the raw Tika "Content-Type" metadata key, used
    // to prove that non-standard (Tika namespace) keys can be mapped too
    private static final QName TIKA_MIMETYPE_TEST_PROPERTY =
        QName.createQName("TikaMimeTypeTestProp");

    @Override
    public void setUp() throws Exception
    {
        super.setUp();
        extracter = new TikaAutoMetadataExtracter();
        extracter.setDictionaryService(dictionaryService);
        extracter.register();

        // Attach some extra mappings, using the Tika
        //  metadata keys namespace
        // These will be tested later
        HashMap<String, Set<QName>> newMap = new HashMap<String, Set<QName>>(
                extracter.getMapping()
        );

        Set<QName> tlaSet = new HashSet<QName>();
        tlaSet.add(TIKA_MIMETYPE_TEST_PROPERTY);
        newMap.put( Metadata.CONTENT_TYPE, tlaSet );

        extracter.setMapping(newMap);
    }

    /**
     * @return Returns the same transformer regardless - it is allowed
     */
    protected MetadataExtracter getExtracter()
    {
        return extracter;
    }

    /**
     * Checks that every mimetype handled by a sample of well-known Tika
     * parsers is reported as supported by the auto-detect extractor.
     */
    public void testSupports() throws Exception
    {
        ArrayList<String> mimeTypes = new ArrayList<String>();
        for (Parser p : new Parser[] {
                 new OfficeParser(), new OpenDocumentParser(),
                 new Mp3Parser(), new OOXMLParser()
        }) {
            Set<MediaType> mts = p.getSupportedTypes(new ParseContext());
            for (MediaType mt : mts)
            {
                mimeTypes.add(mt.toString());
            }
        }

        for (String mimetype : mimeTypes)
        {
            boolean supports = extracter.isSupported(mimetype);
            assertTrue("Mimetype should be supported: " + mimetype, supports);
        }
    }

    /**
     * Test several different files
     * Note - doesn't use extractFromMimetype
     */
    public void testSupportedMimetypes() throws Exception
    {
        String[] testFiles = new String[] {
              ".doc", ".docx", ".xls", ".xlsx",
              ".ppt", ".pptx",
              //".vsd", // Not auto-detected properly yet
              //"2010.dwg", // Not auto-detected properly yet
              ".pdf",
              ".odt"
        };

        for (String fileBase : testFiles)
        {
            String filename = "quick" + fileBase;
            URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename);
            File file = new File(url.getFile());

            // Cheat and ask Tika for the mime type!
            AutoDetectParser ap = new AutoDetectParser();
            Metadata metadata = new Metadata();
            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
            // Make sure the stream is always closed, even if detection fails
            BufferedInputStream is = new BufferedInputStream(new FileInputStream(file));
            MediaType mt;
            try
            {
                mt = ap.getDetector().detect(is, metadata);
            }
            finally
            {
                is.close();
            }
            String mimetype = mt.toString();

            // Have it processed
            Map<QName, Serializable> properties = extractFromFile(file, mimetype);

            // check we got something
            assertFalse("extractFromMimetype should return at least some properties, " +
               "none found for " + mimetype + " - " + filename,
               properties.isEmpty());

            // check common metadata
            testCommonMetadata(mimetype, properties);
            // check file-type specific metadata
            testFileSpecificMetadata(mimetype, properties);
        }
    }

    @Override
    protected boolean skipAuthorCheck(String mimetype) { return true; }

    /**
     * We also provide the creation date - check that
     */
    protected void testFileSpecificMetadata(String mimetype,
         Map<QName, Serializable> properties) {

        // Check for extra fields
        // Author isn't there for the OpenDocument ones
        if(mimetype.indexOf(".oasis.") == -1) {
            assertEquals(
                 "Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
                 "Nevin Nollop",
                 DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
        }

        // Ensure that we can also get things which are standard
        //  Tika metadata properties, if we so choose to
        assertTrue(
              "Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " not found for mimetype " + mimetype,
              properties.containsKey(TIKA_MIMETYPE_TEST_PROPERTY)
        );

        // TODO - uncomment this when TIKA-391 is properly fixed
        //        (detection should yield the exact mimetype we asked for)
        //  assertEquals(
        //        "Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
        //        mimetype,
        //        DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_MIMETYPE_TEST_PROPERTY)));
    }
}

View File

@@ -35,6 +35,7 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser; import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.BodyContentHandler;
@@ -136,14 +137,13 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
} }
/** /**
* Does auto-detection to select the best Tika * Returns the correct Tika Parser to process
* Parser. * the document.
* Implementations can override this if they * If you don't know which you want, use
* know their specific implementations. * {@link TikaAutoMetadataExtracter} which
* makes use of the Tika auto-detection.
*/ */
protected Parser getParser() { protected abstract Parser getParser();
return null;
}
/** /**
* Allows implementation specific mappings * Allows implementation specific mappings

View File

@@ -1,13 +0,0 @@
#
# TikaPoweredMetadataExtracter - default mapping
#
# author: Nick Burch
# Namespaces
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
# Mappings
author=cm:author
title=cm:title
description=cm:description
created=cm:created