Update the MP3 extractor to output audio keys (related to ALF-6170), and refactor the audio extractors to share more common code. Also expands the audio extractor tests to share common code, and test more metadata. (Needed for devcon demo)

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@31013 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2011-10-06 16:17:06 +00:00
parent e180abafe4
commit 9f60d2b246
11 changed files with 438 additions and 88 deletions

View File

@@ -224,7 +224,6 @@
<bean id="extracter.Office" class="org.alfresco.repo.content.metadata.OfficeMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.Mail" class="org.alfresco.repo.content.metadata.MailMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.Html" class="org.alfresco.repo.content.metadata.HtmlMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.MP3" class="org.alfresco.repo.content.metadata.MP3MetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.OpenDocument" class="org.alfresco.repo.content.metadata.OpenDocumentMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.DWG" class="org.alfresco.repo.content.metadata.DWGMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.RFC822" class="org.alfresco.repo.content.metadata.RFC822MetadataExtracter" parent="baseMetadataExtracter" >
@@ -235,6 +234,12 @@
</list>
</property>
</bean>
<bean id="extracter.MP3" class="org.alfresco.repo.content.metadata.MP3MetadataExtracter" parent="baseMetadataExtracter">
<property name="tikaConfig" ref="tikaConfig"/>
</bean>
<bean id="extracter.Audio" class="org.alfresco.repo.content.metadata.TikaAudioMetadataExtracter" parent="baseMetadataExtracter">
<property name="tikaConfig" ref="tikaConfig"/>
</bean>
<bean id="extracter.OpenOffice" class="org.alfresco.repo.content.metadata.OpenOfficeMetadataExtracter" parent="baseMetadataExtracter">
<property name="worker">
<ref bean="extracter.worker.OpenOffice" />

View File

@@ -662,7 +662,16 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
// Ask Tika to detect the document, and report back on if
// the current mime type is plausible
String typeErrorMessage = null;
String differentType = mimetypeService.getMimetypeIfNotMatches(reader.getReader());
String differentType = null;
if(mimetypeService != null)
{
differentType = mimetypeService.getMimetypeIfNotMatches(reader.getReader());
}
else
{
logger.info("Unable to verify mimetype of " + reader.getReader() +
" as no MimetypeService available to " + getClass().getName());
}
if(differentType != null)
{
typeErrorMessage = "\n" +

View File

@@ -31,38 +31,35 @@ import org.apache.tika.parser.mp3.Mp3Parser;
/**
* Extracts the following values from MP3 files:
* <pre>
* <b>songTitle:</b> -- {music}songTitle, cm:title
* <b>albumTitle:</b> -- {music}albumTitle
* <b>artist:</b> -- {music}artist, cm:author
* <b>songTitle:</b> -- cm:title
* <b>albumTitle:</b> -- audio:album
* <b>artist:</b> -- audio:artist, cm:author
* <b>description:</b> -- cm:description
* <b>comment:</b> -- {music}comment
* <b>yearReleased:</b> -- {music}yearReleased
* <b>trackNumber:</b> -- {music}trackNumber
* <b>genre:</b> -- {music}genre
* <b>composer:</b> -- {music}composer
* <b>lyrics:</b> -- {music}lyrics
* <b>comment:</b> --
* <b>yearReleased:</b> -- audio:releaseDate
* <b>trackNumber:</b> -- audio:trackNumber
* <b>genre:</b> -- audio:genre
* <b>composer:</b> -- audio:composer
* <b>lyrics:</b> --
* </pre>
*
* TODO Get hold of a mp3 file with some lyrics in it, so we
* can contribute the patch to Tika
* Note - XMPDM metadata keys are also emitted, in common with
* the other Tika powered extracters
*
* Uses Apache Tika
*
* @author Nick Burch
* @author Roy Wetherall
*/
public class MP3MetadataExtracter extends TikaPoweredMetadataExtracter
public class MP3MetadataExtracter extends TikaAudioMetadataExtracter
{
private static final String KEY_SONG_TITLE = "songTitle";
private static final String KEY_ALBUM_TITLE = "albumTitle";
private static final String KEY_ARTIST = "artist";
private static final String KEY_DESCRIPTION = "description";
private static final String KEY_COMMENT = "comment";
private static final String KEY_YEAR_RELEASED = "yearReleased";
private static final String KEY_TRACK_NUMBER = "trackNumber";
private static final String KEY_GENRE = "genre";
private static final String KEY_COMPOSER = "composer";
private static final String KEY_LYRICS = "lyrics";
public static ArrayList<String> SUPPORTED_MIMETYPES = buildSupportedMimetypes(
new String[] { MimetypeMap.MIMETYPE_MP3 },
@@ -82,6 +79,12 @@ public class MP3MetadataExtracter extends TikaPoweredMetadataExtracter
@Override
protected Map<String, Serializable> extractSpecific(Metadata metadata,
Map<String, Serializable> properties, Map<String,String> headers) {
// Do the normal Audio mappings
super.extractSpecific(metadata, properties, headers);
// Now do the compatibility ones
// We only need these for people who had pre-existing mapping
// properties from before the proper audio model was added
putRawValue(KEY_ALBUM_TITLE, metadata.get(XMPDM.ALBUM), properties);
putRawValue(KEY_SONG_TITLE, metadata.get(Metadata.TITLE), properties);
putRawValue(KEY_ARTIST, metadata.get(XMPDM.ARTIST), properties);
@@ -90,41 +93,8 @@ public class MP3MetadataExtracter extends TikaPoweredMetadataExtracter
putRawValue(KEY_GENRE, metadata.get(XMPDM.GENRE), properties);
putRawValue(KEY_YEAR_RELEASED, metadata.get(XMPDM.RELEASE_DATE), properties);
putRawValue(KEY_COMPOSER, metadata.get(XMPDM.COMPOSER), properties);
// TODO lyrics
//putRawValue(KEY_LYRICS, getLyrics(), properties);
putRawValue(KEY_DESCRIPTION, generateDescription(metadata), properties);
// All done
return properties;
}
/**
* Generate the description
*
* @param props the properties extracted from the file
* @return the description
*/
private String generateDescription(Metadata metadata)
{
StringBuilder result = new StringBuilder();
if (metadata.get(Metadata.TITLE) != null)
{
result.append(metadata.get(Metadata.TITLE));
if (metadata.get(XMPDM.ALBUM) != null)
{
result
.append(" - ")
.append(metadata.get(XMPDM.ALBUM));
}
if (metadata.get(XMPDM.ARTIST) != null)
{
result
.append(" (")
.append(metadata.get(XMPDM.ARTIST))
.append(")");
}
}
return result.toString();
}
}

View File

@@ -5,8 +5,26 @@
# Namespaces
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
namespace.prefix.audio=http://www.alfresco.org/model/audio/1.0
# Mappings
songTitle=cm:title
artist=cm:author
# Core mappings
author=cm:author
title=cm:title
description=cm:description
created=cm:created
# Audio descriptive mappings
xmpDM\:album=audio:album
xmpDM\:artist=audio:artist
xmpDM\:composer=audio:composer
xmpDM\:engineer=audio:engineer
xmpDM\:genre=audio:genre
xmpDM\:trackNumber=audio:trackNumber
xmpDM\:releaseDate=audio:releaseDate
#xmpDM:logComment
# Audio specific mappings
xmpDM\:audioSampleRate=audio:sampleRate
xmpDM\:audioSampleType=audio:sampleType
xmpDM\:audioChannelType=audio:channelType
xmpDM\:audioCompressor=audio:compressor

View File

@@ -29,17 +29,15 @@ import org.alfresco.service.namespace.QName;
/**
* Test for the MP3 metadata extraction from id3 tags.
*/
public class MP3MetadataExtracterTest extends AbstractMetadataExtracterTest
public class MP3MetadataExtracterTest extends TikaAudioMetadataExtracterTest
{
private MP3MetadataExtracter extracter;
private static final String ARTIST = "Hauskaz";
@Override
public void setUp() throws Exception
{
super.setUp();
extracter = new MP3MetadataExtracter();
extracter.setDictionaryService(dictionaryService);
extracter = (MP3MetadataExtracter)ctx.getBean("extracter.MP3");
extracter.register();
}
@@ -64,6 +62,10 @@ public class MP3MetadataExtracterTest extends AbstractMetadataExtracterTest
{
testExtractFromMimetype(MimetypeMap.MIMETYPE_MP3);
}
@Override
public void testOggExtraction() throws Exception {}
@Override
public void testFlacExtraction() throws Exception {}
/**
* We don't have quite the usual metadata. Tests the descriptions one.
@@ -93,23 +95,6 @@ public class MP3MetadataExtracterTest extends AbstractMetadataExtracterTest
* Tests for various MP3 specific bits of metadata
*/
public void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties) {
// Pending ALF-6170 for proper music namespace
// QName songTitle = QName.createQName("music","songTitle");
// assertEquals(
// "Property " + songTitle + " not found for mimetype " + mimetype,
// QUICK_TITLE,
// DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(songTitle)));
//
// QName songArtist = QName.createQName("music","artist");
// assertEquals(
// "Property " + songArtist + " not found for mimetype " + mimetype,
// ARTIST,
// DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(songArtist)));
// Description is a composite - check the artist part
assertContains(
"Property " + ContentModel.PROP_DESCRIPTION + " didn't contain " + ARTIST + " for mimetype " + mimetype,
ARTIST,
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION)));
super.testFileSpecificMetadata(mimetype, properties);
}
}

View File

@@ -0,0 +1,172 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.Map;
import java.util.regex.Pattern;
import org.alfresco.repo.content.MimetypeMap;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.XMPDM;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.Parser;
import org.gagravarr.tika.FlacParser;
import org.gagravarr.tika.VorbisParser;
/**
 * A Metadata Extractor which makes use of the Apache
 *  Tika Audio Parsers to extract metadata from your
 *  media files.
 * For backwards compatibility reasons, this doesn't
 *  handle the MP3 format, which has its own dedicated
 *  extractor in {@link MP3MetadataExtracter}
 *
 * <pre>
 *   <b>author:</b>                 --      cm:author
 *   <b>title:</b>                  --      cm:title
 *   <b>description:</b>            --      cm:description
 *   <b>created:</b>                --      cm:created
 *   <b>xmpDM:album</b>             --      audio:album
 *   <b>xmpDM:artist</b>            --      audio:artist
 *   <b>xmpDM:composer</b>          --      audio:composer
 *   <b>xmpDM:engineer</b>          --      audio:engineer
 *   <b>xmpDM:genre</b>             --      audio:genre
 *   <b>xmpDM:trackNumber</b>       --      audio:trackNumber
 *   <b>xmpDM:releaseDate</b>       --      audio:releaseDate
 * </pre>
 *
 * @author Nick Burch
 */
public class TikaAudioMetadataExtracter extends TikaPoweredMetadataExtracter
{
    protected static final String KEY_LYRICS = "lyrics";

    /** Matches a release date that is nothing more than a four digit year. */
    private static final Pattern YEAR_ONLY = Pattern.compile("\\d\\d\\d\\d");

    /** The Tika parsers this extracter delegates to. */
    private static Parser[] parsers = new Parser[] {
        new VorbisParser(),
        new FlacParser()
    };

    public static ArrayList<String> SUPPORTED_MIMETYPES = buildSupportedMimetypes(
        new String[] { MimetypeMap.MIMETYPE_VORBIS, MimetypeMap.MIMETYPE_FLAC },
        parsers
    );

    /** Supplies the media type registry handed to the composite parser. */
    protected TikaConfig tikaConfig;

    /**
     * Injects the Tika configuration used by {@link #getParser()}.
     *
     * @param tikaConfig the Tika configuration to use
     */
    public void setTikaConfig(TikaConfig tikaConfig)
    {
        this.tikaConfig = tikaConfig;
    }

    /**
     * Creates an extracter for {@link #SUPPORTED_MIMETYPES}.
     */
    public TikaAudioMetadataExtracter()
    {
        this(SUPPORTED_MIMETYPES);
    }

    /**
     * Creates an extracter for an explicit list of mimetypes.
     *
     * @param supportedMimeTypes the mimetypes this extracter should claim
     */
    public TikaAudioMetadataExtracter(ArrayList<String> supportedMimeTypes)
    {
        super(supportedMimeTypes);
    }

    /**
     * Builds a composite parser over the audio parsers, using the media
     *  type registry of the injected Tika configuration.
     *
     * @return the parser to run the content through
     * @throws IllegalStateException if no Tika configuration has been set
     */
    @Override
    protected Parser getParser()
    {
        if (tikaConfig == null)
        {
            // Fail with something diagnosable rather than a bare NullPointerException
            throw new IllegalStateException(
                    "No TikaConfig has been set on " + getClass().getName() +
                    ", please set the 'tikaConfig' property");
        }
        return new CompositeParser(
            tikaConfig.getMediaTypeRegistry(), parsers
        );
    }

    /**
     * Adds the values which need special handling: the composite
     *  description, and the release date (also exposed as the created date).
     *
     * @param metadata the metadata produced by Tika
     * @param properties the raw properties being built up
     * @param headers not used by this implementation
     * @return the supplied <tt>properties</tt> map
     */
    @Override
    protected Map<String, Serializable> extractSpecific(Metadata metadata,
            Map<String, Serializable> properties, Map<String,String> headers)
    {
        // Most things can go with the default Tika -> Alfresco Mapping
        // Handle the few special cases here

        // The description is special
        putRawValue(KEY_DESCRIPTION, generateDescription(metadata), properties);

        // The release date can be fiddly
        Date releaseDate = generateReleaseDate(metadata);
        putRawValue(KEY_CREATED, releaseDate, properties);
        putRawValue(XMPDM.RELEASE_DATE.getName(), releaseDate, properties);

        // TODO Get the Lyrics from the content
        //putRawValue(KEY_LYRICS, getLyrics(), properties);

        // All done
        return properties;
    }

    /**
     * Generates the release date.
     * A value which is only a four digit year becomes midnight on the
     *  1st of January of that year, in the JVM default time zone. Anything
     *  else is handed to <tt>makeDate</tt>.
     *
     * @param metadata the metadata produced by Tika
     * @return the release date, or <tt>null</tt> if none was reported
     */
    private Date generateReleaseDate(Metadata metadata)
    {
        String date = metadata.get(XMPDM.RELEASE_DATE);
        if (date == null || date.length() == 0)
        {
            return null;
        }

        // Is it just a year?
        if (YEAR_ONLY.matcher(date).matches())
        {
            // Just a year, we need a full date
            // Go for the 1st of the 1st
            // Be explicit about the calendar system - Calendar.getInstance()
            //  may hand back a non-Gregorian calendar for some default
            //  locales, which would put the year in the wrong era
            Calendar c = new GregorianCalendar(
                    Integer.parseInt(date), Calendar.JANUARY, 1,
                    0, 0, 0
            );
            c.set(Calendar.MILLISECOND, 0);
            return c.getTime();
        }

        // Treat as a normal date
        return makeDate(date);
    }

    /**
     * Generate the description, of the form <tt>title - album (artist)</tt>.
     * The album and artist parts are left out when not available, and
     *  the description is empty if there is no title.
     *
     * @param metadata the metadata extracted from the file
     * @return the description
     */
    private String generateDescription(Metadata metadata)
    {
        StringBuilder result = new StringBuilder();
        String title = metadata.get(Metadata.TITLE);
        if (title != null)
        {
            result.append(title);

            String album = metadata.get(XMPDM.ALBUM);
            if (album != null)
            {
                result.append(" - ").append(album);
            }

            String artist = metadata.get(XMPDM.ARTIST);
            if (artist != null)
            {
                result.append(" (").append(artist).append(")");
            }
        }
        return result.toString();
    }
}

View File

@@ -0,0 +1,34 @@
#
# TikaAudioMetadataExtracter - audio mapping
#
# This is used to map from the Tika audio metadata onto your
# content model. This will be used for any Audio content
# for which an explicit extractor isn't defined
#
# author: Nick Burch
# Namespaces
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
namespace.prefix.audio=http://www.alfresco.org/model/audio/1.0
# Core mappings
author=cm:author
title=cm:title
description=cm:description
created=cm:created
# Audio descriptive mappings
xmpDM\:album=audio:album
xmpDM\:artist=audio:artist
xmpDM\:composer=audio:composer
xmpDM\:engineer=audio:engineer
xmpDM\:genre=audio:genre
xmpDM\:trackNumber=audio:trackNumber
xmpDM\:releaseDate=audio:releaseDate
#xmpDM:logComment
# Audio specific mappings
xmpDM\:audioSampleRate=audio:sampleRate
xmpDM\:audioSampleType=audio:sampleType
xmpDM\:audioChannelType=audio:channelType
xmpDM\:audioCompressor=audio:compressor

View File

@@ -0,0 +1,139 @@
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.Map;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.namespace.NamespaceService;
import org.alfresco.service.namespace.QName;
/**
 * Exercises the Tika powered audio metadata extraction against the
 *  Ogg Vorbis and FLAC quick test files.
 */
public class TikaAudioMetadataExtracterTest extends AbstractMetadataExtracterTest
{
    private static final String ARTIST = "Hauskaz";
    private static final String ALBUM = "About a dog and a fox";
    private static final String GENRE = "Foxtrot";

    private TikaAudioMetadataExtracter extracter;

    @Override
    public void setUp() throws Exception
    {
        super.setUp();

        // Use the Spring configured extracter
        extracter = (TikaAudioMetadataExtracter)ctx.getBean("extracter.Audio");
        extracter.register();
    }

    /**
     * @return the audio extracter under test, always the same instance
     */
    protected MetadataExtracter getExtracter()
    {
        return extracter;
    }

    public void testSupports() throws Exception
    {
        for (String mimetype : TikaAudioMetadataExtracter.SUPPORTED_MIMETYPES)
        {
            assertTrue(
                    "Mimetype should be supported: " + mimetype,
                    extracter.isSupported(mimetype));
        }
    }

    public void testOggExtraction() throws Exception
    {
        testExtractFromMimetype(MimetypeMap.MIMETYPE_VORBIS);
    }

    public void testFlacExtraction() throws Exception
    {
        testExtractFromMimetype(MimetypeMap.MIMETYPE_FLAC);
    }

    /**
     * Audio files don't carry quite the usual metadata, so only the
     *  title, author and composite description are checked here.
     * The rest is covered by {@link #testFileSpecificMetadata(String, Map)}
     */
    protected void testCommonMetadata(String mimetype, Map<QName, Serializable> properties)
    {
        // Title is as normal
        assertTextPropertyEquals(ContentModel.PROP_TITLE, QUICK_TITLE, mimetype, properties);

        // Has Author, not Creator, and is different
        assertTextPropertyEquals(ContentModel.PROP_AUTHOR, ARTIST, mimetype, properties);

        // Description is a composite
        assertDescriptionContains(QUICK_TITLE, mimetype, properties);

        // Check rest of it later
    }

    /**
     * Tests for various Audio specific bits of metadata
     */
    public void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties)
    {
        assertTextPropertyEquals(audioQName("album"), ALBUM, mimetype, properties);
        assertTextPropertyEquals(audioQName("artist"), ARTIST, mimetype, properties);
        assertTextPropertyEquals(audioQName("genre"), GENRE, mimetype, properties);
        assertTextPropertyEquals(audioQName("releaseDate"), "2009-01-01T00:00:00.000Z", mimetype, properties);
        assertTextPropertyEquals(audioQName("channelType"), "Stereo", mimetype, properties);

        // Description is a composite - check the artist part
        assertDescriptionContains(ARTIST, mimetype, properties);
    }

    /**
     * Builds a property name in the audio model namespace.
     */
    private QName audioQName(String localName)
    {
        return QName.createQName(NamespaceService.AUDIO_MODEL_1_0_URI, localName);
    }

    /**
     * Fetches a property and converts it to its String form.
     */
    private String propertyAsText(QName property, Map<QName, Serializable> properties)
    {
        return DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(property));
    }

    /**
     * Asserts that the String form of a property has exactly the expected value.
     */
    private void assertTextPropertyEquals(QName property, String expected,
            String mimetype, Map<QName, Serializable> properties)
    {
        assertEquals(
                "Property " + property + " not found for mimetype " + mimetype,
                expected,
                propertyAsText(property, properties));
    }

    /**
     * Asserts that the composite description includes the given fragment.
     */
    private void assertDescriptionContains(String fragment,
            String mimetype, Map<QName, Serializable> properties)
    {
        assertContains(
                "Property " + ContentModel.PROP_DESCRIPTION + " didn't contain " + fragment + " for mimetype " + mimetype,
                fragment,
                propertyAsText(ContentModel.PROP_DESCRIPTION, properties));
    }
}

View File

@@ -11,6 +11,7 @@
# Namespaces
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
namespace.prefix.exif=http://www.alfresco.org/model/exif/1.0
namespace.prefix.audio=http://www.alfresco.org/model/audio/1.0
# Mappings
author=cm:author
@@ -36,3 +37,16 @@ exif\:FNumber=exif:fNumber
exif\:FocalLength=exif:focalLength
exif\:IsoSpeedRatings=exif:isoSpeedRatings
exif\:DateTimeOriginal=exif:dateTimeOriginal
xmpDM\:album=audio:album
xmpDM\:artist=audio:artist
xmpDM\:composer=audio:composer
xmpDM\:engineer=audio:engineer
xmpDM\:genre=audio:genre
xmpDM\:trackNumber=audio:trackNumber
xmpDM\:releaseDate=audio:releaseDate
#xmpDM:logComment
xmpDM\:audioSampleRate=audio:sampleRate
xmpDM\:audioSampleType=audio:sampleType
xmpDM\:audioChannelType=audio:channelType
xmpDM\:audioCompressor=audio:compressor

View File

@@ -124,7 +124,6 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
//"2010.dwg", // Not auto-detected properly yet
".pdf",
".odt",
".ogg"
};
for (String fileBase : testFiles)

View File

@@ -88,14 +88,16 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
* Builds up a list of supported mime types by merging an explicit
* list with any that Tika also claims to support
*/
protected static ArrayList<String> buildSupportedMimetypes(String[] explicitTypes, Parser tikaParser) {
protected static ArrayList<String> buildSupportedMimetypes(String[] explicitTypes, Parser... tikaParsers) {
ArrayList<String> types = new ArrayList<String>();
for(String type : explicitTypes) {
if(!types.contains(type)) {
types.add(type);
}
}
if(tikaParser != null) {
if(tikaParsers != null) {
for(Parser tikaParser : tikaParsers)
{
for(MediaType mt : tikaParser.getSupportedTypes(new ParseContext())) {
String type = mt.toString();
if(!types.contains(type)) {
@@ -103,6 +105,7 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
}
}
}
}
return types;
}
@@ -225,9 +228,11 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
{
is = getInputStream(reader);
Parser parser = getParser();
Metadata metadata = new Metadata();
ParseContext context = new ParseContext();
Metadata metadata = new Metadata();
metadata.add(Metadata.CONTENT_TYPE, reader.getMimetype());
ContentHandler handler;
Map<String,String> headers = null;
if(needHeaderContents()) {