Tika for metadata extraction

Convert some more metadata extractors to use Tika, and enable the
 Tika auto-detection parser for any documents without an explicitly
 defined extractor.


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20667 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-06-16 14:09:46 +00:00
parent b08d9ff412
commit 0e19812dbc
11 changed files with 354 additions and 184 deletions

View File

@@ -18,23 +18,15 @@
*/
package org.alfresco.repo.content.metadata;
import java.io.File;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
import java.util.ArrayList;
import java.util.Map;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.util.TempFileProvider;
import org.farng.mp3.AbstractMP3FragmentBody;
import org.farng.mp3.MP3File;
import org.farng.mp3.id3.AbstractID3v2;
import org.farng.mp3.id3.AbstractID3v2Frame;
import org.farng.mp3.id3.ID3v1;
import org.farng.mp3.lyrics3.AbstractLyrics3;
import org.farng.mp3.lyrics3.Lyrics3v2;
import org.farng.mp3.lyrics3.Lyrics3v2Field;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.XMPDM;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.mp3.Mp3Parser;
/**
* Extracts the following values from MP3 files:
@@ -51,13 +43,15 @@ import org.farng.mp3.lyrics3.Lyrics3v2Field;
* <b>lyrics:</b> -- {music}lyrics
* </pre>
*
* TIKA Note - title and author go in metadata, but much of the
* rest is only in the text. Some of the ID3v2 parts
* (composer, lyrics) are not yet implemented.
* TODO Get hold of an MP3 file with some lyrics in it, so we
* can contribute the patch to Tika
*
* Uses Apache Tika
*
* @author Nick Burch
* @author Roy Wetherall
*/
public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
public class MP3MetadataExtracter extends TikaPoweredMetadataExtracter
{
private static final String KEY_SONG_TITLE = "songTitle";
private static final String KEY_ALBUM_TITLE = "albumTitle";
@@ -70,173 +64,67 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
private static final String KEY_COMPOSER = "composer";
private static final String KEY_LYRICS = "lyrics";
public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_MP3 };
public static ArrayList<String> SUPPORTED_MIMETYPES = buildSupportedMimetypes(
new String[] { MimetypeMap.MIMETYPE_MP3 },
new Mp3Parser()
);
public MP3MetadataExtracter()
{
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
super(SUPPORTED_MIMETYPES);
}
/**
 * Supplies the Tika parser for this extracter.
 *
 * @return a newly created {@link Mp3Parser} on every call
 */
@Override
protected Parser getParser()
{
    Parser mp3Parser = new Mp3Parser();
    return mp3Parser;
}
@Override
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
{
Map<String, Serializable> rawProperties = newRawMap();
// Create a temp file
File tempFile = TempFileProvider.createTempFile("MP3MetadataExtracter_", ".tmp");
try
{
reader.getContent(tempFile);
// Create the MP3 object from the file
// Open it read only as we won't make any changes
MP3File mp3File = new MP3File(tempFile, false);
ID3v1 id3v1 = mp3File.getID3v1Tag();
if (id3v1 != null)
{
putRawValue(KEY_ALBUM_TITLE, id3v1.getAlbum(), rawProperties);
putRawValue(KEY_SONG_TITLE, id3v1.getTitle(), rawProperties);
putRawValue(KEY_ARTIST, id3v1.getArtist(), rawProperties);
putRawValue(KEY_COMMENT, id3v1.getComment(), rawProperties);
putRawValue(KEY_YEAR_RELEASED, id3v1.getYear(), rawProperties);
// TODO sort out the genre
//putRawValue(MusicModel.KEY_GENRE, id3v1.getGenre());
// TODO sort out the size
//putRawValue(MusicModel.KEY_SIZE, id3v1.getSize());
}
AbstractID3v2 id3v2 = mp3File.getID3v2Tag();
if (id3v2 != null)
{
putRawValue(KEY_SONG_TITLE, getID3V2Value(id3v2, "TIT2"), rawProperties);
putRawValue(KEY_ARTIST, getID3V2Value(id3v2, "TPE1"), rawProperties);
putRawValue(KEY_ALBUM_TITLE, getID3V2Value(id3v2, "TALB"), rawProperties);
putRawValue(KEY_YEAR_RELEASED, getID3V2Value(id3v2, "TDRC"), rawProperties);
putRawValue(KEY_COMMENT, getID3V2Value(id3v2, "COMM"), rawProperties);
putRawValue(KEY_TRACK_NUMBER, getID3V2Value(id3v2, "TRCK"), rawProperties);
putRawValue(KEY_GENRE, getID3V2Value(id3v2, "TCON"), rawProperties);
putRawValue(KEY_COMPOSER, getID3V2Value(id3v2, "TCOM"), rawProperties);
// TODO sort out the lyrics
//System.out.println("Lyrics: " + getID3V2Value(id3v2, "SYLT"));
//System.out.println("Lyrics: " + getID3V2Value(id3v2, "USLT"));
}
AbstractLyrics3 lyrics3Tag = mp3File.getLyrics3Tag();
if (lyrics3Tag != null)
{
System.out.println("Lyrics3 tag found.");
if (lyrics3Tag instanceof Lyrics3v2)
{
putRawValue(KEY_SONG_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TIT2"), rawProperties);
putRawValue(KEY_ARTIST, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TPE1"), rawProperties);
putRawValue(KEY_ALBUM_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TALB"), rawProperties);
putRawValue(KEY_COMMENT, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "COMM"), rawProperties);
putRawValue(KEY_LYRICS, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "SYLT"), rawProperties);
putRawValue(KEY_COMPOSER, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TCOM"), rawProperties);
}
}
}
catch(Exception e)
{
if (logger.isDebugEnabled())
{
logger.debug(
"MP3 Metadata extraction failed: \n" +
" Content: " + reader,
e);
}
else
{
logger.warn(
"MP3 Metadata extraction failed (turn on DEBUG for full error): \n" +
" Content: " + reader + "\n" +
" Failure: " + e.getMessage());
}
}
finally
{
tempFile.delete();
}
String description = getDescription(rawProperties);
if (description != null)
{
putRawValue(KEY_DESCRIPTION, description, rawProperties);
}
// Done
return rawProperties;
protected Map<String, Serializable> extractSpecific(Metadata metadata,
Map<String, Serializable> properties) {
putRawValue(KEY_ALBUM_TITLE, metadata.get(XMPDM.ALBUM), properties);
putRawValue(KEY_SONG_TITLE, metadata.get(Metadata.TITLE), properties);
putRawValue(KEY_ARTIST, metadata.get(XMPDM.ARTIST), properties);
putRawValue(KEY_COMMENT, metadata.get(XMPDM.LOG_COMMENT), properties);
putRawValue(KEY_TRACK_NUMBER, metadata.get(XMPDM.TRACK_NUMBER), properties);
putRawValue(KEY_GENRE, metadata.get(XMPDM.GENRE), properties);
putRawValue(KEY_YEAR_RELEASED, metadata.get(XMPDM.RELEASE_DATE), properties);
putRawValue(KEY_COMPOSER, metadata.get(XMPDM.COMPOSER), properties);
// TODO lyrics
//putRawValue(KEY_LYRICS, getLyrics(), properties);
putRawValue(KEY_DESCRIPTION, generateDescription(metadata), properties);
return properties;
}
/**
* Generate the description
*
* @param props the properties extracted from the file
* @return the description
*/
private String getDescription(Map<String, Serializable> props)
private String generateDescription(Metadata metadata)
{
StringBuilder result = new StringBuilder();
if (props.get(KEY_SONG_TITLE) != null)
if (metadata.get(Metadata.TITLE) != null)
{
result.append(props.get(KEY_SONG_TITLE));
if (props.get(KEY_ALBUM_TITLE) != null)
result.append(metadata.get(Metadata.TITLE));
if (metadata.get(XMPDM.ALBUM) != null)
{
result
.append(" - ")
.append(props.get(KEY_ALBUM_TITLE));
.append(metadata.get(XMPDM.ALBUM));
}
if (props.get(KEY_ARTIST) != null)
if (metadata.get(XMPDM.ARTIST) != null)
{
result
.append(" (")
.append(props.get(KEY_ARTIST))
.append(metadata.get(XMPDM.ARTIST))
.append(")");
}
}
return result.toString();
}
/**
 * Reads the "Text" object of the named field from a Lyrics3v2 tag.
 *
 * @param lyrics3Tag the Lyrics3v2 tag to query
 * @param name the name of the field to look up
 * @return the field body's "Text" object cast to a String, or an empty
 *         string when the field or its body is absent
 */
private String getLyrics3v2Value(Lyrics3v2 lyrics3Tag, String name)
{
    Lyrics3v2Field field = lyrics3Tag.getField(name);
    // A missing field is treated the same way as a field without a body
    AbstractMP3FragmentBody body = (field == null) ? null : field.getBody();
    if (body == null)
    {
        return "";
    }
    return (String)body.getObject("Text");
}
/**
 * Reads the "Text" object of the named frame from an ID3v2 tag,
 * tolerating a missing frame or a frame without a body.
 *
 * @param id3v2 the ID3v2 tag to query
 * @param name the identifier of the frame to look up
 * @return the frame body's "Text" object cast to a String, or an empty
 *         string when the frame or its body is absent
 */
private String getID3V2Value(AbstractID3v2 id3v2, String name)
{
    AbstractID3v2Frame frame = id3v2.getFrame(name);
    if (frame == null)
    {
        return "";
    }
    AbstractMP3FragmentBody body = frame.getBody();
    if (body == null)
    {
        return "";
    }
    return (String)body.getObject("Text");
}
}