mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Tika for metadata extraction
Convert some more metadata extractors to use Tika, and enable the Tika auto-detection parser for any document without an explicitly defined extractor. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20667 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -18,23 +18,15 @@
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.util.TempFileProvider;
|
||||
import org.farng.mp3.AbstractMP3FragmentBody;
|
||||
import org.farng.mp3.MP3File;
|
||||
import org.farng.mp3.id3.AbstractID3v2;
|
||||
import org.farng.mp3.id3.AbstractID3v2Frame;
|
||||
import org.farng.mp3.id3.ID3v1;
|
||||
import org.farng.mp3.lyrics3.AbstractLyrics3;
|
||||
import org.farng.mp3.lyrics3.Lyrics3v2;
|
||||
import org.farng.mp3.lyrics3.Lyrics3v2Field;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.XMPDM;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.mp3.Mp3Parser;
|
||||
|
||||
/**
|
||||
* Extracts the following values from MP3 files:
|
||||
@@ -51,13 +43,15 @@ import org.farng.mp3.lyrics3.Lyrics3v2Field;
|
||||
* <b>lyrics:</b> -- {music}lyrics
|
||||
* </pre>
|
||||
*
|
||||
* TIKA Note - title and author go in metadata, but much of the
|
||||
* rest is only in the text. Some of the ID3v2 parts
|
||||
* (composer, lyrics) are not yet implemented.
|
||||
* TODO Get hold of an MP3 file with some lyrics in it, so we
|
||||
* can contribute the patch to Tika
|
||||
*
|
||||
* Uses Apache Tika
|
||||
*
|
||||
* @author Nick Burch
|
||||
* @author Roy Wetherall
|
||||
*/
|
||||
public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
public class MP3MetadataExtracter extends TikaPoweredMetadataExtracter
|
||||
{
|
||||
private static final String KEY_SONG_TITLE = "songTitle";
|
||||
private static final String KEY_ALBUM_TITLE = "albumTitle";
|
||||
@@ -70,173 +64,67 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
private static final String KEY_COMPOSER = "composer";
|
||||
private static final String KEY_LYRICS = "lyrics";
|
||||
|
||||
public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_MP3 };
|
||||
public static ArrayList<String> SUPPORTED_MIMETYPES = buildSupportedMimetypes(
|
||||
new String[] { MimetypeMap.MIMETYPE_MP3 },
|
||||
new Mp3Parser()
|
||||
);
|
||||
|
||||
public MP3MetadataExtracter()
|
||||
{
|
||||
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
|
||||
super(SUPPORTED_MIMETYPES);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Parser getParser() {
|
||||
return new Mp3Parser();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
|
||||
{
|
||||
Map<String, Serializable> rawProperties = newRawMap();
|
||||
|
||||
// Create a temp file
|
||||
File tempFile = TempFileProvider.createTempFile("MP3MetadataExtracter_", ".tmp");
|
||||
try
|
||||
{
|
||||
reader.getContent(tempFile);
|
||||
|
||||
// Create the MP3 object from the file
|
||||
// Open it read only as we won't make any changes
|
||||
MP3File mp3File = new MP3File(tempFile, false);
|
||||
|
||||
ID3v1 id3v1 = mp3File.getID3v1Tag();
|
||||
if (id3v1 != null)
|
||||
{
|
||||
putRawValue(KEY_ALBUM_TITLE, id3v1.getAlbum(), rawProperties);
|
||||
putRawValue(KEY_SONG_TITLE, id3v1.getTitle(), rawProperties);
|
||||
putRawValue(KEY_ARTIST, id3v1.getArtist(), rawProperties);
|
||||
putRawValue(KEY_COMMENT, id3v1.getComment(), rawProperties);
|
||||
putRawValue(KEY_YEAR_RELEASED, id3v1.getYear(), rawProperties);
|
||||
|
||||
// TODO sort out the genre
|
||||
//putRawValue(MusicModel.KEY_GENRE, id3v1.getGenre());
|
||||
|
||||
// TODO sort out the size
|
||||
//putRawValue(MusicModel.KEY_SIZE, id3v1.getSize());
|
||||
}
|
||||
|
||||
AbstractID3v2 id3v2 = mp3File.getID3v2Tag();
|
||||
if (id3v2 != null)
|
||||
{
|
||||
putRawValue(KEY_SONG_TITLE, getID3V2Value(id3v2, "TIT2"), rawProperties);
|
||||
putRawValue(KEY_ARTIST, getID3V2Value(id3v2, "TPE1"), rawProperties);
|
||||
putRawValue(KEY_ALBUM_TITLE, getID3V2Value(id3v2, "TALB"), rawProperties);
|
||||
putRawValue(KEY_YEAR_RELEASED, getID3V2Value(id3v2, "TDRC"), rawProperties);
|
||||
putRawValue(KEY_COMMENT, getID3V2Value(id3v2, "COMM"), rawProperties);
|
||||
putRawValue(KEY_TRACK_NUMBER, getID3V2Value(id3v2, "TRCK"), rawProperties);
|
||||
putRawValue(KEY_GENRE, getID3V2Value(id3v2, "TCON"), rawProperties);
|
||||
putRawValue(KEY_COMPOSER, getID3V2Value(id3v2, "TCOM"), rawProperties);
|
||||
|
||||
// TODO sort out the lyrics
|
||||
//System.out.println("Lyrics: " + getID3V2Value(id3v2, "SYLT"));
|
||||
//System.out.println("Lyrics: " + getID3V2Value(id3v2, "USLT"));
|
||||
}
|
||||
|
||||
AbstractLyrics3 lyrics3Tag = mp3File.getLyrics3Tag();
|
||||
if (lyrics3Tag != null)
|
||||
{
|
||||
System.out.println("Lyrics3 tag found.");
|
||||
if (lyrics3Tag instanceof Lyrics3v2)
|
||||
{
|
||||
putRawValue(KEY_SONG_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TIT2"), rawProperties);
|
||||
putRawValue(KEY_ARTIST, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TPE1"), rawProperties);
|
||||
putRawValue(KEY_ALBUM_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TALB"), rawProperties);
|
||||
putRawValue(KEY_COMMENT, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "COMM"), rawProperties);
|
||||
putRawValue(KEY_LYRICS, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "SYLT"), rawProperties);
|
||||
putRawValue(KEY_COMPOSER, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TCOM"), rawProperties);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
catch(Exception e)
|
||||
{
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug(
|
||||
"MP3 Metadata extraction failed: \n" +
|
||||
" Content: " + reader,
|
||||
e);
|
||||
}
|
||||
else
|
||||
{
|
||||
logger.warn(
|
||||
"MP3 Metadata extraction failed (turn on DEBUG for full error): \n" +
|
||||
" Content: " + reader + "\n" +
|
||||
" Failure: " + e.getMessage());
|
||||
}
|
||||
|
||||
}
|
||||
finally
|
||||
{
|
||||
tempFile.delete();
|
||||
}
|
||||
|
||||
String description = getDescription(rawProperties);
|
||||
if (description != null)
|
||||
{
|
||||
putRawValue(KEY_DESCRIPTION, description, rawProperties);
|
||||
}
|
||||
|
||||
// Done
|
||||
return rawProperties;
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties) {
|
||||
putRawValue(KEY_ALBUM_TITLE, metadata.get(XMPDM.ALBUM), properties);
|
||||
putRawValue(KEY_SONG_TITLE, metadata.get(Metadata.TITLE), properties);
|
||||
putRawValue(KEY_ARTIST, metadata.get(XMPDM.ARTIST), properties);
|
||||
putRawValue(KEY_COMMENT, metadata.get(XMPDM.LOG_COMMENT), properties);
|
||||
putRawValue(KEY_TRACK_NUMBER, metadata.get(XMPDM.TRACK_NUMBER), properties);
|
||||
putRawValue(KEY_GENRE, metadata.get(XMPDM.GENRE), properties);
|
||||
putRawValue(KEY_YEAR_RELEASED, metadata.get(XMPDM.RELEASE_DATE), properties);
|
||||
putRawValue(KEY_COMPOSER, metadata.get(XMPDM.COMPOSER), properties);
|
||||
// TODO lyrics
|
||||
//putRawValue(KEY_LYRICS, getLyrics(), properties);
|
||||
|
||||
putRawValue(KEY_DESCRIPTION, generateDescription(metadata), properties);
|
||||
|
||||
return properties;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Generate the description
|
||||
*
|
||||
* @param props the properties extracted from the file
|
||||
* @return the description
|
||||
*/
|
||||
private String getDescription(Map<String, Serializable> props)
|
||||
private String generateDescription(Metadata metadata)
|
||||
{
|
||||
StringBuilder result = new StringBuilder();
|
||||
if (props.get(KEY_SONG_TITLE) != null)
|
||||
if (metadata.get(Metadata.TITLE) != null)
|
||||
{
|
||||
result.append(props.get(KEY_SONG_TITLE));
|
||||
if (props.get(KEY_ALBUM_TITLE) != null)
|
||||
result.append(metadata.get(Metadata.TITLE));
|
||||
if (metadata.get(XMPDM.ALBUM) != null)
|
||||
{
|
||||
result
|
||||
.append(" - ")
|
||||
.append(props.get(KEY_ALBUM_TITLE));
|
||||
.append(metadata.get(XMPDM.ALBUM));
|
||||
}
|
||||
if (props.get(KEY_ARTIST) != null)
|
||||
if (metadata.get(XMPDM.ARTIST) != null)
|
||||
{
|
||||
result
|
||||
.append(" (")
|
||||
.append(props.get(KEY_ARTIST))
|
||||
.append(metadata.get(XMPDM.ARTIST))
|
||||
.append(")");
|
||||
}
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
private String getLyrics3v2Value(Lyrics3v2 lyrics3Tag, String name)
|
||||
{
|
||||
String result = "";
|
||||
Lyrics3v2Field field = lyrics3Tag.getField(name);
|
||||
if (field != null)
|
||||
{
|
||||
AbstractMP3FragmentBody body = field.getBody();
|
||||
if (body != null)
|
||||
{
|
||||
result = (String)body.getObject("Text");
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the ID3V2 tag value in a safe way
|
||||
*/
|
||||
private String getID3V2Value(AbstractID3v2 id3v2, String name)
|
||||
{
|
||||
String result = "";
|
||||
|
||||
AbstractID3v2Frame frame = id3v2.getFrame(name);
|
||||
if (frame != null)
|
||||
{
|
||||
AbstractMP3FragmentBody body = frame.getBody();
|
||||
if (body != null)
|
||||
{
|
||||
result = (String)body.getObject("Text");
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user