mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Tika for metadata extraction
Convert some more metadata extractors to use Tika, and enable the use of the Tika auto-detection parser on any documents without an explicitly defined extractor. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20667 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -200,13 +200,15 @@
|
|||||||
</bean>
|
</bean>
|
||||||
|
|
||||||
<!-- Content Metadata Extractors -->
|
<!-- Content Metadata Extractors -->
|
||||||
|
<!-- The last one listed for any mimetype will be used if available -->
|
||||||
|
<!-- As such, the Tika auto-detect fallback should be listed first -->
|
||||||
|
<bean id="extracter.TikaAuto" class="org.alfresco.repo.content.metadata.TikaAutoMetadataExtracter" parent="baseMetadataExtracter" />
|
||||||
<bean id="extracter.PDFBox" class="org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter" parent="baseMetadataExtracter" />
|
<bean id="extracter.PDFBox" class="org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter" parent="baseMetadataExtracter" />
|
||||||
<bean id="extracter.Poi" class="org.alfresco.repo.content.metadata.PoiMetadataExtracter" parent="baseMetadataExtracter" />
|
<bean id="extracter.Poi" class="org.alfresco.repo.content.metadata.PoiMetadataExtracter" parent="baseMetadataExtracter" />
|
||||||
<bean id="extracter.Office" class="org.alfresco.repo.content.metadata.OfficeMetadataExtracter" parent="baseMetadataExtracter" />
|
<bean id="extracter.Office" class="org.alfresco.repo.content.metadata.OfficeMetadataExtracter" parent="baseMetadataExtracter" />
|
||||||
<bean id="extracter.Mail" class="org.alfresco.repo.content.metadata.MailMetadataExtracter" parent="baseMetadataExtracter" />
|
<bean id="extracter.Mail" class="org.alfresco.repo.content.metadata.MailMetadataExtracter" parent="baseMetadataExtracter" />
|
||||||
<bean id="extracter.Html" class="org.alfresco.repo.content.metadata.HtmlMetadataExtracter" parent="baseMetadataExtracter" />
|
<bean id="extracter.Html" class="org.alfresco.repo.content.metadata.HtmlMetadataExtracter" parent="baseMetadataExtracter" />
|
||||||
<!-- Unsupported experimental extractor commented out -->
|
<bean id="extracter.MP3" class="org.alfresco.repo.content.metadata.MP3MetadataExtracter" parent="baseMetadataExtracter" />
|
||||||
<!-- <bean id="extracter.MP3" class="org.alfresco.repo.content.metadata.MP3MetadataExtracter" parent="baseMetadataExtracter" /> -->
|
|
||||||
<bean id="extracter.OpenDocument" class="org.alfresco.repo.content.metadata.OpenDocumentMetadataExtracter" parent="baseMetadataExtracter" />
|
<bean id="extracter.OpenDocument" class="org.alfresco.repo.content.metadata.OpenDocumentMetadataExtracter" parent="baseMetadataExtracter" />
|
||||||
<bean id="extracter.DWG" class="org.alfresco.repo.content.metadata.DWGMetadataExtracter" parent="baseMetadataExtracter" />
|
<bean id="extracter.DWG" class="org.alfresco.repo.content.metadata.DWGMetadataExtracter" parent="baseMetadataExtracter" />
|
||||||
<bean id="extracter.RFC822" class="org.alfresco.repo.content.metadata.RFC822MetadataExtracter" parent="baseMetadataExtracter" >
|
<bean id="extracter.RFC822" class="org.alfresco.repo.content.metadata.RFC822MetadataExtracter" parent="baseMetadataExtracter" >
|
||||||
|
@@ -199,6 +199,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
|
||||||
* @return Returns <code>1.0</code> if the mimetype is supported, otherwise <tt>0.0</tt>
|
* @return Returns <code>1.0</code> if the mimetype is supported, otherwise <tt>0.0</tt>
|
||||||
*
|
*
|
||||||
* @see #isSupported(String)
|
* @see #isSupported(String)
|
||||||
@@ -209,10 +210,11 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set the policy to use when existing values are encountered. Depending on how the extracer
|
* Set the policy to use when existing values are encountered. Depending on how the extractor
|
||||||
* is called, this may not be relevant, i.e. an empty map of existing properties may be passed
|
* is called, this may not be relevant, i.e. an empty map of existing properties may be passed
|
||||||
* in by the client code, which may follow its own overwrite strategy.
|
* in by the client code, which may follow its own overwrite strategy.
|
||||||
*
|
*
|
||||||
|
* TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
|
||||||
* @param overwritePolicy the policy to apply when there are existing system properties
|
* @param overwritePolicy the policy to apply when there are existing system properties
|
||||||
*/
|
*/
|
||||||
public void setOverwritePolicy(OverwritePolicy overwritePolicy)
|
public void setOverwritePolicy(OverwritePolicy overwritePolicy)
|
||||||
@@ -221,10 +223,11 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set the policy to use when existing values are encountered. Depending on how the extracer
|
* Set the policy to use when existing values are encountered. Depending on how the extractor
|
||||||
* is called, this may not be relevant, i.e. an empty map of existing properties may be passed
|
* is called, this may not be relevant, i.e. an empty map of existing properties may be passed
|
||||||
* in by the client code, which may follow its own overwrite strategy.
|
* in by the client code, which may follow its own overwrite strategy.
|
||||||
*
|
*
|
||||||
|
* TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
|
||||||
* @param overwritePolicyStr the policy to apply when there are existing system properties
|
* @param overwritePolicyStr the policy to apply when there are existing system properties
|
||||||
*/
|
*/
|
||||||
public void setOverwritePolicy(String overwritePolicyStr)
|
public void setOverwritePolicy(String overwritePolicyStr)
|
||||||
|
@@ -18,23 +18,15 @@
|
|||||||
*/
|
*/
|
||||||
package org.alfresco.repo.content.metadata;
|
package org.alfresco.repo.content.metadata;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.Arrays;
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import org.alfresco.repo.content.MimetypeMap;
|
import org.alfresco.repo.content.MimetypeMap;
|
||||||
import org.alfresco.service.cmr.repository.ContentReader;
|
import org.apache.tika.metadata.Metadata;
|
||||||
import org.alfresco.util.TempFileProvider;
|
import org.apache.tika.metadata.XMPDM;
|
||||||
import org.farng.mp3.AbstractMP3FragmentBody;
|
import org.apache.tika.parser.Parser;
|
||||||
import org.farng.mp3.MP3File;
|
import org.apache.tika.parser.mp3.Mp3Parser;
|
||||||
import org.farng.mp3.id3.AbstractID3v2;
|
|
||||||
import org.farng.mp3.id3.AbstractID3v2Frame;
|
|
||||||
import org.farng.mp3.id3.ID3v1;
|
|
||||||
import org.farng.mp3.lyrics3.AbstractLyrics3;
|
|
||||||
import org.farng.mp3.lyrics3.Lyrics3v2;
|
|
||||||
import org.farng.mp3.lyrics3.Lyrics3v2Field;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extracts the following values from MP3 files:
|
* Extracts the following values from MP3 files:
|
||||||
@@ -51,13 +43,15 @@ import org.farng.mp3.lyrics3.Lyrics3v2Field;
|
|||||||
* <b>lyrics:</b> -- {music}lyrics
|
* <b>lyrics:</b> -- {music}lyrics
|
||||||
* </pre>
|
* </pre>
|
||||||
*
|
*
|
||||||
* TIKA Note - title and author go in metadata, but much of the
|
* TODO Get hold of a mp3 file with some lyrics in it, so we
|
||||||
* rest is only in the text. Some of the ID3v2 parts
|
* can contribute the patch to Tika
|
||||||
* (composer, lyrics) are not yet implemented.
|
|
||||||
*
|
*
|
||||||
|
* Uses Apache Tika
|
||||||
|
*
|
||||||
|
* @author Nick Burch
|
||||||
* @author Roy Wetherall
|
* @author Roy Wetherall
|
||||||
*/
|
*/
|
||||||
public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
|
public class MP3MetadataExtracter extends TikaPoweredMetadataExtracter
|
||||||
{
|
{
|
||||||
private static final String KEY_SONG_TITLE = "songTitle";
|
private static final String KEY_SONG_TITLE = "songTitle";
|
||||||
private static final String KEY_ALBUM_TITLE = "albumTitle";
|
private static final String KEY_ALBUM_TITLE = "albumTitle";
|
||||||
@@ -70,110 +64,39 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
|
|||||||
private static final String KEY_COMPOSER = "composer";
|
private static final String KEY_COMPOSER = "composer";
|
||||||
private static final String KEY_LYRICS = "lyrics";
|
private static final String KEY_LYRICS = "lyrics";
|
||||||
|
|
||||||
public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_MP3 };
|
public static ArrayList<String> SUPPORTED_MIMETYPES = buildSupportedMimetypes(
|
||||||
|
new String[] { MimetypeMap.MIMETYPE_MP3 },
|
||||||
|
new Mp3Parser()
|
||||||
|
);
|
||||||
|
|
||||||
public MP3MetadataExtracter()
|
public MP3MetadataExtracter()
|
||||||
{
|
{
|
||||||
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
|
super(SUPPORTED_MIMETYPES);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
|
protected Parser getParser() {
|
||||||
{
|
return new Mp3Parser();
|
||||||
Map<String, Serializable> rawProperties = newRawMap();
|
|
||||||
|
|
||||||
// Create a temp file
|
|
||||||
File tempFile = TempFileProvider.createTempFile("MP3MetadataExtracter_", ".tmp");
|
|
||||||
try
|
|
||||||
{
|
|
||||||
reader.getContent(tempFile);
|
|
||||||
|
|
||||||
// Create the MP3 object from the file
|
|
||||||
// Open it read only as we won't make any changes
|
|
||||||
MP3File mp3File = new MP3File(tempFile, false);
|
|
||||||
|
|
||||||
ID3v1 id3v1 = mp3File.getID3v1Tag();
|
|
||||||
if (id3v1 != null)
|
|
||||||
{
|
|
||||||
putRawValue(KEY_ALBUM_TITLE, id3v1.getAlbum(), rawProperties);
|
|
||||||
putRawValue(KEY_SONG_TITLE, id3v1.getTitle(), rawProperties);
|
|
||||||
putRawValue(KEY_ARTIST, id3v1.getArtist(), rawProperties);
|
|
||||||
putRawValue(KEY_COMMENT, id3v1.getComment(), rawProperties);
|
|
||||||
putRawValue(KEY_YEAR_RELEASED, id3v1.getYear(), rawProperties);
|
|
||||||
|
|
||||||
// TODO sort out the genre
|
|
||||||
//putRawValue(MusicModel.KEY_GENRE, id3v1.getGenre());
|
|
||||||
|
|
||||||
// TODO sort out the size
|
|
||||||
//putRawValue(MusicModel.KEY_SIZE, id3v1.getSize());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
AbstractID3v2 id3v2 = mp3File.getID3v2Tag();
|
@Override
|
||||||
if (id3v2 != null)
|
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||||
{
|
Map<String, Serializable> properties) {
|
||||||
putRawValue(KEY_SONG_TITLE, getID3V2Value(id3v2, "TIT2"), rawProperties);
|
putRawValue(KEY_ALBUM_TITLE, metadata.get(XMPDM.ALBUM), properties);
|
||||||
putRawValue(KEY_ARTIST, getID3V2Value(id3v2, "TPE1"), rawProperties);
|
putRawValue(KEY_SONG_TITLE, metadata.get(Metadata.TITLE), properties);
|
||||||
putRawValue(KEY_ALBUM_TITLE, getID3V2Value(id3v2, "TALB"), rawProperties);
|
putRawValue(KEY_ARTIST, metadata.get(XMPDM.ARTIST), properties);
|
||||||
putRawValue(KEY_YEAR_RELEASED, getID3V2Value(id3v2, "TDRC"), rawProperties);
|
putRawValue(KEY_COMMENT, metadata.get(XMPDM.LOG_COMMENT), properties);
|
||||||
putRawValue(KEY_COMMENT, getID3V2Value(id3v2, "COMM"), rawProperties);
|
putRawValue(KEY_TRACK_NUMBER, metadata.get(XMPDM.TRACK_NUMBER), properties);
|
||||||
putRawValue(KEY_TRACK_NUMBER, getID3V2Value(id3v2, "TRCK"), rawProperties);
|
putRawValue(KEY_GENRE, metadata.get(XMPDM.GENRE), properties);
|
||||||
putRawValue(KEY_GENRE, getID3V2Value(id3v2, "TCON"), rawProperties);
|
putRawValue(KEY_YEAR_RELEASED, metadata.get(XMPDM.RELEASE_DATE), properties);
|
||||||
putRawValue(KEY_COMPOSER, getID3V2Value(id3v2, "TCOM"), rawProperties);
|
putRawValue(KEY_COMPOSER, metadata.get(XMPDM.COMPOSER), properties);
|
||||||
|
// TODO lyrics
|
||||||
|
//putRawValue(KEY_LYRICS, getLyrics(), properties);
|
||||||
|
|
||||||
// TODO sort out the lyrics
|
putRawValue(KEY_DESCRIPTION, generateDescription(metadata), properties);
|
||||||
//System.out.println("Lyrics: " + getID3V2Value(id3v2, "SYLT"));
|
|
||||||
//System.out.println("Lyrics: " + getID3V2Value(id3v2, "USLT"));
|
|
||||||
}
|
|
||||||
|
|
||||||
AbstractLyrics3 lyrics3Tag = mp3File.getLyrics3Tag();
|
return properties;
|
||||||
if (lyrics3Tag != null)
|
|
||||||
{
|
|
||||||
System.out.println("Lyrics3 tag found.");
|
|
||||||
if (lyrics3Tag instanceof Lyrics3v2)
|
|
||||||
{
|
|
||||||
putRawValue(KEY_SONG_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TIT2"), rawProperties);
|
|
||||||
putRawValue(KEY_ARTIST, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TPE1"), rawProperties);
|
|
||||||
putRawValue(KEY_ALBUM_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TALB"), rawProperties);
|
|
||||||
putRawValue(KEY_COMMENT, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "COMM"), rawProperties);
|
|
||||||
putRawValue(KEY_LYRICS, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "SYLT"), rawProperties);
|
|
||||||
putRawValue(KEY_COMPOSER, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TCOM"), rawProperties);
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
catch(Exception e)
|
|
||||||
{
|
|
||||||
if (logger.isDebugEnabled())
|
|
||||||
{
|
|
||||||
logger.debug(
|
|
||||||
"MP3 Metadata extraction failed: \n" +
|
|
||||||
" Content: " + reader,
|
|
||||||
e);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
logger.warn(
|
|
||||||
"MP3 Metadata extraction failed (turn on DEBUG for full error): \n" +
|
|
||||||
" Content: " + reader + "\n" +
|
|
||||||
" Failure: " + e.getMessage());
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
finally
|
|
||||||
{
|
|
||||||
tempFile.delete();
|
|
||||||
}
|
|
||||||
|
|
||||||
String description = getDescription(rawProperties);
|
|
||||||
if (description != null)
|
|
||||||
{
|
|
||||||
putRawValue(KEY_DESCRIPTION, description, rawProperties);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Done
|
|
||||||
return rawProperties;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate the description
|
* Generate the description
|
||||||
@@ -181,62 +104,27 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
|
|||||||
* @param props the properties extracted from the file
|
* @param props the properties extracted from the file
|
||||||
* @return the description
|
* @return the description
|
||||||
*/
|
*/
|
||||||
private String getDescription(Map<String, Serializable> props)
|
private String generateDescription(Metadata metadata)
|
||||||
{
|
{
|
||||||
StringBuilder result = new StringBuilder();
|
StringBuilder result = new StringBuilder();
|
||||||
if (props.get(KEY_SONG_TITLE) != null)
|
if (metadata.get(Metadata.TITLE) != null)
|
||||||
{
|
{
|
||||||
result.append(props.get(KEY_SONG_TITLE));
|
result.append(metadata.get(Metadata.TITLE));
|
||||||
if (props.get(KEY_ALBUM_TITLE) != null)
|
if (metadata.get(XMPDM.ALBUM) != null)
|
||||||
{
|
{
|
||||||
result
|
result
|
||||||
.append(" - ")
|
.append(" - ")
|
||||||
.append(props.get(KEY_ALBUM_TITLE));
|
.append(metadata.get(XMPDM.ALBUM));
|
||||||
}
|
}
|
||||||
if (props.get(KEY_ARTIST) != null)
|
if (metadata.get(XMPDM.ARTIST) != null)
|
||||||
{
|
{
|
||||||
result
|
result
|
||||||
.append(" (")
|
.append(" (")
|
||||||
.append(props.get(KEY_ARTIST))
|
.append(metadata.get(XMPDM.ARTIST))
|
||||||
.append(")");
|
.append(")");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return result.toString();
|
return result.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
private String getLyrics3v2Value(Lyrics3v2 lyrics3Tag, String name)
|
|
||||||
{
|
|
||||||
String result = "";
|
|
||||||
Lyrics3v2Field field = lyrics3Tag.getField(name);
|
|
||||||
if (field != null)
|
|
||||||
{
|
|
||||||
AbstractMP3FragmentBody body = field.getBody();
|
|
||||||
if (body != null)
|
|
||||||
{
|
|
||||||
result = (String)body.getObject("Text");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get the ID3V2 tag value in a safe way
|
|
||||||
*/
|
|
||||||
private String getID3V2Value(AbstractID3v2 id3v2, String name)
|
|
||||||
{
|
|
||||||
String result = "";
|
|
||||||
|
|
||||||
AbstractID3v2Frame frame = id3v2.getFrame(name);
|
|
||||||
if (frame != null)
|
|
||||||
{
|
|
||||||
AbstractMP3FragmentBody body = frame.getBody();
|
|
||||||
if (body != null)
|
|
||||||
{
|
|
||||||
result = (String)body.getObject("Text");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@@ -29,6 +29,9 @@ import java.util.Map;
|
|||||||
import org.alfresco.repo.content.MimetypeMap;
|
import org.alfresco.repo.content.MimetypeMap;
|
||||||
import org.alfresco.service.cmr.repository.ContentReader;
|
import org.alfresco.service.cmr.repository.ContentReader;
|
||||||
import org.apache.poi.hsmf.MAPIMessage;
|
import org.apache.poi.hsmf.MAPIMessage;
|
||||||
|
import org.apache.tika.metadata.Metadata;
|
||||||
|
import org.apache.tika.parser.Parser;
|
||||||
|
//import org.apache.tika.parser.microsoft.OutlookExtractor; // TODO fix import
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Outlook MAPI format email meta-data extractor extracting the following values:
|
* Outlook MAPI format email meta-data extractor extracting the following values:
|
||||||
@@ -64,9 +67,23 @@ public class MailMetadataExtracter extends TikaPoweredMetadataExtracter
|
|||||||
super(SUPPORTED_MIMETYPES);
|
super(SUPPORTED_MIMETYPES);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Parser getParser() {
|
||||||
|
//return new OutlookExtractor(); // TODO fix import
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||||
|
Map<String, Serializable> properties) {
|
||||||
|
// TODO move things from extractRaw to here
|
||||||
|
return properties;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
|
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
|
||||||
{
|
{
|
||||||
|
// TODO remove this in favour of extractSpecific
|
||||||
final Map<String, Serializable> rawProperties = newRawMap();
|
final Map<String, Serializable> rawProperties = newRawMap();
|
||||||
|
|
||||||
InputStream is = null;
|
InputStream is = null;
|
||||||
|
@@ -18,12 +18,8 @@
|
|||||||
*/
|
*/
|
||||||
package org.alfresco.repo.content.metadata;
|
package org.alfresco.repo.content.metadata;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import org.alfresco.repo.content.MimetypeMap;
|
import org.alfresco.repo.content.MimetypeMap;
|
||||||
|
@@ -39,6 +39,7 @@ import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
|||||||
*
|
*
|
||||||
* Uses Apache Tika
|
* Uses Apache Tika
|
||||||
*
|
*
|
||||||
|
* @author Nick Burch
|
||||||
* @author Neil McErlean
|
* @author Neil McErlean
|
||||||
*/
|
*/
|
||||||
public class PoiMetadataExtracter extends TikaPoweredMetadataExtracter
|
public class PoiMetadataExtracter extends TikaPoweredMetadataExtracter
|
||||||
|
@@ -0,0 +1,73 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||||
|
*
|
||||||
|
* This file is part of Alfresco
|
||||||
|
*
|
||||||
|
* Alfresco is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Alfresco is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
package org.alfresco.repo.content.metadata;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.tika.mime.MediaType;
|
||||||
|
import org.apache.tika.parser.AutoDetectParser;
|
||||||
|
import org.apache.tika.parser.Parser;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A Metadata Extractor which makes use of the Apache
|
||||||
|
* Tika auto-detection to select the best parser
|
||||||
|
* to extract the metadata from your document.
|
||||||
|
* This will be used for all files which Tika can
|
||||||
|
* handle, but where no other more explicit
|
||||||
|
* extractor is defined.
|
||||||
|
|
||||||
|
* <pre>
|
||||||
|
* <b>author:</b> -- cm:author
|
||||||
|
* <b>title:</b> -- cm:title
|
||||||
|
* <b>subject:</b> -- cm:description
|
||||||
|
* <b>created:</b> -- cm:created
|
||||||
|
* <b>comments:</b>
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
* @author Nick Burch
|
||||||
|
*/
|
||||||
|
public class TikaAutoMetadataExtracter extends TikaPoweredMetadataExtracter
|
||||||
|
{
|
||||||
|
protected static Log logger = LogFactory.getLog(TikaAutoMetadataExtracter.class);
|
||||||
|
|
||||||
|
public static ArrayList<String> SUPPORTED_MIMETYPES;
|
||||||
|
static {
|
||||||
|
SUPPORTED_MIMETYPES = new ArrayList<String>();
|
||||||
|
AutoDetectParser p = new AutoDetectParser();
|
||||||
|
for(MediaType mt : p.getParsers().keySet()) {
|
||||||
|
SUPPORTED_MIMETYPES.add( mt.toString() );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public TikaAutoMetadataExtracter()
|
||||||
|
{
|
||||||
|
super(SUPPORTED_MIMETYPES);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Does auto-detection to select the best Tika
|
||||||
|
* Parser.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected Parser getParser() {
|
||||||
|
return new AutoDetectParser();
|
||||||
|
}
|
||||||
|
}
|
@@ -0,0 +1,18 @@
|
|||||||
|
#
|
||||||
|
# TikaAutoMetadataExtracter - default mapping
|
||||||
|
#
|
||||||
|
# This is used to map from the Tika and standard namespaces
|
||||||
|
# onto your content model. This will be used for any
|
||||||
|
# content for which an explicit extractor isn't defined,
|
||||||
|
# by using Tika's auto-selection facilities.
|
||||||
|
#
|
||||||
|
# author: Nick Burch
|
||||||
|
|
||||||
|
# Namespaces
|
||||||
|
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||||
|
|
||||||
|
# Mappings
|
||||||
|
author=cm:author
|
||||||
|
title=cm:title
|
||||||
|
description=cm:description
|
||||||
|
created=cm:created
|
@@ -0,0 +1,185 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||||
|
*
|
||||||
|
* This file is part of Alfresco
|
||||||
|
*
|
||||||
|
* Alfresco is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Alfresco is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
package org.alfresco.repo.content.metadata;
|
||||||
|
|
||||||
|
import java.io.BufferedInputStream;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.alfresco.model.ContentModel;
|
||||||
|
import org.alfresco.repo.content.MimetypeMap;
|
||||||
|
import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
|
||||||
|
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
|
||||||
|
import org.alfresco.service.namespace.QName;
|
||||||
|
import org.apache.tika.metadata.Metadata;
|
||||||
|
import org.apache.tika.mime.MediaType;
|
||||||
|
import org.apache.tika.parser.AutoDetectParser;
|
||||||
|
import org.apache.tika.parser.ParseContext;
|
||||||
|
import org.apache.tika.parser.Parser;
|
||||||
|
import org.apache.tika.parser.dwg.DWGParser;
|
||||||
|
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||||
|
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||||
|
import org.apache.tika.parser.mp3.Mp3Parser;
|
||||||
|
import org.apache.tika.parser.odf.OpenDocumentParser;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @see TikaAutoMetadataExtracter
|
||||||
|
*
|
||||||
|
* @author Nick Burch
|
||||||
|
*/
|
||||||
|
public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||||
|
{
|
||||||
|
private TikaAutoMetadataExtracter extracter;
|
||||||
|
private static final QName TIKA_MIMETYPE_TEST_PROPERTY =
|
||||||
|
QName.createQName("TikaMimeTypeTestProp");
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setUp() throws Exception
|
||||||
|
{
|
||||||
|
super.setUp();
|
||||||
|
extracter = new TikaAutoMetadataExtracter();
|
||||||
|
extracter.setDictionaryService(dictionaryService);
|
||||||
|
extracter.register();
|
||||||
|
|
||||||
|
// Attach some extra mappings, using the Tika
|
||||||
|
// metadata keys namespace
|
||||||
|
// These will be tested later
|
||||||
|
HashMap<String, Set<QName>> newMap = new HashMap<String, Set<QName>>(
|
||||||
|
extracter.getMapping()
|
||||||
|
);
|
||||||
|
|
||||||
|
Set<QName> tlaSet = new HashSet<QName>();
|
||||||
|
tlaSet.add(TIKA_MIMETYPE_TEST_PROPERTY);
|
||||||
|
newMap.put( Metadata.CONTENT_TYPE, tlaSet );
|
||||||
|
|
||||||
|
extracter.setMapping(newMap);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return Returns the same transformer regardless - it is allowed
|
||||||
|
*/
|
||||||
|
protected MetadataExtracter getExtracter()
|
||||||
|
{
|
||||||
|
return extracter;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSupports() throws Exception
|
||||||
|
{
|
||||||
|
ArrayList<String> mimeTypes = new ArrayList<String>();
|
||||||
|
for (Parser p : new Parser[] {
|
||||||
|
new OfficeParser(), new OpenDocumentParser(),
|
||||||
|
new Mp3Parser(), new OOXMLParser()
|
||||||
|
}) {
|
||||||
|
Set<MediaType> mts = p.getSupportedTypes(new ParseContext());
|
||||||
|
for (MediaType mt : mts) {
|
||||||
|
mimeTypes.add(mt.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String mimetype : mimeTypes)
|
||||||
|
{
|
||||||
|
boolean supports = extracter.isSupported(mimetype);
|
||||||
|
assertTrue("Mimetype should be supported: " + mimetype, supports);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test several different files
|
||||||
|
* Note - doesn't use extractFromMimetype
|
||||||
|
*/
|
||||||
|
public void testSupportedMimetypes() throws Exception
|
||||||
|
{
|
||||||
|
String[] testFiles = new String[] {
|
||||||
|
".doc", ".docx", ".xls", ".xlsx",
|
||||||
|
".ppt", ".pptx",
|
||||||
|
//".vsd", // Not auto-detected properly yet
|
||||||
|
//"2010.dwg", // Not auto-detected properly yet
|
||||||
|
".pdf",
|
||||||
|
".odt"
|
||||||
|
};
|
||||||
|
|
||||||
|
for (String fileBase : testFiles)
|
||||||
|
{
|
||||||
|
String filename = "quick" + fileBase;
|
||||||
|
URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename);
|
||||||
|
File file = new File(url.getFile());
|
||||||
|
|
||||||
|
// Cheat and ask Tika for the mime type!
|
||||||
|
AutoDetectParser ap = new AutoDetectParser();
|
||||||
|
Metadata metadata = new Metadata();
|
||||||
|
metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
|
||||||
|
MediaType mt = ap.getDetector().detect(
|
||||||
|
new BufferedInputStream(new FileInputStream(file)), metadata);
|
||||||
|
String mimetype = mt.toString();
|
||||||
|
|
||||||
|
// Have it processed
|
||||||
|
Map<QName, Serializable> properties = extractFromFile(file, mimetype);
|
||||||
|
|
||||||
|
// check we got something
|
||||||
|
assertFalse("extractFromMimetype should return at least some properties, " +
|
||||||
|
"none found for " + mimetype + " - " + filename,
|
||||||
|
properties.isEmpty());
|
||||||
|
|
||||||
|
// check common metadata
|
||||||
|
testCommonMetadata(mimetype, properties);
|
||||||
|
// check file-type specific metadata
|
||||||
|
testFileSpecificMetadata(mimetype, properties);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected boolean skipAuthorCheck(String mimetype) { return true; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* We also provide the creation date - check that
|
||||||
|
*/
|
||||||
|
protected void testFileSpecificMetadata(String mimetype,
|
||||||
|
Map<QName, Serializable> properties) {
|
||||||
|
|
||||||
|
// Check for extra fields
|
||||||
|
// Author isn't there for the OpenDocument ones
|
||||||
|
if(mimetype.indexOf(".oasis.") == -1) {
|
||||||
|
assertEquals(
|
||||||
|
"Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
|
||||||
|
"Nevin Nollop",
|
||||||
|
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure that we can also get things which are standard
|
||||||
|
// Tika metadata properties, if we so choose to
|
||||||
|
assertTrue(
|
||||||
|
"Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " not found for mimetype " + mimetype,
|
||||||
|
properties.containsKey(TIKA_MIMETYPE_TEST_PROPERTY)
|
||||||
|
);
|
||||||
|
// TODO - uncomment this when TIKA-391 is properly fixed
|
||||||
|
// assertEquals(
|
||||||
|
// "Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
|
||||||
|
// mimetype,
|
||||||
|
// DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_MIMETYPE_TEST_PROPERTY)));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@@ -35,6 +35,7 @@ import org.apache.commons.logging.Log;
|
|||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.tika.metadata.Metadata;
|
import org.apache.tika.metadata.Metadata;
|
||||||
import org.apache.tika.mime.MediaType;
|
import org.apache.tika.mime.MediaType;
|
||||||
|
import org.apache.tika.parser.AutoDetectParser;
|
||||||
import org.apache.tika.parser.ParseContext;
|
import org.apache.tika.parser.ParseContext;
|
||||||
import org.apache.tika.parser.Parser;
|
import org.apache.tika.parser.Parser;
|
||||||
import org.apache.tika.sax.BodyContentHandler;
|
import org.apache.tika.sax.BodyContentHandler;
|
||||||
@@ -136,14 +137,13 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Does auto-detection to select the best Tika
|
* Returns the correct Tika Parser to process
|
||||||
* Parser.
|
* the document.
|
||||||
* Implementations can override this if they
|
* If you don't know which you want, use
|
||||||
* know their specific implementations.
|
* {@link TikaAutoMetadataExtracter} which
|
||||||
|
* makes use of the Tika auto-detection.
|
||||||
*/
|
*/
|
||||||
protected Parser getParser() {
|
protected abstract Parser getParser();
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Allows implementation specific mappings
|
* Allows implementation specific mappings
|
||||||
|
@@ -1,13 +0,0 @@
|
|||||||
#
|
|
||||||
# TikaPoweredMetadataExtracter - default mapping
|
|
||||||
#
|
|
||||||
# author: Nick Burch
|
|
||||||
|
|
||||||
# Namespaces
|
|
||||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
|
||||||
|
|
||||||
# Mappings
|
|
||||||
author=cm:author
|
|
||||||
title=cm:title
|
|
||||||
description=cm:description
|
|
||||||
created=cm:created
|
|
Reference in New Issue
Block a user