mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Tika for metadata extraction
Convert some more metadata extractors to using Tika, and enable the use of the Tika auto-detection parser on any documents without an explicitly defined extractor. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20667 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -200,13 +200,15 @@
|
||||
</bean>
|
||||
|
||||
<!-- Content Metadata Extractors -->
|
||||
<!-- The last one listed for any mimetype will be used if available -->
|
||||
<!-- As such, the Tika auto-detect fallback should be listed first -->
|
||||
<bean id="extracter.TikaAuto" class="org.alfresco.repo.content.metadata.TikaAutoMetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<bean id="extracter.PDFBox" class="org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<bean id="extracter.Poi" class="org.alfresco.repo.content.metadata.PoiMetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<bean id="extracter.Office" class="org.alfresco.repo.content.metadata.OfficeMetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<bean id="extracter.Mail" class="org.alfresco.repo.content.metadata.MailMetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<bean id="extracter.Html" class="org.alfresco.repo.content.metadata.HtmlMetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<!-- Unsupported experimental extractor commented out -->
|
||||
<!-- <bean id="extracter.MP3" class="org.alfresco.repo.content.metadata.MP3MetadataExtracter" parent="baseMetadataExtracter" /> -->
|
||||
<bean id="extracter.MP3" class="org.alfresco.repo.content.metadata.MP3MetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<bean id="extracter.OpenDocument" class="org.alfresco.repo.content.metadata.OpenDocumentMetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<bean id="extracter.DWG" class="org.alfresco.repo.content.metadata.DWGMetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<bean id="extracter.RFC822" class="org.alfresco.repo.content.metadata.RFC822MetadataExtracter" parent="baseMetadataExtracter" >
|
||||
|
@@ -199,6 +199,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
|
||||
* @return Returns <code>1.0</code> if the mimetype is supported, otherwise <tt>0.0</tt>
|
||||
*
|
||||
* @see #isSupported(String)
|
||||
@@ -209,10 +210,11 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the policy to use when existing values are encountered. Depending on how the extracer
|
||||
* Set the policy to use when existing values are encountered. Depending on how the extractor
|
||||
* is called, this may not be relevant, i.e. an empty map of existing properties may be passed
|
||||
* in by the client code, which may follow its own overwrite strategy.
|
||||
*
|
||||
* TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
|
||||
* @param overwritePolicy the policy to apply when there are existing system properties
|
||||
*/
|
||||
public void setOverwritePolicy(OverwritePolicy overwritePolicy)
|
||||
@@ -221,10 +223,11 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the policy to use when existing values are encountered. Depending on how the extracer
|
||||
* Set the policy to use when existing values are encountered. Depending on how the extractor
|
||||
* is called, this may not be relevant, i.e. an empty map of existing properties may be passed
|
||||
* in by the client code, which may follow its own overwrite strategy.
|
||||
*
|
||||
* TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
|
||||
* @param overwritePolicyStr the policy to apply when there are existing system properties
|
||||
*/
|
||||
public void setOverwritePolicy(String overwritePolicyStr)
|
||||
|
@@ -18,23 +18,15 @@
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.util.TempFileProvider;
|
||||
import org.farng.mp3.AbstractMP3FragmentBody;
|
||||
import org.farng.mp3.MP3File;
|
||||
import org.farng.mp3.id3.AbstractID3v2;
|
||||
import org.farng.mp3.id3.AbstractID3v2Frame;
|
||||
import org.farng.mp3.id3.ID3v1;
|
||||
import org.farng.mp3.lyrics3.AbstractLyrics3;
|
||||
import org.farng.mp3.lyrics3.Lyrics3v2;
|
||||
import org.farng.mp3.lyrics3.Lyrics3v2Field;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.XMPDM;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.mp3.Mp3Parser;
|
||||
|
||||
/**
|
||||
* Extracts the following values from MP3 files:
|
||||
@@ -51,13 +43,15 @@ import org.farng.mp3.lyrics3.Lyrics3v2Field;
|
||||
* <b>lyrics:</b> -- {music}lyrics
|
||||
* </pre>
|
||||
*
|
||||
* TIKA Note - title and author go in metadata, but much of the
|
||||
* rest is only in the text. Some of the ID3v2 parts
|
||||
* (composer, lyrics) are not yet implemented.
|
||||
* TODO Get hold of a mp3 file with some lyrics in it, so we
|
||||
* can contribute the patch to Tika
|
||||
*
|
||||
* Uses Apache Tika
|
||||
*
|
||||
* @author Nick Burch
|
||||
* @author Roy Wetherall
|
||||
*/
|
||||
public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
public class MP3MetadataExtracter extends TikaPoweredMetadataExtracter
|
||||
{
|
||||
private static final String KEY_SONG_TITLE = "songTitle";
|
||||
private static final String KEY_ALBUM_TITLE = "albumTitle";
|
||||
@@ -70,110 +64,39 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
private static final String KEY_COMPOSER = "composer";
|
||||
private static final String KEY_LYRICS = "lyrics";
|
||||
|
||||
public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_MP3 };
|
||||
public static ArrayList<String> SUPPORTED_MIMETYPES = buildSupportedMimetypes(
|
||||
new String[] { MimetypeMap.MIMETYPE_MP3 },
|
||||
new Mp3Parser()
|
||||
);
|
||||
|
||||
public MP3MetadataExtracter()
|
||||
{
|
||||
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
|
||||
super(SUPPORTED_MIMETYPES);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
|
||||
{
|
||||
Map<String, Serializable> rawProperties = newRawMap();
|
||||
|
||||
// Create a temp file
|
||||
File tempFile = TempFileProvider.createTempFile("MP3MetadataExtracter_", ".tmp");
|
||||
try
|
||||
{
|
||||
reader.getContent(tempFile);
|
||||
|
||||
// Create the MP3 object from the file
|
||||
// Open it read only as we won't make any changes
|
||||
MP3File mp3File = new MP3File(tempFile, false);
|
||||
|
||||
ID3v1 id3v1 = mp3File.getID3v1Tag();
|
||||
if (id3v1 != null)
|
||||
{
|
||||
putRawValue(KEY_ALBUM_TITLE, id3v1.getAlbum(), rawProperties);
|
||||
putRawValue(KEY_SONG_TITLE, id3v1.getTitle(), rawProperties);
|
||||
putRawValue(KEY_ARTIST, id3v1.getArtist(), rawProperties);
|
||||
putRawValue(KEY_COMMENT, id3v1.getComment(), rawProperties);
|
||||
putRawValue(KEY_YEAR_RELEASED, id3v1.getYear(), rawProperties);
|
||||
|
||||
// TODO sort out the genre
|
||||
//putRawValue(MusicModel.KEY_GENRE, id3v1.getGenre());
|
||||
|
||||
// TODO sort out the size
|
||||
//putRawValue(MusicModel.KEY_SIZE, id3v1.getSize());
|
||||
protected Parser getParser() {
|
||||
return new Mp3Parser();
|
||||
}
|
||||
|
||||
AbstractID3v2 id3v2 = mp3File.getID3v2Tag();
|
||||
if (id3v2 != null)
|
||||
{
|
||||
putRawValue(KEY_SONG_TITLE, getID3V2Value(id3v2, "TIT2"), rawProperties);
|
||||
putRawValue(KEY_ARTIST, getID3V2Value(id3v2, "TPE1"), rawProperties);
|
||||
putRawValue(KEY_ALBUM_TITLE, getID3V2Value(id3v2, "TALB"), rawProperties);
|
||||
putRawValue(KEY_YEAR_RELEASED, getID3V2Value(id3v2, "TDRC"), rawProperties);
|
||||
putRawValue(KEY_COMMENT, getID3V2Value(id3v2, "COMM"), rawProperties);
|
||||
putRawValue(KEY_TRACK_NUMBER, getID3V2Value(id3v2, "TRCK"), rawProperties);
|
||||
putRawValue(KEY_GENRE, getID3V2Value(id3v2, "TCON"), rawProperties);
|
||||
putRawValue(KEY_COMPOSER, getID3V2Value(id3v2, "TCOM"), rawProperties);
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties) {
|
||||
putRawValue(KEY_ALBUM_TITLE, metadata.get(XMPDM.ALBUM), properties);
|
||||
putRawValue(KEY_SONG_TITLE, metadata.get(Metadata.TITLE), properties);
|
||||
putRawValue(KEY_ARTIST, metadata.get(XMPDM.ARTIST), properties);
|
||||
putRawValue(KEY_COMMENT, metadata.get(XMPDM.LOG_COMMENT), properties);
|
||||
putRawValue(KEY_TRACK_NUMBER, metadata.get(XMPDM.TRACK_NUMBER), properties);
|
||||
putRawValue(KEY_GENRE, metadata.get(XMPDM.GENRE), properties);
|
||||
putRawValue(KEY_YEAR_RELEASED, metadata.get(XMPDM.RELEASE_DATE), properties);
|
||||
putRawValue(KEY_COMPOSER, metadata.get(XMPDM.COMPOSER), properties);
|
||||
// TODO lyrics
|
||||
//putRawValue(KEY_LYRICS, getLyrics(), properties);
|
||||
|
||||
// TODO sort out the lyrics
|
||||
//System.out.println("Lyrics: " + getID3V2Value(id3v2, "SYLT"));
|
||||
//System.out.println("Lyrics: " + getID3V2Value(id3v2, "USLT"));
|
||||
}
|
||||
putRawValue(KEY_DESCRIPTION, generateDescription(metadata), properties);
|
||||
|
||||
AbstractLyrics3 lyrics3Tag = mp3File.getLyrics3Tag();
|
||||
if (lyrics3Tag != null)
|
||||
{
|
||||
System.out.println("Lyrics3 tag found.");
|
||||
if (lyrics3Tag instanceof Lyrics3v2)
|
||||
{
|
||||
putRawValue(KEY_SONG_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TIT2"), rawProperties);
|
||||
putRawValue(KEY_ARTIST, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TPE1"), rawProperties);
|
||||
putRawValue(KEY_ALBUM_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TALB"), rawProperties);
|
||||
putRawValue(KEY_COMMENT, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "COMM"), rawProperties);
|
||||
putRawValue(KEY_LYRICS, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "SYLT"), rawProperties);
|
||||
putRawValue(KEY_COMPOSER, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TCOM"), rawProperties);
|
||||
return properties;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
catch(Exception e)
|
||||
{
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug(
|
||||
"MP3 Metadata extraction failed: \n" +
|
||||
" Content: " + reader,
|
||||
e);
|
||||
}
|
||||
else
|
||||
{
|
||||
logger.warn(
|
||||
"MP3 Metadata extraction failed (turn on DEBUG for full error): \n" +
|
||||
" Content: " + reader + "\n" +
|
||||
" Failure: " + e.getMessage());
|
||||
}
|
||||
|
||||
}
|
||||
finally
|
||||
{
|
||||
tempFile.delete();
|
||||
}
|
||||
|
||||
String description = getDescription(rawProperties);
|
||||
if (description != null)
|
||||
{
|
||||
putRawValue(KEY_DESCRIPTION, description, rawProperties);
|
||||
}
|
||||
|
||||
// Done
|
||||
return rawProperties;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Generate the description
|
||||
@@ -181,62 +104,27 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
* @param props the properties extracted from the file
|
||||
* @return the description
|
||||
*/
|
||||
private String getDescription(Map<String, Serializable> props)
|
||||
private String generateDescription(Metadata metadata)
|
||||
{
|
||||
StringBuilder result = new StringBuilder();
|
||||
if (props.get(KEY_SONG_TITLE) != null)
|
||||
if (metadata.get(Metadata.TITLE) != null)
|
||||
{
|
||||
result.append(props.get(KEY_SONG_TITLE));
|
||||
if (props.get(KEY_ALBUM_TITLE) != null)
|
||||
result.append(metadata.get(Metadata.TITLE));
|
||||
if (metadata.get(XMPDM.ALBUM) != null)
|
||||
{
|
||||
result
|
||||
.append(" - ")
|
||||
.append(props.get(KEY_ALBUM_TITLE));
|
||||
.append(metadata.get(XMPDM.ALBUM));
|
||||
}
|
||||
if (props.get(KEY_ARTIST) != null)
|
||||
if (metadata.get(XMPDM.ARTIST) != null)
|
||||
{
|
||||
result
|
||||
.append(" (")
|
||||
.append(props.get(KEY_ARTIST))
|
||||
.append(metadata.get(XMPDM.ARTIST))
|
||||
.append(")");
|
||||
}
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
private String getLyrics3v2Value(Lyrics3v2 lyrics3Tag, String name)
|
||||
{
|
||||
String result = "";
|
||||
Lyrics3v2Field field = lyrics3Tag.getField(name);
|
||||
if (field != null)
|
||||
{
|
||||
AbstractMP3FragmentBody body = field.getBody();
|
||||
if (body != null)
|
||||
{
|
||||
result = (String)body.getObject("Text");
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the ID3V2 tag value in a safe way
|
||||
*/
|
||||
private String getID3V2Value(AbstractID3v2 id3v2, String name)
|
||||
{
|
||||
String result = "";
|
||||
|
||||
AbstractID3v2Frame frame = id3v2.getFrame(name);
|
||||
if (frame != null)
|
||||
{
|
||||
AbstractMP3FragmentBody body = frame.getBody();
|
||||
if (body != null)
|
||||
{
|
||||
result = (String)body.getObject("Text");
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
@@ -29,6 +29,9 @@ import java.util.Map;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.apache.poi.hsmf.MAPIMessage;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.Parser;
|
||||
//import org.apache.tika.parser.microsoft.OutlookExtractor; // TODO fix import
|
||||
|
||||
/**
|
||||
* Outlook MAPI format email meta-data extractor extracting the following values:
|
||||
@@ -64,9 +67,23 @@ public class MailMetadataExtracter extends TikaPoweredMetadataExtracter
|
||||
super(SUPPORTED_MIMETYPES);
|
||||
}
|
||||
|
||||
/**
 * Placeholder implementation: currently returns <code>null</code> rather
 * than a Tika parser, because the intended <code>OutlookExtractor</code>
 * import is not yet resolved (see the TODO below).
 * NOTE(review): this class still overrides <code>extractRaw</code>, which
 * presumably means the null parser is never handed to Tika - confirm
 * before removing that override.
 *
 * @return always <code>null</code> for now
 */
@Override
|
||||
protected Parser getParser() {
|
||||
//return new OutlookExtractor(); // TODO fix import
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * Placeholder implementation: hands back the supplied properties map
 * untouched and does not read anything from the Tika metadata yet.
 * The TODO below records the plan to move the extraction logic that
 * currently lives in <code>extractRaw</code> into this method.
 *
 * @param metadata   the Tika metadata for the document (currently unused)
 * @param properties the properties collected so far
 * @return the same <code>properties</code> map that was passed in
 */
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties) {
|
||||
// TODO move things from extractRaw to here
|
||||
return properties;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
|
||||
{
|
||||
// TODO remove this in favour of extractSpecific
|
||||
final Map<String, Serializable> rawProperties = newRawMap();
|
||||
|
||||
InputStream is = null;
|
||||
|
@@ -18,12 +18,8 @@
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
|
@@ -39,6 +39,7 @@ import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||
*
|
||||
* Uses Apache Tika
|
||||
*
|
||||
* @author Nick Burch
|
||||
* @author Neil McErlean
|
||||
*/
|
||||
public class PoiMetadataExtracter extends TikaPoweredMetadataExtracter
|
||||
|
@@ -0,0 +1,73 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.Parser;
|
||||
|
||||
/**
|
||||
* A Metadata Extractor which makes use of the Apache
|
||||
* Tika auto-detection to select the best parser
|
||||
* to extract the metadata from your document.
|
||||
* This will be used for all files which Tika can
|
||||
* handle, but where no other more explicit
|
||||
* extractor is defined.
|
||||
|
||||
* <pre>
|
||||
* <b>author:</b> -- cm:author
|
||||
* <b>title:</b> -- cm:title
|
||||
* <b>subject:</b> -- cm:description
|
||||
* <b>created:</b> -- cm:created
|
||||
* <b>comments:</b>
|
||||
* </pre>
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public class TikaAutoMetadataExtracter extends TikaPoweredMetadataExtracter
|
||||
{
|
||||
protected static Log logger = LogFactory.getLog(TikaAutoMetadataExtracter.class);
|
||||
|
||||
public static ArrayList<String> SUPPORTED_MIMETYPES;
|
||||
static {
|
||||
SUPPORTED_MIMETYPES = new ArrayList<String>();
|
||||
AutoDetectParser p = new AutoDetectParser();
|
||||
for(MediaType mt : p.getParsers().keySet()) {
|
||||
SUPPORTED_MIMETYPES.add( mt.toString() );
|
||||
}
|
||||
}
|
||||
|
||||
public TikaAutoMetadataExtracter()
|
||||
{
|
||||
super(SUPPORTED_MIMETYPES);
|
||||
}
|
||||
|
||||
/**
|
||||
* Does auto-detection to select the best Tika
|
||||
* Parser.
|
||||
*/
|
||||
@Override
|
||||
protected Parser getParser() {
|
||||
return new AutoDetectParser();
|
||||
}
|
||||
}
|
@@ -0,0 +1,18 @@
|
||||
#
|
||||
# TikaAutoMetadataExtracter - default mapping
|
||||
#
|
||||
# This is used to map from the Tika and standard namespaces
|
||||
# onto your content model. This will be used for any
|
||||
# content for which an explicit extractor isn't defined,
|
||||
# by using Tika's auto-selection facilities.
|
||||
#
|
||||
# author: Nick Burch
|
||||
|
||||
# Namespaces
|
||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
|
||||
# Mappings
|
||||
author=cm:author
|
||||
title=cm:title
|
||||
description=cm:description
|
||||
created=cm:created
|
@@ -0,0 +1,185 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.Serializable;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
|
||||
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.dwg.DWGParser;
|
||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||
import org.apache.tika.parser.mp3.Mp3Parser;
|
||||
import org.apache.tika.parser.odf.OpenDocumentParser;
|
||||
|
||||
|
||||
/**
|
||||
* @see TikaAutoMetadataExtracter
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
private TikaAutoMetadataExtracter extracter;
|
||||
private static final QName TIKA_MIMETYPE_TEST_PROPERTY =
|
||||
QName.createQName("TikaMimeTypeTestProp");
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception
|
||||
{
|
||||
super.setUp();
|
||||
extracter = new TikaAutoMetadataExtracter();
|
||||
extracter.setDictionaryService(dictionaryService);
|
||||
extracter.register();
|
||||
|
||||
// Attach some extra mappings, using the Tika
|
||||
// metadata keys namespace
|
||||
// These will be tested later
|
||||
HashMap<String, Set<QName>> newMap = new HashMap<String, Set<QName>>(
|
||||
extracter.getMapping()
|
||||
);
|
||||
|
||||
Set<QName> tlaSet = new HashSet<QName>();
|
||||
tlaSet.add(TIKA_MIMETYPE_TEST_PROPERTY);
|
||||
newMap.put( Metadata.CONTENT_TYPE, tlaSet );
|
||||
|
||||
extracter.setMapping(newMap);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the same transformer regardless - it is allowed
|
||||
*/
|
||||
protected MetadataExtracter getExtracter()
|
||||
{
|
||||
return extracter;
|
||||
}
|
||||
|
||||
public void testSupports() throws Exception
|
||||
{
|
||||
ArrayList<String> mimeTypes = new ArrayList<String>();
|
||||
for (Parser p : new Parser[] {
|
||||
new OfficeParser(), new OpenDocumentParser(),
|
||||
new Mp3Parser(), new OOXMLParser()
|
||||
}) {
|
||||
Set<MediaType> mts = p.getSupportedTypes(new ParseContext());
|
||||
for (MediaType mt : mts) {
|
||||
mimeTypes.add(mt.toString());
|
||||
}
|
||||
}
|
||||
|
||||
for (String mimetype : mimeTypes)
|
||||
{
|
||||
boolean supports = extracter.isSupported(mimetype);
|
||||
assertTrue("Mimetype should be supported: " + mimetype, supports);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test several different files
|
||||
* Note - doesn't use extractFromMimetype
|
||||
*/
|
||||
public void testSupportedMimetypes() throws Exception
|
||||
{
|
||||
String[] testFiles = new String[] {
|
||||
".doc", ".docx", ".xls", ".xlsx",
|
||||
".ppt", ".pptx",
|
||||
//".vsd", // Not auto-detected properly yet
|
||||
//"2010.dwg", // Not auto-detected properly yet
|
||||
".pdf",
|
||||
".odt"
|
||||
};
|
||||
|
||||
for (String fileBase : testFiles)
|
||||
{
|
||||
String filename = "quick" + fileBase;
|
||||
URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename);
|
||||
File file = new File(url.getFile());
|
||||
|
||||
// Cheat and ask Tika for the mime type!
|
||||
AutoDetectParser ap = new AutoDetectParser();
|
||||
Metadata metadata = new Metadata();
|
||||
metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
|
||||
MediaType mt = ap.getDetector().detect(
|
||||
new BufferedInputStream(new FileInputStream(file)), metadata);
|
||||
String mimetype = mt.toString();
|
||||
|
||||
// Have it processed
|
||||
Map<QName, Serializable> properties = extractFromFile(file, mimetype);
|
||||
|
||||
// check we got something
|
||||
assertFalse("extractFromMimetype should return at least some properties, " +
|
||||
"none found for " + mimetype + " - " + filename,
|
||||
properties.isEmpty());
|
||||
|
||||
// check common metadata
|
||||
testCommonMetadata(mimetype, properties);
|
||||
// check file-type specific metadata
|
||||
testFileSpecificMetadata(mimetype, properties);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean skipAuthorCheck(String mimetype) { return true; }
|
||||
|
||||
/**
|
||||
* We also provide the creation date - check that
|
||||
*/
|
||||
protected void testFileSpecificMetadata(String mimetype,
|
||||
Map<QName, Serializable> properties) {
|
||||
|
||||
// Check for extra fields
|
||||
// Author isn't there for the OpenDocument ones
|
||||
if(mimetype.indexOf(".oasis.") == -1) {
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
|
||||
"Nevin Nollop",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
|
||||
}
|
||||
|
||||
// Ensure that we can also get things which are standard
|
||||
// Tika metadata properties, if we so choose to
|
||||
assertTrue(
|
||||
"Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " not found for mimetype " + mimetype,
|
||||
properties.containsKey(TIKA_MIMETYPE_TEST_PROPERTY)
|
||||
);
|
||||
// TODO - uncomment this when TIKA-391 is properly fixed
|
||||
// assertEquals(
|
||||
// "Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
|
||||
// mimetype,
|
||||
// DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_MIMETYPE_TEST_PROPERTY)));
|
||||
}
|
||||
|
||||
}
|
@@ -35,6 +35,7 @@ import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.sax.BodyContentHandler;
|
||||
@@ -136,14 +137,13 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
|
||||
}
|
||||
|
||||
/**
|
||||
* Does auto-detection to select the best Tika
|
||||
* Parser.
|
||||
* Implementations can override this if they
|
||||
* know their specific implementations.
|
||||
* Returns the correct Tika Parser to process
|
||||
* the document.
|
||||
* If you don't know which you want, use
|
||||
* {@link TikaAutoMetadataExtracter} which
|
||||
* makes use of the Tika auto-detection.
|
||||
*/
|
||||
protected Parser getParser() {
|
||||
return null;
|
||||
}
|
||||
protected abstract Parser getParser();
|
||||
|
||||
/**
|
||||
* Allows implementation specific mappings
|
||||
|
@@ -1,13 +0,0 @@
|
||||
#
|
||||
# TikaPoweredMetadataExtracter - default mapping
|
||||
#
|
||||
# author: Nick Burch
|
||||
|
||||
# Namespaces
|
||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
|
||||
# Mappings
|
||||
author=cm:author
|
||||
title=cm:title
|
||||
description=cm:description
|
||||
created=cm:created
|
Reference in New Issue
Block a user