Tika for metadata extraction

Convert some more metadata extractors to use Tika, and enable the
 Tika auto-detection parser for any document that has no explicitly
 defined extractor.


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20667 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-06-16 14:09:46 +00:00
parent b08d9ff412
commit 0e19812dbc
11 changed files with 354 additions and 184 deletions

View File

@@ -29,6 +29,9 @@ import java.util.Map;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.apache.poi.hsmf.MAPIMessage;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
//import org.apache.tika.parser.microsoft.OutlookExtractor; // TODO fix import
/**
* Outlook MAPI format email meta-data extractor extracting the following values:
@@ -63,10 +66,24 @@ public class MailMetadataExtracter extends TikaPoweredMetadataExtracter
{
super(SUPPORTED_MIMETYPES);
}
/**
 * Supplies the Tika parser for this extracter.
 * <p>
 * This is currently a placeholder: the <code>OutlookExtractor</code> import
 * is still disabled in this file, so no parser instance can be created here.
 *
 * @return always <code>null</code> for now
 */
@Override
protected Parser getParser()
{
    // TODO fix the OutlookExtractor import, then return a new OutlookExtractor here
    return null;
}
/**
 * Hook for mapping mail-specific metadata into the properties map.
 * <p>
 * This is currently a pass-through: <code>metadata</code> is not read and
 * the supplied map is handed back untouched.
 *
 * @param metadata   the Tika metadata (not used yet)
 * @param properties the properties map collected so far
 * @return the same <code>properties</code> instance that was passed in
 */
@Override
protected Map<String, Serializable> extractSpecific(
        Metadata metadata,
        Map<String, Serializable> properties)
{
    // TODO move things from extractRaw to here
    return properties;
}
@Override
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
{
// TODO remove this in favour of extractSpecific
final Map<String, Serializable> rawProperties = newRawMap();
InputStream is = null;