mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Tika for metadata extraction
Convert some more metadata extractors to using Tika, and enable the use of the Tika auto-detection parser on any documents without an explicitly defined extractor. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20667 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -29,6 +29,9 @@ import java.util.Map;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.apache.poi.hsmf.MAPIMessage;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.Parser;
|
||||
//import org.apache.tika.parser.microsoft.OutlookExtractor; // TODO fix import
|
||||
|
||||
/**
|
||||
* Outlook MAPI format email meta-data extractor extracting the following values:
|
||||
@@ -63,10 +66,24 @@ public class MailMetadataExtracter extends TikaPoweredMetadataExtracter
|
||||
{
|
||||
super(SUPPORTED_MIMETYPES);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Parser getParser() {
|
||||
//return new OutlookExtractor(); // TODO fix import
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties) {
|
||||
// TODO move things from extractRaw to here
|
||||
return properties;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
|
||||
{
|
||||
// TODO remove this in favour of extractSpecific
|
||||
final Map<String, Serializable> rawProperties = newRawMap();
|
||||
|
||||
InputStream is = null;
|
||||
|
Reference in New Issue
Block a user