Tika for metadata extraction

Convert some more metadata extractors to use Tika, and enable the use of 
 the Tika auto-detection parser on any documents without an explicitly
 defined extractor.


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20667 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-06-16 14:09:46 +00:00
parent b08d9ff412
commit 0e19812dbc
11 changed files with 354 additions and 184 deletions

View File

@@ -35,6 +35,7 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
@@ -136,14 +137,13 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
}
/**
* Does auto-detection to select the best Tika
* Parser.
* Implementations can override this if they
* know their specific implementations.
* Returns the correct Tika Parser to process
* the document.
* If you don't know which you want, use
* {@link TikaAutoMetadataExtracter} which
* makes use of the Tika auto-detection.
*/
protected Parser getParser() {
return null;
}
protected abstract Parser getParser();
/**
* Allows implementation specific mappings