mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-10-08 14:51:49 +00:00
125603 rmunteanu: Merged 5.1.1 (5.1.1) to 5.1.N (5.1.2) 125484 slanglois: MNT-16155 Update source headers - remove old Copyrights from Java and JSP dource files git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/BRANCHES/DEV/5.2.N/root@125781 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
102 lines
3.2 KiB
Java
102 lines
3.2 KiB
Java
package org.alfresco.repo.content.metadata;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.HashSet;
|
|
|
|
import org.alfresco.api.AlfrescoPublicApi;
|
|
import org.alfresco.error.AlfrescoRuntimeException;
|
|
import org.apache.commons.logging.Log;
|
|
import org.apache.commons.logging.LogFactory;
|
|
import org.apache.tika.mime.MediaType;
|
|
import org.apache.tika.parser.ParseContext;
|
|
import org.apache.tika.parser.Parser;
|
|
|
|
/**
|
|
* A Metadata Extractor which makes use of Apache Tika,
|
|
* and allows the selection of the Tika parser to be
|
|
* sprung-in to extract the metadata from your document.
|
|
* This is typically used with custom Tika Parsers.
|
|
|
|
* <pre>
|
|
* <b>author:</b> -- cm:author
|
|
* <b>title:</b> -- cm:title
|
|
* <b>subject:</b> -- cm:description
|
|
* <b>created:</b> -- cm:created
|
|
* <b>comments:</b>
|
|
* <p>geo:lat:</b> -- cm:latitude
|
|
* <p>geo:long:</b> -- cm:longitude
|
|
* </pre>
|
|
*
|
|
* @since 3.4
|
|
* @author Nick Burch
|
|
*/
|
|
@AlfrescoPublicApi
|
|
public class TikaSpringConfiguredMetadataExtracter extends TikaPoweredMetadataExtracter
|
|
{
|
|
protected static Log logger = LogFactory.getLog(TikaSpringConfiguredMetadataExtracter.class);
|
|
|
|
private Parser tikaParser;
|
|
private String tikaParserClassName;
|
|
private Class<? extends Parser> tikaParserClass;
|
|
|
|
/**
|
|
* Injects the name of the Tika parser to use
|
|
* @param className
|
|
*/
|
|
@SuppressWarnings("unchecked")
|
|
public void setTikaParserName(String className)
|
|
{
|
|
tikaParserClassName = className;
|
|
|
|
// Load the class
|
|
try {
|
|
tikaParserClass = (Class<? extends Parser>)Class.forName(tikaParserClassName);
|
|
setTikaParser(getParser());
|
|
} catch(ClassNotFoundException e) {
|
|
throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' not found");
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Injects the Tika parser to use
|
|
* @param tikaParser
|
|
*/
|
|
public void setTikaParser(Parser tikaParser)
|
|
{
|
|
this.tikaParser = tikaParser;
|
|
|
|
// Build the mime types, updating the copy our parent
|
|
// holds for us as we go along
|
|
ArrayList<String> mimetypes = new ArrayList<String>();
|
|
for(MediaType mt : tikaParser.getSupportedTypes(new ParseContext()))
|
|
{
|
|
mimetypes.add( mt.toString() );
|
|
}
|
|
super.setSupportedMimetypes(mimetypes);
|
|
}
|
|
|
|
public TikaSpringConfiguredMetadataExtracter()
|
|
{
|
|
super(new HashSet<String>());
|
|
}
|
|
|
|
/**
|
|
* Returns the Tika parser
|
|
*/
|
|
protected Parser getParser()
|
|
{
|
|
// If we were given a whole parser, return it
|
|
if(tikaParser != null)
|
|
return tikaParser;
|
|
|
|
// Otherwise create a new one
|
|
try {
|
|
return tikaParserClass.newInstance();
|
|
} catch (InstantiationException e) {
|
|
throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
|
|
} catch (IllegalAccessException e) {
|
|
throw new AlfrescoRuntimeException("Unable to create specified Parser", e);
|
|
}
|
|
}
|
|
}
|