mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Tika for metadata extraction
First pass of converting a few extractors to use Tika rather than calling third-party libraries directly, or to use the new-style Tika structure. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20640 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -18,20 +18,14 @@
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.dwg.DWGParser;
|
||||
import org.apache.tika.sax.BodyContentHandler;
|
||||
import org.xml.sax.ContentHandler;
|
||||
|
||||
|
||||
/**
|
||||
@@ -47,64 +41,40 @@ import org.xml.sax.ContentHandler;
|
||||
* <b>lastauthor:</b>
|
||||
* </pre>
|
||||
*
|
||||
* TIKA Note - this has been converted to deep-call into Tika.
|
||||
* This will be replaced with proper calls to Tika at a later date.
|
||||
* Uses Apache Tika
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public class DWGMetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
public class DWGMetadataExtracter extends TikaPoweredMetadataExtracter
|
||||
{
|
||||
private static final String KEY_AUTHOR = "author";
|
||||
private static final String KEY_COMMENT = "comment";
|
||||
private static final String KEY_DESCRIPTION = "description";
|
||||
private static final String KEY_KEYWORD = "keyword";
|
||||
private static final String KEY_LAST_AUTHOR = "lastAuthor";
|
||||
private static final String KEY_TITLE = "title";
|
||||
|
||||
public static String[] SUPPORTED_MIMETYPES = new String[] {
|
||||
MimetypeMap.MIMETYPE_APP_DWG,
|
||||
MimetypeMap.MIMETYPE_IMG_DWG,
|
||||
"image/x-dwg", // Was used before IANA registration
|
||||
};
|
||||
|
||||
public static ArrayList<String> SUPPORTED_MIMETYPES = buildSupportedMimetypes(
|
||||
new String[] {
|
||||
MimetypeMap.MIMETYPE_APP_DWG,
|
||||
MimetypeMap.MIMETYPE_IMG_DWG,
|
||||
"image/x-dwg", // Was used before IANA registration
|
||||
},
|
||||
new DWGParser()
|
||||
);
|
||||
|
||||
public DWGMetadataExtracter()
|
||||
{
|
||||
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
|
||||
super(SUPPORTED_MIMETYPES);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties) {
|
||||
putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), properties);
|
||||
putRawValue(KEY_LAST_AUTHOR, metadata.get(Metadata.LAST_AUTHOR), properties);
|
||||
System.err.println(properties);
|
||||
return properties;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
|
||||
{
|
||||
Map<String, Serializable> rawProperties = newRawMap();
|
||||
|
||||
InputStream is = null;
|
||||
try
|
||||
{
|
||||
is = reader.getContentInputStream();
|
||||
|
||||
DWGParser dwgParser = new DWGParser();
|
||||
ContentHandler handler = new BodyContentHandler() ;
|
||||
Metadata metadata = new Metadata();
|
||||
ParseContext context = new ParseContext();
|
||||
|
||||
dwgParser.parse(is, handler, metadata, context);
|
||||
|
||||
putRawValue(KEY_AUTHOR, metadata.get(Metadata.AUTHOR), rawProperties);
|
||||
putRawValue(KEY_COMMENT, metadata.get(Metadata.COMMENTS), rawProperties);
|
||||
putRawValue(KEY_DESCRIPTION, metadata.get(Metadata.DESCRIPTION), rawProperties);
|
||||
putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), rawProperties);
|
||||
putRawValue(KEY_LAST_AUTHOR, metadata.get(Metadata.LAST_AUTHOR), rawProperties);
|
||||
putRawValue(KEY_DESCRIPTION, metadata.get(Metadata.SUBJECT), rawProperties);
|
||||
putRawValue(KEY_TITLE, metadata.get(Metadata.TITLE), rawProperties);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (is != null)
|
||||
{
|
||||
try { is.close(); } catch (IOException e) {}
|
||||
}
|
||||
}
|
||||
// Done
|
||||
return rawProperties;
|
||||
protected Parser getParser() {
|
||||
return new DWGParser();
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user