Make all Tika metadata properties available, as well as existing specific ones

Following discussions with Neil, make all the Tika-supplied properties available after extraction, in case users wish to map them onto their content model in a standard way. The per-extractor specific names are still retained as well.


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20649 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-06-15 12:22:47 +00:00
parent 63b2f5983a
commit b08d9ff412
3 changed files with 52 additions and 2 deletions

View File

@@ -69,7 +69,6 @@ public class DWGMetadataExtracter extends TikaPoweredMetadataExtracter
Map<String, Serializable> properties) { Map<String, Serializable> properties) {
putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), properties); putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), properties);
putRawValue(KEY_LAST_AUTHOR, metadata.get(Metadata.LAST_AUTHOR), properties); putRawValue(KEY_LAST_AUTHOR, metadata.get(Metadata.LAST_AUTHOR), properties);
System.err.println(properties);
return properties; return properties;
} }

View File

@@ -21,13 +21,17 @@ package org.alfresco.repo.content.metadata;
import java.io.File; import java.io.File;
import java.io.Serializable; import java.io.Serializable;
import java.net.URL; import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map; import java.util.Map;
import java.util.Set;
import org.alfresco.model.ContentModel; import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap; import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.transform.AbstractContentTransformerTest; import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.namespace.QName; import org.alfresco.service.namespace.QName;
import org.apache.tika.metadata.Metadata;
/** /**
@@ -38,6 +42,8 @@ import org.alfresco.service.namespace.QName;
public class DWGMetadataExtracterTest extends AbstractMetadataExtracterTest public class DWGMetadataExtracterTest extends AbstractMetadataExtracterTest
{ {
private DWGMetadataExtracter extracter; private DWGMetadataExtracter extracter;
private static final QName TIKA_LAST_AUTHOR_TEST_PROPERTY =
QName.createQName("TikaLastAuthorTestProp");
@Override @Override
public void setUp() throws Exception public void setUp() throws Exception
@@ -46,6 +52,19 @@ public class DWGMetadataExtracterTest extends AbstractMetadataExtracterTest
extracter = new DWGMetadataExtracter(); extracter = new DWGMetadataExtracter();
extracter.setDictionaryService(dictionaryService); extracter.setDictionaryService(dictionaryService);
extracter.register(); extracter.register();
// Attach some extra mappings, using the Tika
// metadata keys namespace
// These will be tested later
HashMap<String, Set<QName>> newMap = new HashMap<String, Set<QName>>(
extracter.getMapping()
);
Set<QName> tlaSet = new HashSet<QName>();
tlaSet.add(TIKA_LAST_AUTHOR_TEST_PROPERTY);
newMap.put( Metadata.LAST_AUTHOR, tlaSet );
extracter.setMapping(newMap);
} }
/** /**
@@ -100,11 +119,23 @@ public class DWGMetadataExtracterTest extends AbstractMetadataExtracterTest
*/ */
protected void testFileSpecificMetadata(String mimetype, protected void testFileSpecificMetadata(String mimetype,
Map<QName, Serializable> properties) { Map<QName, Serializable> properties) {
// Check for extra fields // Check for extra fields
assertEquals( assertEquals(
"Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype, "Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
"Nevin Nollop", "Nevin Nollop",
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR))); DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
// Ensure that we can also get things which are standard
// Tika metadata properties, if we so choose to
assertTrue(
"Test Property " + TIKA_LAST_AUTHOR_TEST_PROPERTY + " not found for mimetype " + mimetype,
properties.containsKey(TIKA_LAST_AUTHOR_TEST_PROPERTY)
);
assertEquals(
"Test Property " + TIKA_LAST_AUTHOR_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
"paolon",
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_LAST_AUTHOR_TEST_PROPERTY)));
} }
} }

View File

@@ -117,6 +117,7 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
* Version which also tries the ISO-8601 formats (in order..), * Version which also tries the ISO-8601 formats (in order..),
* and similar formats, which Tika makes use of * and similar formats, which Tika makes use of
*/ */
@Override
protected Date makeDate(String dateStr) { protected Date makeDate(String dateStr) {
// Try our formats first, in order // Try our formats first, in order
for(DateFormat df : this.tikaDateFormats) { for(DateFormat df : this.tikaDateFormats) {
@@ -168,11 +169,25 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
parser.parse(is, handler, metadata, context); parser.parse(is, handler, metadata, context);
// First up, copy all the Tika metadata over
// This allows people to map any of the Tika
// keys onto their own content model
for(String tikaKey : metadata.names()) {
putRawValue(tikaKey, metadata.get(tikaKey), rawProperties);
}
// Now, map the common Tika metadata keys onto
// the common Alfresco metadata keys. This allows
// existing mapping properties files to continue
// to work without needing any changes
// The simple ones
putRawValue(KEY_AUTHOR, metadata.get(Metadata.AUTHOR), rawProperties); putRawValue(KEY_AUTHOR, metadata.get(Metadata.AUTHOR), rawProperties);
putRawValue(KEY_TITLE, metadata.get(Metadata.TITLE), rawProperties); putRawValue(KEY_TITLE, metadata.get(Metadata.TITLE), rawProperties);
putRawValue(KEY_COMMENTS, metadata.get(Metadata.COMMENTS), rawProperties); putRawValue(KEY_COMMENTS, metadata.get(Metadata.COMMENTS), rawProperties);
// Not everything is as consisent about these two as you might hope // Get the subject and description, despite things not
// being nearly as consistent as one might hope
String subject = metadata.get(Metadata.SUBJECT); String subject = metadata.get(Metadata.SUBJECT);
String description = metadata.get(Metadata.DESCRIPTION); String description = metadata.get(Metadata.DESCRIPTION);
if(subject != null && description != null) { if(subject != null && description != null) {
@@ -193,6 +208,11 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
putRawValue(KEY_CREATED, metadata.get(Metadata.DATE), rawProperties); putRawValue(KEY_CREATED, metadata.get(Metadata.DATE), rawProperties);
} }
// If people created a specific instance
// (eg OfficeMetadataExtractor), then allow that
// instance to map the Tika keys onto its
// existing namespace so that older properties
// files continue to map correctly
rawProperties = extractSpecific(metadata, rawProperties); rawProperties = extractSpecific(metadata, rawProperties);
} }
finally finally