mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Make all Tika metadata properties available, as well as existing specific ones
Following discussions with Neil, make all the Tika-supplied properties available after the extraction, in case users wish to map them in a standard way onto their content model. Per-extractor specific names are still retained too. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20649 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -69,7 +69,6 @@ public class DWGMetadataExtracter extends TikaPoweredMetadataExtracter
|
||||
Map<String, Serializable> properties) {
|
||||
putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), properties);
|
||||
putRawValue(KEY_LAST_AUTHOR, metadata.get(Metadata.LAST_AUTHOR), properties);
|
||||
System.err.println(properties);
|
||||
return properties;
|
||||
}
|
||||
|
||||
|
@@ -21,13 +21,17 @@ package org.alfresco.repo.content.metadata;
|
||||
import java.io.File;
|
||||
import java.io.Serializable;
|
||||
import java.net.URL;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
|
||||
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
|
||||
|
||||
/**
|
||||
@@ -38,6 +42,8 @@ import org.alfresco.service.namespace.QName;
|
||||
public class DWGMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
private DWGMetadataExtracter extracter;
|
||||
private static final QName TIKA_LAST_AUTHOR_TEST_PROPERTY =
|
||||
QName.createQName("TikaLastAuthorTestProp");
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception
|
||||
@@ -46,6 +52,19 @@ public class DWGMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
extracter = new DWGMetadataExtracter();
|
||||
extracter.setDictionaryService(dictionaryService);
|
||||
extracter.register();
|
||||
|
||||
// Attach some extra mappings, using the Tika
|
||||
// metadata keys namespace
|
||||
// These will be tested later
|
||||
HashMap<String, Set<QName>> newMap = new HashMap<String, Set<QName>>(
|
||||
extracter.getMapping()
|
||||
);
|
||||
|
||||
Set<QName> tlaSet = new HashSet<QName>();
|
||||
tlaSet.add(TIKA_LAST_AUTHOR_TEST_PROPERTY);
|
||||
newMap.put( Metadata.LAST_AUTHOR, tlaSet );
|
||||
|
||||
extracter.setMapping(newMap);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -100,11 +119,23 @@ public class DWGMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
*/
|
||||
protected void testFileSpecificMetadata(String mimetype,
|
||||
Map<QName, Serializable> properties) {
|
||||
|
||||
// Check for extra fields
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
|
||||
"Nevin Nollop",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
|
||||
|
||||
// Ensure that we can also get things which are standard
|
||||
// Tika metadata properties, if we so choose to
|
||||
assertTrue(
|
||||
"Test Property " + TIKA_LAST_AUTHOR_TEST_PROPERTY + " not found for mimetype " + mimetype,
|
||||
properties.containsKey(TIKA_LAST_AUTHOR_TEST_PROPERTY)
|
||||
);
|
||||
assertEquals(
|
||||
"Test Property " + TIKA_LAST_AUTHOR_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
|
||||
"paolon",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_LAST_AUTHOR_TEST_PROPERTY)));
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -117,6 +117,7 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
|
||||
* Version which also tries the ISO-8601 formats (in order..),
|
||||
* and similar formats, which Tika makes use of
|
||||
*/
|
||||
@Override
|
||||
protected Date makeDate(String dateStr) {
|
||||
// Try our formats first, in order
|
||||
for(DateFormat df : this.tikaDateFormats) {
|
||||
@@ -168,11 +169,25 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
|
||||
|
||||
parser.parse(is, handler, metadata, context);
|
||||
|
||||
// First up, copy all the Tika metadata over
|
||||
// This allows people to map any of the Tika
|
||||
// keys onto their own content model
|
||||
for(String tikaKey : metadata.names()) {
|
||||
putRawValue(tikaKey, metadata.get(tikaKey), rawProperties);
|
||||
}
|
||||
|
||||
// Now, map the common Tika metadata keys onto
|
||||
// the common Alfresco metadata keys. This allows
|
||||
// existing mapping properties files to continue
|
||||
// to work without needing any changes
|
||||
|
||||
// The simple ones
|
||||
putRawValue(KEY_AUTHOR, metadata.get(Metadata.AUTHOR), rawProperties);
|
||||
putRawValue(KEY_TITLE, metadata.get(Metadata.TITLE), rawProperties);
|
||||
putRawValue(KEY_COMMENTS, metadata.get(Metadata.COMMENTS), rawProperties);
|
||||
|
||||
// Not everything is as consistent about these two as you might hope
|
||||
// Get the subject and description, despite things not
|
||||
// being nearly as consistent as one might hope
|
||||
String subject = metadata.get(Metadata.SUBJECT);
|
||||
String description = metadata.get(Metadata.DESCRIPTION);
|
||||
if(subject != null && description != null) {
|
||||
@@ -193,6 +208,11 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
|
||||
putRawValue(KEY_CREATED, metadata.get(Metadata.DATE), rawProperties);
|
||||
}
|
||||
|
||||
// If people created a specific instance
|
||||
// (eg OfficeMetadataExtractor), then allow that
|
||||
// instance to map the Tika keys onto its
|
||||
// existing namespace so that older properties
|
||||
// files continue to map correctly
|
||||
rawProperties = extractSpecific(metadata, rawProperties);
|
||||
}
|
||||
finally
|
||||
|
Reference in New Issue
Block a user