From b08d9ff4125907fc012029aa5f0df3226cbfe244 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Tue, 15 Jun 2010 12:22:47 +0000 Subject: [PATCH] Make all Tika metadata properties available, as well as existing specific ones Following discussions with Neil, make all the Tika supplied properties available after the extraction, in case users wish to map them in a standard way onto their content model. Per-extractor specific names are still retained too git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20649 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261 --- .../metadata/DWGMetadataExtracter.java | 1 - .../metadata/DWGMetadataExtracterTest.java | 31 +++++++++++++++++++ .../TikaPoweredMetadataExtracter.java | 22 ++++++++++++- 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/source/java/org/alfresco/repo/content/metadata/DWGMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/DWGMetadataExtracter.java index 77cabf33a0..82d3205129 100644 --- a/source/java/org/alfresco/repo/content/metadata/DWGMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/DWGMetadataExtracter.java @@ -69,7 +69,6 @@ public class DWGMetadataExtracter extends TikaPoweredMetadataExtracter Map properties) { putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), properties); putRawValue(KEY_LAST_AUTHOR, metadata.get(Metadata.LAST_AUTHOR), properties); - System.err.println(properties); return properties; } diff --git a/source/java/org/alfresco/repo/content/metadata/DWGMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/DWGMetadataExtracterTest.java index e505a1fd4b..28bf031cd6 100644 --- a/source/java/org/alfresco/repo/content/metadata/DWGMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/DWGMetadataExtracterTest.java @@ -21,13 +21,17 @@ package org.alfresco.repo.content.metadata; import java.io.File; import java.io.Serializable; import java.net.URL; +import java.util.HashMap; 
+import java.util.HashSet; import java.util.Map; +import java.util.Set; import org.alfresco.model.ContentModel; import org.alfresco.repo.content.MimetypeMap; import org.alfresco.repo.content.transform.AbstractContentTransformerTest; import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; import org.alfresco.service.namespace.QName; +import org.apache.tika.metadata.Metadata; /** @@ -38,6 +42,8 @@ import org.alfresco.service.namespace.QName; public class DWGMetadataExtracterTest extends AbstractMetadataExtracterTest { private DWGMetadataExtracter extracter; + private static final QName TIKA_LAST_AUTHOR_TEST_PROPERTY = + QName.createQName("TikaLastAuthorTestProp"); @Override public void setUp() throws Exception @@ -46,6 +52,19 @@ public class DWGMetadataExtracterTest extends AbstractMetadataExtracterTest extracter = new DWGMetadataExtracter(); extracter.setDictionaryService(dictionaryService); extracter.register(); + + // Attach some extra mappings, using the Tika + // metadata keys namespace + // These will be tested later + HashMap<String, Set<QName>> newMap = new HashMap<String, Set<QName>>( + extracter.getMapping() + ); + + Set<QName> tlaSet = new HashSet<QName>(); + tlaSet.add(TIKA_LAST_AUTHOR_TEST_PROPERTY); + newMap.put( Metadata.LAST_AUTHOR, tlaSet ); + + extracter.setMapping(newMap); } /** @@ -100,11 +119,23 @@ public class DWGMetadataExtracterTest extends AbstractMetadataExtracterTest */ protected void testFileSpecificMetadata(String mimetype, Map properties) { + // Check for extra fields assertEquals( "Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype, "Nevin Nollop", DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR))); + + // Ensure that we can also get things which are standard + // Tika metadata properties, if we so choose to + assertTrue( + "Test 
Property " + TIKA_LAST_AUTHOR_TEST_PROPERTY + " incorrect for mimetype " + mimetype, + "paolon", + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_LAST_AUTHOR_TEST_PROPERTY))); } } diff --git a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java index d6d411611e..04b805ebc6 100644 --- a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java @@ -117,6 +117,7 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada * Version which also tries the ISO-8601 formats (in order..), * and similar formats, which Tika makes use of */ + @Override protected Date makeDate(String dateStr) { // Try our formats first, in order for(DateFormat df : this.tikaDateFormats) { @@ -168,11 +169,25 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada parser.parse(is, handler, metadata, context); + // First up, copy all the Tika metadata over + // This allows people to map any of the Tika + // keys onto their own content model + for(String tikaKey : metadata.names()) { + putRawValue(tikaKey, metadata.get(tikaKey), rawProperties); + } + + // Now, map the common Tika metadata keys onto + // the common Alfresco metadata keys. 
This allows + // existing mapping properties files to continue + // to work without needing any changes + + // The simple ones putRawValue(KEY_AUTHOR, metadata.get(Metadata.AUTHOR), rawProperties); putRawValue(KEY_TITLE, metadata.get(Metadata.TITLE), rawProperties); putRawValue(KEY_COMMENTS, metadata.get(Metadata.COMMENTS), rawProperties); - // Not everything is as consisent about these two as you might hope + // Get the subject and description, despite things not + // being nearly as consistent as one might hope String subject = metadata.get(Metadata.SUBJECT); String description = metadata.get(Metadata.DESCRIPTION); if(subject != null && description != null) { @@ -193,6 +208,11 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada putRawValue(KEY_CREATED, metadata.get(Metadata.DATE), rawProperties); } + // If people created a specific instance + // (eg OfficeMetadataExtractor), then allow that + // instance to map the Tika keys onto its + // existing namespace so that older properties + // files continue to map correctly rawProperties = extractSpecific(metadata, rawProperties); } finally