diff --git a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java index 32c737dd7c..c8425822b3 100644 --- a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java @@ -27,8 +27,10 @@ import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.Locale; import java.util.Map; +import java.util.Set; import org.alfresco.api.AlfrescoPublicApi; import org.alfresco.repo.content.MimetypeMap; @@ -386,7 +388,7 @@ public abstract class TikaPoweredMetadataExtracter // keys onto their own content model for(String tikaKey : metadata.names()) { - putRawValue(tikaKey, metadata.get(tikaKey), rawProperties); + putRawValue(tikaKey, getMetadataValue(metadata, tikaKey), rawProperties); } // Now, map the common Tika metadata keys onto @@ -395,14 +397,14 @@ public abstract class TikaPoweredMetadataExtracter // to work without needing any changes // The simple ones - putRawValue(KEY_AUTHOR, metadata.get(Metadata.AUTHOR), rawProperties); - putRawValue(KEY_TITLE, metadata.get(Metadata.TITLE), rawProperties); - putRawValue(KEY_COMMENTS, metadata.get(Metadata.COMMENTS), rawProperties); + putRawValue(KEY_AUTHOR, getMetadataValue(metadata, Metadata.AUTHOR), rawProperties); + putRawValue(KEY_TITLE, getMetadataValue(metadata, Metadata.TITLE), rawProperties); + putRawValue(KEY_COMMENTS, getMetadataValue(metadata, Metadata.COMMENTS), rawProperties); // Get the subject and description, despite things not // being nearly as consistent as one might hope - String subject = metadata.get(Metadata.SUBJECT); - String description = metadata.get(Metadata.DESCRIPTION); + String subject = getMetadataValue(metadata, Metadata.SUBJECT); + String description = getMetadataValue(metadata, Metadata.DESCRIPTION); if(subject != null && description != null) { putRawValue(KEY_DESCRIPTION, description, rawProperties); @@ -503,6 +505,31 @@ public abstract class TikaPoweredMetadataExtracter embedder.embed(metadataToEmbed, inputStream, outputStream, null); } + private String getMetadataValue(Metadata metadata, String key) + { + if (metadata.isMultiValued(key)) + { + String[] parts = metadata.getValues(key); + + // use Set to prevent duplicates + Set value = new LinkedHashSet(parts.length); + + for (int i = 0; i < parts.length; i++) + { + value.add(parts[i]); + } + + String valueStr = value.toString(); + + // remove leading/trailing braces [] + return valueStr.substring(1, valueStr.length() - 1); + } + else + { + return metadata.get(key); + } + } + /** * This content handler will capture entries from within * the header of the Tika content XHTML, but ignore the diff --git a/source/test-java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java b/source/test-java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java index 8a4c01b044..9b51e74051 100644 --- a/source/test-java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java +++ b/source/test-java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java @@ -259,7 +259,7 @@ public void testImageVideo() throws Throwable { assertEquals("8 bits", p.get("Data Precision")); // Check regular Tika properties assertEquals(QUICK_TITLE, p.get(Metadata.COMMENT)); - assertEquals("canon-55-250", p.get(Metadata.SUBJECT)); + assertEquals("canon-55-250, moscow-birds, serbor", p.get(Metadata.SUBJECT)); // Check namespace'd Tika properties assertEquals("12.54321", p.get("geo:lat")); assertEquals("-54.1234", p.get("geo:long"));