diff --git a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java
index 15b8745ba2..87474ca644 100644
--- a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2005-2014 Alfresco Software Limited.
+ * Copyright (C) 2005-2016 Alfresco Software Limited.
  *
  * This file is part of Alfresco
  *
@@ -23,6 +23,7 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.Serializable;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.Date;
 import java.util.HashMap;
@@ -97,6 +98,7 @@ public abstract class TikaPoweredMetadataExtracter
     protected static final String KEY_CREATED = "created";
     protected static final String KEY_DESCRIPTION = "description";
     protected static final String KEY_COMMENTS = "comments";
+    protected static final String KEY_TAGS = "dc:subject";
 
     private DateTimeFormatter tikaUTCDateFormater;
     private DateTimeFormatter tikaDateFormater;
@@ -104,6 +106,18 @@ public abstract class TikaPoweredMetadataExtracter
 
     private String extractorContext = null;
 
+    private String metadataSeparator = ","; // Default separator.
+
+    public String getMetadataSeparator()
+    {
+        return metadataSeparator;
+    }
+
+    public void setMetadataSeparator(String metadataSeparator)
+    {
+        this.metadataSeparator = metadataSeparator;
+    }
+
     /**
      * Builds up a list of supported mime types by merging
      * an explicit list with any that Tika also claims to support
@@ -400,7 +414,10 @@ public abstract class TikaPoweredMetadataExtracter
         putRawValue(KEY_AUTHOR, getMetadataValue(metadata, Metadata.AUTHOR), rawProperties);
         putRawValue(KEY_TITLE, getMetadataValue(metadata, Metadata.TITLE), rawProperties);
         putRawValue(KEY_COMMENTS, getMetadataValue(metadata, Metadata.COMMENTS), rawProperties);
-        
+
+        // Tags
+        putRawValue(KEY_TAGS, getMetadataValues(metadata, KEY_TAGS), rawProperties);
+
         // Get the subject and description, despite things not
         // being nearly as consistent as one might hope
         String subject = getMetadataValue(metadata, Metadata.SUBJECT);
@@ -504,6 +521,50 @@ public abstract class TikaPoweredMetadataExtracter
         OutputStream outputStream = writer.getContentOutputStream();
         embedder.embed(metadataToEmbed, inputStream, outputStream, null);
     }
+
+    /**
+     * Collects every value held under the given key, splits each one on the
+     * configured metadata separator and trims the resulting parts. Blank parts
+     * and duplicates are dropped; first-seen order is kept.
+     *
+     * @param metadata the Tika metadata to read from
+     * @param key the metadata key whose values are wanted
+     * @return {@code null} when nothing was found, the lone {@code String}
+     *         when there is exactly one value, otherwise a {@code String[]}
+     */
+    private Serializable getMetadataValues(Metadata metadata, String key)
+    {
+        // Use Set to prevent duplicates.
+        Set<String> valuesSet = new LinkedHashSet<String>();
+        String[] values = metadata.getValues(key);
+
+        // The separator is a literal delimiter, but String.split() takes a regular
+        // expression, so quote it; otherwise "|" or "." would not split as intended.
+        // With no separator configured every value is kept whole.
+        boolean splitValues = metadataSeparator != null && !metadataSeparator.isEmpty();
+        String separatorRegex = splitValues ? java.util.regex.Pattern.quote(metadataSeparator) : null;
+
+        for (int i = 0; i < values.length; i++)
+        {
+            String[] parts = splitValues ? values[i].split(separatorRegex) : new String[] { values[i] };
+
+            for (String subPart : parts)
+            {
+                String trimmed = subPart.trim();
+
+                // Skip the empty parts left behind by inputs such as "a,,b" or ", a".
+                if (!trimmed.isEmpty())
+                {
+                    valuesSet.add(trimmed);
+                }
+            }
+        }
+
+        Object[] objArrayValues = valuesSet.toArray();
+        values = Arrays.copyOf(objArrayValues, objArrayValues.length, String[].class);
+
+        return values.length == 0 ? null : (values.length == 1 ? values[0] : values);
+    }
 
     private String getMetadataValue(Metadata metadata, String key)
     {
diff --git a/source/test-java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java b/source/test-java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java
index 269016fbd5..1643418c93 100644
--- a/source/test-java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java
+++ b/source/test-java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java
@@ -22,6 +22,7 @@ import java.io.File;
 import java.io.Serializable;
 import java.net.URL;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
@@ -274,6 +275,7 @@ public void testImageVideo() throws Throwable {
         // Check regular Tika properties
         assertEquals(QUICK_TITLE, p.get(Metadata.COMMENT));
         assertEquals("canon-55-250, moscow-birds, serbor", p.get(Metadata.SUBJECT));
+        assertTrue(Arrays.equals(new String[] { "canon-55-250", "moscow-birds", "serbor" }, (String[]) p.get("dc:subject")));
         // Check namespace'd Tika properties
         assertEquals("12.54321", p.get("geo:lat"));
         assertEquals("-54.1234", p.get("geo:long"));