REPO-1986: Upload Failing due to Metadata Extraction Issue (MNT-17436) - part 2

- part 2 - enable "addTags" to handle a configurable list of separators (when using the "enableStringTagging" option of the "extract-metadata" action)
- initial default separators/delimiters => comma, semi-colon & vertical bar (pipe)
- also means we can re-enable & fix ContentMetadataExtracterTagMappingTest

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/BRANCHES/DEV/5.2.N/root@135061 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
2025-10-08 14:51:49 +00:00 · 2017-02-09 16:04:00 +00:00
parent a6d88afc6e
commit 483d79a548
4 changed files with 171 additions and 52 deletions
--- a/config/quick/quickIPTC3.jpg
+++ b/config/quick/quickIPTC3.jpg
--- a/source/java/org/alfresco/repo/action/executer/ContentMetadataExtracter.java
+++ b/source/java/org/alfresco/repo/action/executer/ContentMetadataExtracter.java
@@ -45,6 +45,7 @@ package org.alfresco.repo.action.executer;

 import java.io.Serializable;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -94,8 +95,13 @@ public class ContentMetadataExtracter extends ActionExecuterAbstractBase
    private TaggingService taggingService;
    private MetadataExtracterRegistry metadataExtracterRegistry;
    private boolean carryAspectProperties = true;
+    
+    
    private boolean enableStringTagging = false;
    
+    // Default list of separators, each a regular expression, hence the escaped pipe (when enableStringTagging is enabled)
+    protected List<String> stringTaggingSeparators = Arrays.asList(",", ";", "\\|");
+    
    public ContentMetadataExtracter()
    {
    }
@@ -164,6 +170,16 @@ public class ContentMetadataExtracter extends ActionExecuterAbstractBase
        this.enableStringTagging = enableStringTagging;
    }

+    /**
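+     * List of string separators - note: all will be applied to a given string. Each entry is used as a
+     * regular expression (see {@link String#split(String)}), so regex metacharacters must be escaped, e.g. "\\|".
+     * @param stringTaggingSeparators the separator regular expressions, or null to leave tag strings unsplit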
+     */
+    public void setStringTaggingSeparators(List<String> stringTaggingSeparators)
+    {
+        this.stringTaggingSeparators = stringTaggingSeparators;
+    }
+
    /**
     * Iterates the values of the taggable property which the metadata
     * extractor should have already attempted to convert values to {@link NodeRef}s.
@@ -182,11 +198,12 @@ public class ContentMetadataExtracter extends ActionExecuterAbstractBase
    protected void addTags(NodeRef actionedUponNodeRef, PropertyDefinition propertyDef, Serializable rawValue)
    {
        List<String> tags = new ArrayList<String>();
+
        if (logger.isDebugEnabled())
        {
-            logger.debug("converting " + rawValue.toString() + " of type " + 
-                    rawValue.getClass().getCanonicalName() + " to tags");
+            logger.debug("converting " + rawValue.toString() + " of type " + rawValue.getClass().getCanonicalName() + " to tags");
        }
+
        if (rawValue instanceof Collection<?>)
        {
            for (Object singleValue : (Collection<?>) rawValue)
@@ -201,16 +218,15 @@ public class ContentMetadataExtracter extends ActionExecuterAbstractBase
                                (String) singleValue);
                        try
                        {
-                            String tagName = (String) nodeService.getProperty((NodeRef) convertedPropertyValue, ContentModel.PROP_NAME);
+                            NodeRef nodeRef = (NodeRef) convertedPropertyValue;
+                            String tagName = (String) nodeService.getProperty(nodeRef, ContentModel.PROP_NAME);
+
                            if (logger.isTraceEnabled())
                            {
-                                logger.trace("found tag '" + tagName + "' from tag nodeRef '" + (String) singleValue + "', " +
-                                        "adding to " + actionedUponNodeRef.toString());
-                            }
-                            if (tagName != null && !tagName.equals(""))
-                            {
-                                tags.add(tagName);
+                                logger.trace("adding string tag name'" + tagName + "' (from tag nodeRef "+nodeRef+") to " + actionedUponNodeRef);
                            }
+
+                            tags.addAll(splitTag(tagName));
                        }
                        catch (InvalidNodeRefException e)
                        {
@@ -223,17 +239,26 @@ public class ContentMetadataExtracter extends ActionExecuterAbstractBase
                    else
                    {
                        // Must be a simple string
+
                        if (logger.isTraceEnabled())
                        {
-                            logger.trace("adding string tag '" + (String) singleValue + "' to " + actionedUponNodeRef.toString());
+                            logger.trace("adding string tag name'" + singleValue + "' to " + actionedUponNodeRef);
                        }
-                        tags.add((String) singleValue);
+
+                        tags.addAll(splitTag((String)singleValue));
                    }
                }
                else if (singleValue instanceof NodeRef)
                {
-                    String tagName = (String) nodeService.getProperty((NodeRef) singleValue, ContentModel.PROP_NAME);
-                    tags.add(tagName);
+                    NodeRef nodeRef = (NodeRef)singleValue;
+                    String tagName = (String) nodeService.getProperty(nodeRef, ContentModel.PROP_NAME);
+
+                    if (logger.isTraceEnabled())
+                    {
+                        logger.trace("adding string tag name'" + tagName + "' (for nodeRef "+nodeRef+") to " + actionedUponNodeRef);
+                    }
+
+                    tags.addAll(splitTag(tagName));
                }
            }
        }
@@ -241,9 +266,15 @@ public class ContentMetadataExtracter extends ActionExecuterAbstractBase
        {
            if (logger.isTraceEnabled())
            {
-                logger.trace("adding tag '" + (String) rawValue + "' to " + actionedUponNodeRef.toString());
+                logger.trace("adding string tag name'" + (String)rawValue + "' to " + actionedUponNodeRef);
            }
-            tags.add((String) rawValue);
+            
+            tags.addAll(splitTag((String)rawValue));
+        }
+
+        if (logger.isDebugEnabled())
+        {
+            logger.debug("adding tags '" + tags + "' to " + actionedUponNodeRef.toString());
        }

        try
@@ -260,6 +291,34 @@ public class ContentMetadataExtracter extends ActionExecuterAbstractBase
        }
    }

+    protected List<String> splitTag(String str)
+    {
+        List<String> result = new ArrayList<>();
+        if ((str != null) && (!str.equals("")))
+        {
+            result.add(str.trim());
+
+            if (stringTaggingSeparators != null)
+            {
+                for (String sep : stringTaggingSeparators)
+                {
+                    List<String> splitTags = new ArrayList<>(result.size());
+                    for (String tag : result)
+                    {
+                        String[] parts = tag.split(sep);
+                        for (String part : parts)
+                        {
+                            splitTags.add(part.trim());
+                        }
+                    }
+                    result = splitTags;
+                }
+            }
+        }
+
+        return result;
+    }
+    
    /**
     * @see org.alfresco.repo.action.executer.ActionExecuter#execute(Action,
     *      NodeRef)
--- a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java
@@ -413,6 +413,7 @@ public abstract class TikaPoweredMetadataExtracter
            //  keys onto their own content model
            for(String tikaKey : metadata.names()) 
            {
+                // TODO review this change (part of MNT-15267) - should we really force string concatenation here !?
               putRawValue(tikaKey, getMetadataValue(metadata, tikaKey), rawProperties);
            }
            
--- a/source/test-java/org/alfresco/repo/action/executer/ContentMetadataExtracterTagMappingTest.java
+++ b/source/test-java/org/alfresco/repo/action/executer/ContentMetadataExtracterTagMappingTest.java
@@ -97,6 +97,7 @@ public class ContentMetadataExtracterTagMappingTest extends TestCase

   protected static final String QUICK_FILENAME = "quickIPTC.jpg"; // Keywords separated with comma (,)
   protected static final String QUICK_FILENAME2 = "quickIPTC2.jpg"; // Keywords separated with pipe (|)
+    protected static final String QUICK_FILENAME3 = "quickIPTC3.jpg"; // Keywords separated with semi-colon (;)

   protected static final String QUICK_KEYWORD = "fox";
   protected static final String TAG_1 = "tag one";
@@ -356,9 +357,12 @@ public class ContentMetadataExtracterTagMappingTest extends TestCase
    /**
     * Test execution of mapping strings to tags
     */
-    // TODO ignored until we investigate when/why this regressed - start with MNT-13655 ?
-    public void XtestTagMapping() throws Exception
+    public void testTagMapping() throws Exception
    {
+        // explicitly set the separators here (rather than rely on defaults) in case another test method set them to null
+        this.executer = (ContentMetadataExtracter) ctx.getBean("extract-metadata");
+        executer.setStringTaggingSeparators(Arrays.asList(",", ";", "\\|"));
+        
        // Create the folders and documents to be tagged
        NodeRef[] nodes = createTestFolderAndDocument(QUICK_FILENAME);
        NodeRef document = nodes[0];
@@ -375,24 +379,32 @@ public class ContentMetadataExtracterTagMappingTest extends TestCase
                executer.execute(action, document);
                
                // Test extracted properties
+
                assertEquals(ContentMetadataExtracterTest.QUICK_DESCRIPTION, 
                        nodeService.getProperty(document, ContentModel.PROP_DESCRIPTION));
+                
                assertTrue("storeRef tags should contain '" + QUICK_KEYWORD + "'", 
                        taggingService.getTags(storeRef).contains(QUICK_KEYWORD));
-                assertTrue("document's tags should contain '" + QUICK_KEYWORD + "'", 
-                        taggingService.getTags(document).contains(QUICK_KEYWORD));
+                
+                List<String> tags = taggingService.getTags(document);
+                assertTrue("doc tags '"+tags+"' should contain '" + QUICK_KEYWORD + "'", 
+                        tags.contains(QUICK_KEYWORD));
                
                // Test manually added keyword
-                assertTrue("tags should contain '" + TAG_2 + "'", 
-                        taggingService.getTags(document).contains(TAG_2));
+                assertTrue("doc tags '"+tags+"' should contain '" + TAG_2 + "'", 
+                        tags.contains(TAG_2));
+
+                // Test manually added keyword - note: lower-case tag name
+                assertTrue("doc tags '"+tags+"' should contain '" + TAG_3.toLowerCase() + "'",
+                        tags.contains(TAG_3.toLowerCase()));
                
                // Test manually added nodeRef keyword
-                assertTrue("tags should contain '" + TAG_1 + "'", 
-                        taggingService.getTags(document).contains(TAG_1));
+                assertTrue("doc tags '"+tags+"' should contain '" + TAG_1 + "'", 
+                        tags.contains(TAG_1));
                
-                // Test that there are no empty tags created by the non-existent nodeRef
-                assertEquals("tags should contain '" + TAG_1 + "'", 4,
-                        taggingService.getTags(document).size() );
+                // Test that there are no extra tags created by the non-existent nodeRef
+                assertEquals("Unexpected number of doc tags '"+tags+"'", 7,
+                        tags.size());
                
                return null;
            }
@@ -408,6 +420,9 @@ public class ContentMetadataExtracterTagMappingTest extends TestCase
     */
    public void testIgnoreInvalidTag() throws Exception
    {
+        this.executer = (ContentMetadataExtracter) ctx.getBean("extract-metadata");
+        executer.setStringTaggingSeparators(null);
+
        // Create the folders and documents to be tagged
        NodeRef[] nodes = createTestFolderAndDocument(QUICK_FILENAME2);
        NodeRef document = nodes[0];
@@ -428,4 +443,48 @@ public class ContentMetadataExtracterTagMappingTest extends TestCase

        removeTestFolderAndDocument(nodes);
    }
+
+    public void testTagMappingSeparators() throws Exception
+    {
+        // explicitly set the separators here (rather than rely on defaults) in case another test method set them to null
+        this.executer = (ContentMetadataExtracter) ctx.getBean("extract-metadata");
+        executer.setStringTaggingSeparators(Arrays.asList(",", ";", "\\|"));
+
+        // IPTC Keywords with comma
+        NodeRef[] nodes = createTestFolderAndDocument(QUICK_FILENAME);
+        extractAndCheckTags(nodes[0], Arrays.asList("fox", "dog", "lazy", "jumping"));
+        removeTestFolderAndDocument(nodes);
+
+        // IPTC Keywords with vertical bar (pipe)
+        nodes = createTestFolderAndDocument(QUICK_FILENAME2);
+        extractAndCheckTags(nodes[0], Arrays.asList("k1", "k2", "k3"));
+        removeTestFolderAndDocument(nodes);
+
+        // IPTC Keywords with semi-colon
+        nodes = createTestFolderAndDocument(QUICK_FILENAME3);
+        extractAndCheckTags(nodes[0], Arrays.asList("keyword1", "keyword2", "keyword3", "keyword4"));
+        removeTestFolderAndDocument(nodes);
+    }
+
+    private void extractAndCheckTags(NodeRef document, List<String> expectedTags)
+    {
+        this.transactionService.getRetryingTransactionHelper().doInTransaction(new RetryingTransactionCallback<Void>(){
+
+            @Override
+            public Void execute() throws Throwable
+            {
+                ActionImpl action = new ActionImpl(document, ID, ContentMetadataExtracter.EXECUTOR_NAME, null);
+                executer.execute(action, document);
+
+                List<String> tags = taggingService.getTags(document);
+
+                for (String expectedTag : expectedTags)
+                {
+                    assertTrue("Expected tag '"+expectedTag+"' not in "+tags, tags.contains(expectedTag));
+                }
+
+                return null;
+            }
+        });
+    }
 }