MNT-16709 : Metadata extraction on 200MB PDF file causes large heap utilization

- added concurrent extraction limit
- added max document size limit

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/BRANCHES/DEV/5.2.N/root@131709 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Andreea Dragoi
2016-10-24 12:13:50 +00:00
parent 2b4db66fbe
commit c95cbaccd9
8 changed files with 282 additions and 94 deletions

View File

@@ -45,13 +45,16 @@ import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
import org.alfresco.api.AlfrescoPublicApi;
import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.model.ContentModel;
import javax.activation.MimeType;
import org.alfresco.api.AlfrescoPublicApi;
import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.StreamAwareContentReaderProxy;
import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
import org.alfresco.service.cmr.dictionary.DictionaryService;
@@ -123,7 +126,8 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
private static final String PROP_DEFAULT_TIMEOUT = "content.metadataExtracter.default.timeoutMs";
public static final String PROPERTY_PREFIX_METADATA = "metadata.";
public static final String PROPERTY_COMPONENT_EXTRACT = ".extract.";
public static final String PROPERTY_COMPONENT_EMBED = ".embed.";
public static final String PROPERTY_COMPONENT_EMBED = ".embed.";
public static final int MEGABYTE_SIZE = 1048576;
protected static Log logger = LogFactory.getLog(AbstractMappingMetadataExtracter.class);
@@ -147,7 +151,9 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
private Properties properties;
private Map<String, MetadataExtracterLimits> mimetypeLimits;
private ExecutorService executorService;
protected MetadataExtracterConfig metadataExtracterConfig;
protected MetadataExtracterConfig metadataExtracterConfig;
private static final AtomicInteger CONCURRENT_EXTRACTIONS_COUNT = new AtomicInteger(0);
/**
* Default constructor. If this is called, then {@link #isSupported(String)} should
@@ -1219,6 +1225,12 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
logger.debug("Extracted Metadata from " + reader + "\n Found: " +
rawMetadata + "\n Mapped and Accepted: " + changedProperties);
}
}
catch (LimitExceededException e)
{
logger.warn("Metadata extraction rejected: \n" +
" Extracter: " + this + "\n" +
" Reason: " + e.getMessage());
}
catch (Throwable e)
{
@@ -1968,23 +1980,29 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
* Gets the metadata extracter limits for the given mimetype.
* <p>
* A specific match for the given mimetype is tried first and
* if none is found a wildcard of "*" is tried; if still not found,
* default values will be used
*
* @param mimetype String
* @return the found limits or default values
*/
protected MetadataExtracterLimits getLimits(String mimetype)
{
if (mimetypeLimits == null)
{
return null;
return new MetadataExtracterLimits();
}
MetadataExtracterLimits limits = null;
limits = mimetypeLimits.get(mimetype);
if (limits == null)
{
limits = mimetypeLimits.get("*");
}
}
if (limits == null)
{
limits = new MetadataExtracterLimits();
}
return limits;
}
@@ -2027,6 +2045,19 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
{
super(cause);
}
}
/**
* Exception wrapper to handle exceeded limits imposed by {@link MetadataExtracterLimits}
* {@link AbstractMappingMetadataExtracter#extractRaw(ContentReader, MetadataExtracterLimits)}
*/
private class LimitExceededException extends Exception
{
private static final long serialVersionUID = 702554119174770130L;
public LimitExceededException(String message)
{
super(message);
}
}
/**
@@ -2049,12 +2080,34 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
private Map<String, Serializable> extractRaw(
ContentReader reader, MetadataExtracterLimits limits) throws Throwable
{
if (limits == null || limits.getTimeoutMs() == -1)
{
return extractRaw(reader);
}
FutureTask<Map<String, Serializable>> task = null;
StreamAwareContentReaderProxy proxiedReader = null;
StreamAwareContentReaderProxy proxiedReader = null;
if (reader.getSize() > limits.getMaxDocumentSizeMB() * MEGABYTE_SIZE)
{
throw new LimitExceededException("Max doc size exceeded " + limits.getMaxDocumentSizeMB() + " MB");
}
synchronized (CONCURRENT_EXTRACTIONS_COUNT)
{
if (logger.isDebugEnabled())
{
logger.debug("Concurrent extractions : " + CONCURRENT_EXTRACTIONS_COUNT.get());
}
if (CONCURRENT_EXTRACTIONS_COUNT.get() < limits.getMaxConcurrentExtractionsCount())
{
int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.incrementAndGet();
if (logger.isDebugEnabled())
{
logger.debug("New extraction accepted. Concurrent extractions : " + totalDocCount);
}
}
else
{
throw new LimitExceededException("Reached concurrent extractions limit - " + limits.getMaxConcurrentExtractionsCount());
}
}
try
{
proxiedReader = new StreamAwareContentReaderProxy(reader);
@@ -2087,6 +2140,14 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
}
throw cause;
}
finally
{
int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.decrementAndGet();
if (logger.isDebugEnabled())
{
logger.debug("Extraction finalized. Remaining concurrent extraction : " + totalDocCount);
}
}
}
/**

View File

@@ -29,15 +29,17 @@ import org.alfresco.api.AlfrescoPublicApi;
/**
* Represents maximum values (that result in exceptions if exceeded) or
* limits on values (that result in EOF (End Of File) being returned early).
* The current options are elapsed time, document size and concurrent extractions limit.
*
* @author Ray Gauss II
*/
@AlfrescoPublicApi
public class MetadataExtracterLimits
{
private long timeoutMs = -1;
private long timeoutMs = Long.MAX_VALUE;
private double maxDocumentSizeMB = Double.MAX_VALUE;
private int maxConcurrentExtractionsCount = Integer.MAX_VALUE;
/**
* Gets the time in milliseconds after which the metadata extracter will be stopped.
@@ -57,6 +59,44 @@ public class MetadataExtracterLimits
/**
 * Sets the time in milliseconds after which the metadata extracter will be stopped.
 * Defaults to {@link Long#MAX_VALUE}, i.e. effectively no timeout.
 *
 * @param timeoutMs the timeout in milliseconds
 */
public void setTimeoutMs(long timeoutMs)
{
this.timeoutMs = timeoutMs;
}
/**
 * Gets the maximum document size (in MB) allowed for a metadata extraction.
 * Defaults to {@link Double#MAX_VALUE}, i.e. effectively unlimited.
 *
 * @return maximum document size in megabytes
 */
public double getMaxDocumentSizeMB()
{
return maxDocumentSizeMB;
}
/**
 * Sets the maximum document size (in MB) allowed for a metadata extraction;
 * larger documents are rejected rather than extracted.
 *
 * @param maxDocumentSizeMB maximum document size in megabytes
 */
public void setMaxDocumentSizeMB(double maxDocumentSizeMB)
{
this.maxDocumentSizeMB = maxDocumentSizeMB;
}
/**
 * Sets the maximum number of metadata extractions allowed to run concurrently;
 * extractions requested beyond this count are rejected.
 *
 * @param maxConcurrentExtractionsCount maximum number of concurrent extractions
 */
public void setMaxConcurrentExtractionsCount(int maxConcurrentExtractionsCount)
{
this.maxConcurrentExtractionsCount = maxConcurrentExtractionsCount;
}
/**
 * Gets the maximum number of metadata extractions allowed to run concurrently.
 * Defaults to {@link Integer#MAX_VALUE}, i.e. effectively unlimited.
 *
 * @return maximum number of concurrent extractions
 */
public int getMaxConcurrentExtractionsCount()
{
return maxConcurrentExtractionsCount;
}
}