MNT-16709 : Metadata extraction on 200MB PDF file causes large heap utilization

- added concurrent extraction limit
- added max document size limit

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/BRANCHES/DEV/5.2.N/root@131709 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
Andreea Dragoi
2016-10-24 12:13:50 +00:00
parent 2b4db66fbe
commit c95cbaccd9
8 changed files with 282 additions and 94 deletions

View File

@@ -330,6 +330,17 @@
</bean>
<bean id="extracter.PDFBox" class="org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter" parent="baseMetadataExtracter">
<property name="documentSelector" ref="pdfBoxEmbededDocumentSelector" />
<property name="mimetypeLimits">
<map>
<entry key="application/pdf">
<bean class="org.alfresco.repo.content.metadata.MetadataExtracterLimits">
<property name="timeoutMs" value="${content.metadataExtracter.default.timeoutMs}"/>
<property name="maxDocumentSizeMB" value="${content.metadataExtracter.pdf.maxDocumentSizeMB}"/>
<property name="maxConcurrentExtractionsCount" value="${content.metadataExtracter.pdf.maxConcurrentExtractionsCount}"/>
</bean>
</entry>
</map>
</property>
</bean>
<bean id="extracter.Poi" class="org.alfresco.repo.content.metadata.PoiMetadataExtracter" parent="baseMetadataExtracter">
<property name="poiFootnotesLimit" value="${content.transformer.Poi.poiFootnotesLimit}" />

View File

@@ -640,6 +640,10 @@ content.metadataExtracter.default.timeoutMs=20000
# Indicates if the metadata extracter should parse shape objects inside open office files
content.metadataExtracter.parseShapes=false
#
content.metadataExtracter.pdf.maxDocumentSizeMB=10
content.metadataExtracter.pdf.maxConcurrentExtractionsCount=5
# Property to enable upgrade from 2.1-A
V2.1-A.fixes.to.schema=0
#V2.1-A.fixes.to.schema=82
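With these defaults, PDF metadata extraction is skipped for documents larger than 10 MB and at most 5 PDF extractions run concurrently; both values can be overridden per installation (in Alfresco, typically via alfresco-global.properties).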

Binary file not shown.

View File

@@ -48,6 +48,9 @@ import java.util.concurrent.Executors;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
import javax.activation.MimeType;
import org.alfresco.api.AlfrescoPublicApi;
import org.alfresco.error.AlfrescoRuntimeException;
@@ -124,6 +127,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
public static final String PROPERTY_PREFIX_METADATA = "metadata.";
public static final String PROPERTY_COMPONENT_EXTRACT = ".extract.";
public static final String PROPERTY_COMPONENT_EMBED = ".embed.";
public static final int MEGABYTE_SIZE = 1048576;
protected static Log logger = LogFactory.getLog(AbstractMappingMetadataExtracter.class);
@@ -149,6 +153,8 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
private ExecutorService executorService;
protected MetadataExtracterConfig metadataExtracterConfig;
private static final AtomicInteger CONCURRENT_EXTRACTIONS_COUNT = new AtomicInteger(0);
/**
* Default constructor. If this is called, then {@link #isSupported(String)} should
* be implemented. This is useful when the list of supported mimetypes is not known
@@ -1220,6 +1226,12 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
rawMetadata + "\n Mapped and Accepted: " + changedProperties);
}
}
catch (LimitExceededException e)
{
logger.warn("Metadata extraction rejected: \n" +
" Extracter: " + this + "\n" +
" Reason: " + e.getMessage());
}
catch (Throwable e)
{
// Ask Tika to detect the document, and report back on if
@@ -1968,16 +1980,17 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
* Gets the metadata extracter limits for the given mimetype.
* <p>
* A specific match for the given mimetype is tried first and
-     * if none is found a wildcard of "*" is tried.
+     * if none is found, a wildcard of "*" is tried; if still not found,
+     * default values will be used.
*
* @param mimetype String
-     * @return the found limits or null
+     * @return the found limits or default values
*/
protected MetadataExtracterLimits getLimits(String mimetype)
{
if (mimetypeLimits == null)
{
-            return null;
+            return new MetadataExtracterLimits();
}
MetadataExtracterLimits limits = null;
limits = mimetypeLimits.get(mimetype);
@@ -1985,6 +1998,11 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
{
limits = mimetypeLimits.get("*");
}
if (limits == null)
{
limits = new MetadataExtracterLimits();
}
return limits;
}
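Lookup order is thus: exact mimetype match, then the "*" wildcard entry, then a fresh MetadataExtracterLimits whose defaults (see the class further down) are effectively unlimited, so callers no longer need to handle null.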
@@ -2029,6 +2047,19 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
}
}
/**
* Exception thrown when a limit imposed by {@link MetadataExtracterLimits} is exceeded
* in {@link AbstractMappingMetadataExtracter#extractRaw(ContentReader, MetadataExtracterLimits)}
*/
private class LimitExceededException extends Exception
{
private static final long serialVersionUID = 702554119174770130L;
public LimitExceededException(String message)
{
super(message);
}
}
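The exception is internal and checked, and is caught separately from generic extraction failures (see the catch block added earlier in this diff): a rejected document is logged as a warning and simply produces no metadata instead of propagating an error.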
/**
* Calls the {@link AbstractMappingMetadataExtracter#extractRaw(ContentReader)} method
* using the given limits.
@@ -2049,12 +2080,34 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
private Map<String, Serializable> extractRaw(
ContentReader reader, MetadataExtracterLimits limits) throws Throwable
{
if (limits == null || limits.getTimeoutMs() == -1)
{
return extractRaw(reader);
}
FutureTask<Map<String, Serializable>> task = null;
StreamAwareContentReaderProxy proxiedReader = null;
if (reader.getSize() > limits.getMaxDocumentSizeMB() * MEGABYTE_SIZE)
{
throw new LimitExceededException("Max doc size exceeded " + limits.getMaxDocumentSizeMB() + " MB");
}
synchronized (CONCURRENT_EXTRACTIONS_COUNT)
{
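// Check-and-increment must happen under the lock so concurrent callers cannot
// overshoot the limit; the decrement in the finally block below is atomic on its own.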
if (logger.isDebugEnabled())
{
logger.debug("Concurrent extractions : " + CONCURRENT_EXTRACTIONS_COUNT.get());
}
if (CONCURRENT_EXTRACTIONS_COUNT.get() < limits.getMaxConcurrentExtractionsCount())
{
int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.incrementAndGet();
if (logger.isDebugEnabled())
{
logger.debug("New extraction accepted. Concurrent extractions : " + totalDocCount);
}
}
else
{
throw new LimitExceededException("Reached concurrent extractions limit - " + limits.getMaxConcurrentExtractionsCount());
}
}
try
{
proxiedReader = new StreamAwareContentReaderProxy(reader);
@@ -2087,6 +2140,14 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
}
throw cause;
}
finally
{
int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.decrementAndGet();
if (logger.isDebugEnabled())
{
logger.debug("Extraction finalized. Remaining concurrent extraction : " + totalDocCount);
}
}
}
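Note that the gate rejects rather than queues: a caller arriving over the concurrency limit fails immediately with LimitExceededException instead of waiting for a slot, and only callers that actually took a slot reach the finally block that releases it. The concurrency test below relies on this when it expects threadNum - limit rejections.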
/**

View File

@@ -29,15 +29,17 @@ import org.alfresco.api.AlfrescoPublicApi;
/**
* Represents maximum values (that result in exceptions if exceeded) or
- * limits on values (that result in EOF (End Of File) being returned
- * early). The only current option is for elapsed time.
+ * limits on values (that result in EOF (End Of File) being returned early).
+ * The current options are elapsed time, document size, and concurrent extraction count.
*
* @author Ray Gauss II
*/
@AlfrescoPublicApi
public class MetadataExtracterLimits
{
-    private long timeoutMs = -1;
+    private long timeoutMs = Long.MAX_VALUE;
private double maxDocumentSizeMB = Double.MAX_VALUE;
private int maxConcurrentExtractionsCount = Integer.MAX_VALUE;
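The sentinel semantics change here: timeoutMs previously defaulted to -1 ("no timeout") and now defaults to Long.MAX_VALUE, while the two new limits default to their type maxima. An unconfigured extracter therefore behaves as before, getLimits(String) can hand out a usable instance instead of null, and the remaining -1 timeout check in extractRaw only matters for explicitly configured legacy values.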
/**
* Gets the time in milliseconds after which the metadata extracter will be stopped.
@@ -58,5 +60,43 @@ public class MetadataExtracterLimits
{
this.timeoutMs = timeoutMs;
}
/**
* Gets the maximum document size (MB) allowed for an extraction
*
* @return maximum size in megabytes
*/
public double getMaxDocumentSizeMB()
{
return maxDocumentSizeMB;
}
/**
* Sets the maximum document size (MB) allowed for an extraction
*
* @param maxDocumentSizeMB maximum size in megabytes
*/
public void setMaxDocumentSizeMB(double maxDocumentSizeMB)
{
this.maxDocumentSizeMB = maxDocumentSizeMB;
}
/**
* Sets the maximum number of allowed concurrent extractions
*
* @param maxConcurrentExtractionsCount maximum number of concurrent extractions
*/
public void setMaxConcurrentExtractionsCount(int maxConcurrentExtractionsCount)
{
this.maxConcurrentExtractionsCount = maxConcurrentExtractionsCount;
}
/**
* Gets the maximum count of allowed concurrent extractions
*
* @return maximum count
*/
public int getMaxConcurrentExtractionsCount()
{
return maxConcurrentExtractionsCount;
}
}

View File

@@ -214,15 +214,6 @@ public class MetadataExtracterLimitsTest
@Test
public void testUnlimitedTimeout() throws Exception
{
long timeoutMs = -1;
MetadataExtracterLimits limits = new MetadataExtracterLimits();
limits.setTimeoutMs(timeoutMs);
HashMap<String, MetadataExtracterLimits> mimetypeLimits =
new HashMap<String, MetadataExtracterLimits>(1);
mimetypeLimits.put(MimetypeMap.MIMETYPE_IMAGE_JPEG, limits);
((MockDelayedMetadataExtracter) getExtracter()).setMimetypeLimits(mimetypeLimits);
File file = AbstractContentTransformerTest.loadNamedQuickTestFile("quick.txt");
Map<QName, Serializable> properties = extractFromFile(file, MimetypeMap.MIMETYPE_TEXT_PLAIN);

View File

@@ -25,12 +25,18 @@
*/
package org.alfresco.repo.content.metadata;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.Serializable;
import java.util.Calendar;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.namespace.QName;
import org.apache.pdfbox.util.DateConverter;
@@ -44,12 +50,23 @@ public class PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest
{
private PdfBoxMetadataExtracter extracter;
private static final int MAX_CONCURRENT_EXTRACTIONS = 5;
private static final double MAX_DOC_SIZE_MB = 0.03;
@Override
public void setUp() throws Exception
{
super.setUp();
extracter = new PdfBoxMetadataExtracter();
extracter.setDictionaryService(dictionaryService);
MetadataExtracterLimits pdfLimit = new MetadataExtracterLimits();
pdfLimit.setMaxConcurrentExtractionsCount(MAX_CONCURRENT_EXTRACTIONS);
pdfLimit.setMaxDocumentSizeMB(MAX_DOC_SIZE_MB);
Map<String, MetadataExtracterLimits> limits = new HashMap<>();
limits.put(MimetypeMap.MIMETYPE_PDF, pdfLimit);
extracter.setMimetypeLimits(limits);
extracter.register();
}
@@ -108,4 +125,48 @@ public class PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest
assertEquals(58, c.get(Calendar.SECOND));
//assertEquals(0, c.get(Calendar.MILLISECOND));
}
public void testConcurrentExtractions() throws InterruptedException
{
int threadNum = 10;
final CountDownLatch extractionsCountDown = new CountDownLatch(threadNum);
for (int i = 0; i < threadNum; i++)
{
Thread t = new Thread(new Runnable()
{
@Override
public void run()
{
try
{
Map<QName, Serializable> properties = extractFromMimetype(MimetypeMap.MIMETYPE_PDF);
if (!properties.isEmpty())
{
extractionsCountDown.countDown();
}
}
catch (Exception e)
{
e.printStackTrace();
}
}
});
t.start();
}
extractionsCountDown.await(1000, TimeUnit.MILLISECONDS);
long rejectedExtractions = extractionsCountDown.getCount();
assertTrue("Wrong number of rejected extractions", rejectedExtractions == (threadNum - MAX_CONCURENT_EXTRACTIONS));
}
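This assertion assumes all ten threads are inside extractRaw before any of the first five finish; on a fast machine an extraction can complete and release its slot early, letting more than MAX_CONCURRENT_EXTRACTIONS succeed, so the expected rejection count is timing-sensitive.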
public void testMaxDocumentSizeLimit() throws Exception
{
File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile("quick-size-limit.pdf");
if (sourceFile == null)
{
throw new FileNotFoundException("No quick-size-limit.pdf file found for test");
}
Map<QName, Serializable> properties = extractFromFile(sourceFile, MimetypeMap.MIMETYPE_PDF);
assertTrue(properties.isEmpty());
}
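The quick-size-limit.pdf fixture (presumably the binary file shown as changed earlier in this commit) is larger than the 0.03 MB limit configured in setUp, so the extraction is rejected and the property map stays empty.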
}

View File

@@ -48,8 +48,6 @@ public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
{
private static final int MINIMAL_EXPECTED_PROPERTIES_AMOUNT = 3;
private static final int IGNORABLE_TIMEOUT = -1;
// private static final int TIMEOUT_FOR_QUICK_EXTRACTION = 2000;
private static final int DEFAULT_FOOTNOTES_LIMIT = 50;
@@ -68,6 +66,9 @@ public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
private PoiMetadataExtracter extracter;
private Long extractionTimeWithDefaultFootnotesLimit;
private Long extractionTimeWithLargeFootnotesLimit;
@Override
public void setUp() throws Exception
{
@@ -245,7 +246,7 @@ public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
*
* @throws Exception
*/
-    public void testFootnotesLimitParameterUsing() throws Exception
+    public void testFootnotesLimitParameterUsingDefault() throws Exception
{
PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter();
@@ -256,23 +257,42 @@ public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
long startTime = System.currentTimeMillis();
extractor.extract(sourceReader, properties);
-        long extractionTimeWithDefaultFootnotesLimit = System.currentTimeMillis() - startTime;
+        extractionTimeWithDefaultFootnotesLimit = System.currentTimeMillis() - startTime;
assertExtractedProperties(properties);
assertFalse("Reader was not closed", sourceReader.isChannelOpen());
if (extractionTimeWithLargeFootnotesLimit != null)
{
assertTrue("The second metadata extraction operation must be longer!", extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit);
}
}
/**
* Test for MNT-577: Alfresco is running 100% CPU for over 10 minutes while extracting metadata for Word office document
*
* @throws Exception
*/
public void testFootnotesLimitParameterUsingLarge() throws Exception
{
PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter();
File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile(PROBLEM_FOOTNOTES_DOCUMENT_NAME);
ContentReader sourceReader = new FileContentReader(sourceFile);
sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING);
// Just let the extractor do the job...
configureExtractorLimits(extractor, ALL_MIMETYPES_FILTER, IGNORABLE_TIMEOUT);
extractor.setPoiFootnotesLimit(LARGE_FOOTNOTES_LIMIT);
extractor.afterPropertiesSet();
-        properties = new HashMap<QName, Serializable>();
-        startTime = System.currentTimeMillis();
+        Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
+        long startTime = System.currentTimeMillis();
extractor.extract(sourceReader, properties);
-        long extractionTimeWithLargeFootnotesLimit = System.currentTimeMillis() - startTime;
+        extractionTimeWithLargeFootnotesLimit = System.currentTimeMillis() - startTime;
assertExtractedProperties(properties);
assertTrue("The second metadata extraction operation must be longer!", extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit);
assertFalse("Reader was not closed", sourceReader.isChannelOpen());
+        if (extractionTimeWithDefaultFootnotesLimit != null)
+        {
+            assertTrue("The second metadata extraction operation must be longer!", extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit);
+        }
}
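A caveat on the shared timing fields: JUnit creates a fresh test-class instance per test method, so the instance fields extractionTimeWithDefaultFootnotesLimit and extractionTimeWithLargeFootnotesLimit will still be null when the other test runs, and the guarded cross-assertions never actually fire; the fields would need to be static for the comparison to take effect across the two tests.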
/**