MNT-16709 : Metadata extraction on 200MB PDF file causes large heap utilization

- added concurrent extraction limit - added max document size limit git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/BRANCHES/DEV/5.2.N/root@131709 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
2025-10-08 14:51:49 +00:00 · 2016-10-24 12:13:50 +00:00
parent 2b4db66fbe
commit c95cbaccd9
8 changed files with 282 additions and 94 deletions
--- a/config/alfresco/content-services-context.xml
+++ b/config/alfresco/content-services-context.xml
@@ -330,6 +330,17 @@
   </bean>
   <bean id="extracter.PDFBox"        class="org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter"        parent="baseMetadataExtracter">
      <property name="documentSelector" ref="pdfBoxEmbededDocumentSelector" />
      <property name="mimetypeLimits">
         <map>
            <entry key="application/pdf">
               <bean class="org.alfresco.repo.content.metadata.MetadataExtracterLimits">
                  <property name="timeoutMs" value="${content.metadataExtracter.default.timeoutMs}"/>
                  <property name="maxDocumentSizeMB" value="${content.metadataExtracter.pdf.maxDocumentSizeMB}"/>
                  <property name="maxConcurrentExtractionsCount" value="${content.metadataExtracter.pdf.maxConcurrentExtractionsCount}"/>
               </bean>
            </entry>
         </map>
      </property>
   </bean>
   <bean id="extracter.Poi"           class="org.alfresco.repo.content.metadata.PoiMetadataExtracter"           parent="baseMetadataExtracter">
      <property name="poiFootnotesLimit" value="${content.transformer.Poi.poiFootnotesLimit}" />
--- a/config/alfresco/repository.properties
+++ b/config/alfresco/repository.properties
@@ -640,6 +640,10 @@ content.metadataExtracter.default.timeoutMs=20000
 # Indicates if the metadata extracter should parse shape objects inside open office files
 content.metadataExtracter.parseShapes=false
 #
 content.metadataExtracter.pdf.maxDocumentSizeMB=10
 content.metadataExtracter.pdf.maxConcurrentExtractionsCount=5
 # Property to enable upgrade from 2.1-A
 V2.1-A.fixes.to.schema=0
 #V2.1-A.fixes.to.schema=82
--- a/config/quick/quick-size-limit.pdf
+++ b/config/quick/quick-size-limit.pdf
--- a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java
@@ -48,6 +48,9 @@ import java.util.concurrent.Executors;
 import java.util.concurrent.FutureTask;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicInteger;
 import javax.activation.MimeType;
 import org.alfresco.api.AlfrescoPublicApi;     
 import org.alfresco.error.AlfrescoRuntimeException;
@@ -124,6 +127,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
    public static final String PROPERTY_PREFIX_METADATA = "metadata.";
    public static final String PROPERTY_COMPONENT_EXTRACT = ".extract.";
    public static final String PROPERTY_COMPONENT_EMBED = ".embed.";
    public static final int MEGABYTE_SIZE = 1048576;
    protected static Log logger = LogFactory.getLog(AbstractMappingMetadataExtracter.class);
@@ -149,6 +153,8 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
    private ExecutorService executorService;
    protected MetadataExtracterConfig metadataExtracterConfig;
    private static final AtomicInteger CONCURRENT_EXTRACTIONS_COUNT = new AtomicInteger(0);
    /**
     * Default constructor.  If this is called, then {@link #isSupported(String)} should
     * be implemented.  This is useful when the list of supported mimetypes is not known
@@ -1220,6 +1226,12 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
                            rawMetadata + "\n  Mapped and Accepted: " + changedProperties);
            }
        }
        catch (LimitExceededException e)
        {
            logger.warn("Metadata extraction rejected: \n" + 
                    "   Extracter: " + this + "\n" + 
                    "   Reason:   " + e.getMessage());
        }
        catch (Throwable e)
        {
            // Ask Tika to detect the document, and report back on if
@@ -1968,16 +1980,17 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
     * Gets the metadata extracter limits for the given mimetype.
     * <p>
     * A specific match for the given mimetype is tried first and
-     * if none is found a wildcard of "*" is tried.
+     * if none is found a wildcard of "*" is tried, if still not found 
     * defaults value will be used
     * 
     * @param mimetype String
-     * @return the found limits or null
+     * @return the found limits or default values
     */
    protected MetadataExtracterLimits getLimits(String mimetype)
    {
        if (mimetypeLimits == null)
        {
-            return null;
+            return new MetadataExtracterLimits();
        }
        MetadataExtracterLimits limits = null;
        limits = mimetypeLimits.get(mimetype);
@@ -1985,6 +1998,11 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
        {
            limits = mimetypeLimits.get("*");
        }
        if (limits == null)
        {
            limits = new MetadataExtracterLimits();
        }
        return limits;
    }
@@ -2029,6 +2047,19 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
        }
    }
    /**
     * Exception wrapper to handle exceeded limits imposed by {@link MetadataExtracterLimits}
     * {@link AbstractMappingMetadataExtracter#extractRaw(ContentReader, MetadataExtracterLimits)}
     */
    private class LimitExceededException extends Exception
    {
        private static final long serialVersionUID = 702554119174770130L;
        public LimitExceededException(String message)
        {
            super(message);
        }
    }
    /**
     * Calls the {@link AbstractMappingMetadataExtracter#extractRaw(ContentReader)} method
     * using the given limits.
@@ -2049,12 +2080,34 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
    private Map<String, Serializable> extractRaw(
            ContentReader reader, MetadataExtracterLimits limits) throws Throwable
    {
        if (limits == null || limits.getTimeoutMs() == -1)
        {
            return extractRaw(reader);
        }
        FutureTask<Map<String, Serializable>> task = null;
        StreamAwareContentReaderProxy proxiedReader = null;
        if (reader.getSize() > limits.getMaxDocumentSizeMB() * MEGABYTE_SIZE)
        {
            throw new LimitExceededException("Max doc size exceeded " + limits.getMaxDocumentSizeMB() + " MB");
        }
        synchronized (CONCURRENT_EXTRACTIONS_COUNT)
        {
            if (logger.isDebugEnabled())
            {
                logger.debug("Concurrent extractions : " + CONCURRENT_EXTRACTIONS_COUNT.get());
            }
            if (CONCURRENT_EXTRACTIONS_COUNT.get() < limits.getMaxConcurrentExtractionsCount())
            {
                int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.incrementAndGet();
                if (logger.isDebugEnabled())
                {
                    logger.debug("New extraction accepted. Concurrent extractions : " + totalDocCount);
                }
            }
            else
            {
                throw new LimitExceededException("Reached concurrent extractions limit - " + limits.getMaxConcurrentExtractionsCount());
            }
        }
        try
        {
            proxiedReader = new StreamAwareContentReaderProxy(reader);
@@ -2087,6 +2140,14 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
            }
            throw cause;
        }
        finally
        {
            int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.decrementAndGet();
            if (logger.isDebugEnabled())
            {
                logger.debug("Extraction finalized. Remaining concurrent extraction : " + totalDocCount);
            }
        }
    }
    /**
--- a/source/java/org/alfresco/repo/content/metadata/MetadataExtracterLimits.java
+++ b/source/java/org/alfresco/repo/content/metadata/MetadataExtracterLimits.java
@@ -29,15 +29,17 @@ import org.alfresco.api.AlfrescoPublicApi;
 /**
 * Represents maximum values (that result in exceptions if exceeded) or
- * limits on values (that result in EOF (End Of File) being returned
+ * limits on values (that result in EOF (End Of File) being returned early).
- * early). The only current option is for elapsed time.
+ * The current options are elapsed time, document size and concurrent extractions limit.
 * 
 * @author Ray Gauss II
 */
@AlfrescoPublicApi
 public class MetadataExtracterLimits
 {
-    private long timeoutMs = -1;
+    private long timeoutMs = Long.MAX_VALUE;
    private double maxDocumentSizeMB = Double.MAX_VALUE;
    private int maxConcurrentExtractionsCount = Integer.MAX_VALUE;
    /**
     * Gets the time in milliseconds after which the metadata extracter will be stopped.
@@ -58,5 +60,43 @@ public class MetadataExtracterLimits
    {
        this.timeoutMs = timeoutMs;
    }
    /**
     * Gets the maximum size(MB) allowed for a transformation
     * 
     * @return maximum size
     */
    public double getMaxDocumentSizeMB()
    {
        return maxDocumentSizeMB;
    }
    /**
     * Sets the maximum size(MB) allowed for a transformation
     * 
     * @param maxDocumentSizeMB
     */
    public void setMaxDocumentSizeMB(double maxDocumentSizeMB)
    {
        this.maxDocumentSizeMB = maxDocumentSizeMB;
    }
    /**
     * Sets the maximum number of allowed concurrent extractions
     * 
     * @param maxConcurrentExtractionsCount
     */
    public void setMaxConcurrentExtractionsCount(int maxConcurrentExtractionsCount)
    {
        this.maxConcurrentExtractionsCount = maxConcurrentExtractionsCount;
    }
    /**
     * Gets the maximum count of allowed concurrent extractions
     * 
     * @return maximum count
     */
    public int getMaxConcurrentExtractionsCount()
    {
        return maxConcurrentExtractionsCount;
    }
 }
--- a/source/test-java/org/alfresco/repo/content/metadata/MetadataExtracterLimitsTest.java
+++ b/source/test-java/org/alfresco/repo/content/metadata/MetadataExtracterLimitsTest.java
@@ -214,15 +214,6 @@ public class MetadataExtracterLimitsTest
    @Test
    public void testUnlimitedTimeout() throws Exception
    {
        long timeoutMs = -1;
        MetadataExtracterLimits limits = new MetadataExtracterLimits();
        limits.setTimeoutMs(timeoutMs);
        HashMap<String, MetadataExtracterLimits> mimetypeLimits =
                new HashMap<String, MetadataExtracterLimits>(1);
        mimetypeLimits.put(MimetypeMap.MIMETYPE_IMAGE_JPEG, limits);
        ((MockDelayedMetadataExtracter) getExtracter()).setMimetypeLimits(mimetypeLimits);
        File file = AbstractContentTransformerTest.loadNamedQuickTestFile("quick.txt");
        Map<QName, Serializable> properties = extractFromFile(file, MimetypeMap.MIMETYPE_TEXT_PLAIN);
--- a/source/test-java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracterTest.java
+++ b/source/test-java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracterTest.java
@@ -25,12 +25,18 @@
 */
 package org.alfresco.repo.content.metadata;
 import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.Serializable;
 import java.util.Calendar;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.TimeUnit;
 import org.alfresco.model.ContentModel;
 import org.alfresco.repo.content.MimetypeMap;
 import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
 import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
 import org.alfresco.service.namespace.QName;
 import org.apache.pdfbox.util.DateConverter;
@@ -44,12 +50,23 @@ public class PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest
 {
    private PdfBoxMetadataExtracter extracter;
    private static final int MAX_CONCURENT_EXTRACTIONS = 5;
 	private static final double MAX_DOC_SIZE_MB = 0.03;
    @Override
    public void setUp() throws Exception
    {
        super.setUp();
        extracter = new PdfBoxMetadataExtracter();
        extracter.setDictionaryService(dictionaryService);
        MetadataExtracterLimits pdfLimit = new MetadataExtracterLimits();
        pdfLimit.setMaxConcurrentExtractionsCount(MAX_CONCURENT_EXTRACTIONS);
        pdfLimit.setMaxDocumentSizeMB(MAX_DOC_SIZE_MB);
        Map<String,MetadataExtracterLimits> limits = new HashMap<>();
        limits.put(MimetypeMap.MIMETYPE_PDF,pdfLimit);
        extracter.setMimetypeLimits(limits);
        extracter.register();
    }
@@ -108,4 +125,48 @@ public class PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest
       assertEquals(58, c.get(Calendar.SECOND));
       //assertEquals(0, c.get(Calendar.MILLISECOND));
    }
    public void testConcurrentExtractions() throws InterruptedException
    {
        int threadNum = 10;
        final CountDownLatch extractionsCountDown = new CountDownLatch(threadNum);
        for (int i = 0; i < threadNum; i++)
        {
            Thread t = new Thread(new Runnable()
            {
                @Override
                public void run()
                {
                    try
                    {
                        Map<QName, Serializable> properties = extractFromMimetype(MimetypeMap.MIMETYPE_PDF);
                        if (!properties.isEmpty())
                        {
                            extractionsCountDown.countDown();
                        }
                    }
                    catch (Exception e)
                    {
                        e.printStackTrace();
                    }
                }
            });
            t.start();
        }
        extractionsCountDown.await(1000, TimeUnit.MILLISECONDS);
        long rejectedExtractions = extractionsCountDown.getCount();
        assertTrue("Wrong number of rejected extractions", rejectedExtractions == (threadNum - MAX_CONCURENT_EXTRACTIONS));
    }
    public void testMaxDocumentSizeLimit() throws Exception
    {
        File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile("quick-size-limit.pdf");
        if (sourceFile == null)
        {
            throw new FileNotFoundException("No quick-size-limit.pdf file found for test");
        }
        Map<QName, Serializable> properties = extractFromFile(sourceFile, MimetypeMap.MIMETYPE_PDF);
        assertTrue(properties.isEmpty());
    }
 }
--- a/source/test-java/org/alfresco/repo/content/metadata/PoiMetadataExtracterTest.java
+++ b/source/test-java/org/alfresco/repo/content/metadata/PoiMetadataExtracterTest.java
@@ -48,8 +48,6 @@ public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
 {
    private static final int MINIMAL_EXPECTED_PROPERTIES_AMOUNT = 3;
    private static final int IGNORABLE_TIMEOUT = -1;
    // private static final int TIMEOUT_FOR_QUICK_EXTRACTION = 2000;
    private static final int DEFAULT_FOOTNOTES_LIMIT = 50;
@@ -68,6 +66,9 @@ public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
    private PoiMetadataExtracter extracter;
    private Long extractionTimeWithDefaultFootnotesLimit;
    private Long extractionTimeWithLargeFootnotesLimit;
    @Override
    public void setUp() throws Exception
    {
@@ -245,7 +246,7 @@ public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
     * 
     * @throws Exception
     */
-    public void testFootnotesLimitParameterUsing() throws Exception
+    public void testFootnotesLimitParameterUsingDefault() throws Exception
    {
        PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter();
@@ -256,23 +257,42 @@ public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
        Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
        long startTime = System.currentTimeMillis();
        extractor.extract(sourceReader, properties);
-        long extractionTimeWithDefaultFootnotesLimit = System.currentTimeMillis() - startTime;
+        extractionTimeWithDefaultFootnotesLimit = System.currentTimeMillis() - startTime;
        assertExtractedProperties(properties);
-        assertFalse("Reader was not closed", sourceReader.isChannelOpen());
+        if (extractionTimeWithLargeFootnotesLimit != null)
        {
            assertTrue("The second metadata extraction operation must be longer!", extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit);
        }
    }
    /**
     * Test for MNT-577: Alfresco is running 100% CPU for over 10 minutes while extracting metadata for Word office document
     * 
     * @throws Exception
     */
    public void testFootnotesLimitParameterUsingLarge() throws Exception
    {
        PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter();
        File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile(PROBLEM_FOOTNOTES_DOCUMENT_NAME);
        ContentReader sourceReader = new FileContentReader(sourceFile);
        sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING);
        // Just let the extractor do the job...
        configureExtractorLimits(extractor, ALL_MIMETYPES_FILTER, IGNORABLE_TIMEOUT);
        extractor.setPoiFootnotesLimit(LARGE_FOOTNOTES_LIMIT);
        extractor.afterPropertiesSet();
-        properties = new HashMap<QName, Serializable>();
+        Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
-        startTime = System.currentTimeMillis();
+        long startTime = System.currentTimeMillis();
        extractor.extract(sourceReader, properties);
-        long extractionTimeWithLargeFootnotesLimit = System.currentTimeMillis() - startTime;
+        extractionTimeWithLargeFootnotesLimit = System.currentTimeMillis() - startTime;
        assertExtractedProperties(properties);
-        assertTrue("The second metadata extraction operation must be longer!", extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit);
+        if (extractionTimeWithDefaultFootnotesLimit != null)
-        assertFalse("Reader was not closed", sourceReader.isChannelOpen());
+        {
            assertTrue("The second metadata extraction operation must be longer!", extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit);
        }
    }
    /**