MNT-16709 : Metadata extraction on 200MB PDF file causes large heap utilization

- added concurrent extraction limit - added max document size limit git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/BRANCHES/DEV/5.2.N/root@131709 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
2025-10-15 15:02:20 +00:00 · 2016-10-24 12:13:50 +00:00
parent 2b4db66fbe
commit c95cbaccd9
8 changed files with 282 additions and 94 deletions
--- a/source/test-java/org/alfresco/repo/content/metadata/MetadataExtracterLimitsTest.java
+++ b/source/test-java/org/alfresco/repo/content/metadata/MetadataExtracterLimitsTest.java
@@ -1,28 +1,28 @@
-/*
- * #%L
- * Alfresco Repository
- * %%
- * Copyright (C) 2005 - 2016 Alfresco Software Limited
- * %%
- * This file is part of the Alfresco software. 
- * If the software was purchased under a paid Alfresco license, the terms of 
- * the paid license agreement will prevail.  Otherwise, the software is 
- * provided under the following open source license terms:
- * 
- * Alfresco is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- * 
- * Alfresco is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- * 
- * You should have received a copy of the GNU Lesser General Public License
- * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
- * #L%
- */
+/*
+ * #%L
+ * Alfresco Repository
+ * %%
+ * Copyright (C) 2005 - 2016 Alfresco Software Limited
+ * %%
+ * This file is part of the Alfresco software. 
+ * If the software was purchased under a paid Alfresco license, the terms of 
+ * the paid license agreement will prevail.  Otherwise, the software is 
+ * provided under the following open source license terms:
+ * 
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ * #L%
+ */
 package org.alfresco.repo.content.metadata;

 import java.io.File;
@@ -214,15 +214,6 @@ public class MetadataExtracterLimitsTest
    @Test
    public void testUnlimitedTimeout() throws Exception
    {
-        long timeoutMs = -1;
-
-        MetadataExtracterLimits limits = new MetadataExtracterLimits();
-        limits.setTimeoutMs(timeoutMs);
-        HashMap<String, MetadataExtracterLimits> mimetypeLimits =
-                new HashMap<String, MetadataExtracterLimits>(1);
-        mimetypeLimits.put(MimetypeMap.MIMETYPE_IMAGE_JPEG, limits);
-        ((MockDelayedMetadataExtracter) getExtracter()).setMimetypeLimits(mimetypeLimits);
-
        File file = AbstractContentTransformerTest.loadNamedQuickTestFile("quick.txt");

        Map<QName, Serializable> properties = extractFromFile(file, MimetypeMap.MIMETYPE_TEXT_PLAIN);
--- a/source/test-java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracterTest.java
+++ b/source/test-java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracterTest.java
@@ -25,12 +25,18 @@
 */
 package org.alfresco.repo.content.metadata;

+import java.io.File;
+import java.io.FileNotFoundException;
 import java.io.Serializable;
 import java.util.Calendar;
+import java.util.HashMap;
 import java.util.Map;
-
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+
 import org.alfresco.model.ContentModel;
 import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
 import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
 import org.alfresco.service.namespace.QName;
 import org.apache.pdfbox.util.DateConverter;
@@ -42,14 +48,25 @@ import org.apache.pdfbox.util.DateConverter;
 */
 public class PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest
 {
-    private PdfBoxMetadataExtracter extracter;
+    private PdfBoxMetadataExtracter extracter;
+    
+    /** Concurrent-extraction cap that {@code setUp()} applies to the PDF mimetype limits. */
+    private static final int MAX_CONCURENT_EXTRACTIONS = 5;
+    /** Maximum document size, in MB, that {@code setUp()} applies to the PDF mimetype limits. */
+    private static final double MAX_DOC_SIZE_MB = 0.03;

    @Override
    public void setUp() throws Exception
    {
        super.setUp();
        extracter = new PdfBoxMetadataExtracter();
-        extracter.setDictionaryService(dictionaryService);
+        extracter.setDictionaryService(dictionaryService);
+        // Give the extracter PDF-specific limits (concurrent extractions and document size) before it is registered
+        MetadataExtracterLimits pdfLimit = new MetadataExtracterLimits();
+        pdfLimit.setMaxConcurrentExtractionsCount(MAX_CONCURENT_EXTRACTIONS);
+        pdfLimit.setMaxDocumentSizeMB(MAX_DOC_SIZE_MB);
+        Map<String,MetadataExtracterLimits> limits = new HashMap<>();
+        limits.put(MimetypeMap.MIMETYPE_PDF,pdfLimit);
+        
+        extracter.setMimetypeLimits(limits);
        extracter.register();
    }

@@ -107,5 +124,49 @@ public class PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest
       assertEquals(52, c.get(Calendar.MINUTE));
       assertEquals(58, c.get(Calendar.SECOND));
       //assertEquals(0, c.get(Calendar.MILLISECOND));
+    }
+    
+    /**
+     * Starts more simultaneous PDF extractions than the concurrent-extraction limit
+     * configured in {@code setUp()} and checks how many of them produced no properties.
+     * <p>
+     * An extraction counts as successful when it returns a non-empty property map; each
+     * success counts the latch down once, so the remaining latch count is the number of
+     * extractions that were rejected (or failed with an exception).
+     *
+     * @throws InterruptedException if the test thread is interrupted while waiting for the workers
+     */
+    public void testConcurrentExtractions() throws InterruptedException
+    {
+        final int threadNum = 10;
+        // Upper bound for a single worker; only reached if an extraction hangs
+        final long workerJoinTimeoutMs = 10000;
+        final CountDownLatch extractionsCountDown = new CountDownLatch(threadNum);
+        Thread[] workers = new Thread[threadNum];
+        for (int i = 0; i < threadNum; i++)
+        {
+            workers[i] = new Thread(new Runnable()
+            {
+                @Override
+                public void run()
+                {
+                    try
+                    {
+                        Map<QName, Serializable> properties = extractFromMimetype(MimetypeMap.MIMETYPE_PDF);
+                        if (!properties.isEmpty())
+                        {
+                            extractionsCountDown.countDown();
+                        }
+                    }
+                    catch (Exception e)
+                    {
+                        // A failed extraction does not count the latch down, so it is reported as rejected
+                        e.printStackTrace();
+                    }
+                }
+            });
+            workers[i].start();
+        }
+        // Wait for the workers themselves instead of sleeping on a latch that is expected
+        // never to reach zero: the count is then read only after every extraction has ended.
+        for (Thread worker : workers)
+        {
+            TimeUnit.MILLISECONDS.timedJoin(worker, workerJoinTimeoutMs);
+            assertFalse("Extraction thread did not finish in time", worker.isAlive());
+        }
+        long rejectedExtractions = extractionsCountDown.getCount();
+        // assertEquals reports both values on failure, unlike assertTrue(a == b)
+        assertEquals("Wrong number of rejected extractions", threadNum - MAX_CONCURENT_EXTRACTIONS, rejectedExtractions);
+    }
+
+    /**
+     * Runs a PDF extraction on <tt>quick-size-limit.pdf</tt> and expects no properties back.
+     * NOTE(review): the file is presumably larger than MAX_DOC_SIZE_MB - confirm against the test resources.
+     *
+     * @throws Exception if the test file is missing or the extraction fails
+     */
+    public void testMaxDocumentSizeLimit() throws Exception
+    {
+        File sizeLimitPdf = AbstractContentTransformerTest.loadNamedQuickTestFile("quick-size-limit.pdf");
+        if (null == sizeLimitPdf)
+        {
+            throw new FileNotFoundException("No quick-size-limit.pdf file found for test");
+        }
+
+        Map<QName, Serializable> extracted = extractFromFile(sizeLimitPdf, MimetypeMap.MIMETYPE_PDF);
+        assertTrue(extracted.isEmpty());
    }
 }
--- a/source/test-java/org/alfresco/repo/content/metadata/PoiMetadataExtracterTest.java
+++ b/source/test-java/org/alfresco/repo/content/metadata/PoiMetadataExtracterTest.java
@@ -1,28 +1,28 @@
-/*
- * #%L
- * Alfresco Repository
- * %%
- * Copyright (C) 2005 - 2016 Alfresco Software Limited
- * %%
- * This file is part of the Alfresco software. 
- * If the software was purchased under a paid Alfresco license, the terms of 
- * the paid license agreement will prevail.  Otherwise, the software is 
- * provided under the following open source license terms:
- * 
- * Alfresco is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- * 
- * Alfresco is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- * 
- * You should have received a copy of the GNU Lesser General Public License
- * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
- * #L%
- */
+/*
+ * #%L
+ * Alfresco Repository
+ * %%
+ * Copyright (C) 2005 - 2016 Alfresco Software Limited
+ * %%
+ * This file is part of the Alfresco software. 
+ * If the software was purchased under a paid Alfresco license, the terms of 
+ * the paid license agreement will prevail.  Otherwise, the software is 
+ * provided under the following open source license terms:
+ * 
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ * #L%
+ */
 package org.alfresco.repo.content.metadata;

 import java.io.File;
@@ -48,8 +48,6 @@ public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
 {
    private static final int MINIMAL_EXPECTED_PROPERTIES_AMOUNT = 3;

-    private static final int IGNORABLE_TIMEOUT = -1;
-
    // private static final int TIMEOUT_FOR_QUICK_EXTRACTION = 2000;

    private static final int DEFAULT_FOOTNOTES_LIMIT = 50;
@@ -67,6 +65,9 @@ public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest


    private PoiMetadataExtracter extracter;
+    
+    /*
+     * Extraction timings recorded by the two footnotes-limit tests; null until the
+     * corresponding test has run. They are static because JUnit creates a separate
+     * instance of the test class for every test method, so an instance field written
+     * by one test would never be visible to the other and the timing comparison
+     * between the two tests could never execute.
+     */
+    private static Long extractionTimeWithDefaultFootnotesLimit;
+    private static Long extractionTimeWithLargeFootnotesLimit;

    @Override
    public void setUp() throws Exception
@@ -245,7 +246,7 @@ public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
     * 
     * @throws Exception
     */
-    public void testFootnotesLimitParameterUsing() throws Exception
+    public void testFootnotesLimitParameterUsingDefault() throws Exception
    {
        PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter();

@@ -256,23 +257,42 @@ public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
        Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
        long startTime = System.currentTimeMillis();
        extractor.extract(sourceReader, properties);
-        long extractionTimeWithDefaultFootnotesLimit = System.currentTimeMillis() - startTime;
+        extractionTimeWithDefaultFootnotesLimit = System.currentTimeMillis() - startTime;

        assertExtractedProperties(properties);
-        assertFalse("Reader was not closed", sourceReader.isChannelOpen());
+        if (extractionTimeWithLargeFootnotesLimit != null)
+        {
+            assertTrue("The second metadata extraction operation must be longer!", extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit);
+        }
+    }
+    
+    
+    /**
+     * Test for MNT-577: Alfresco is running 100% CPU for over 10 minutes while extracting metadata for Word office document
+     * 
+     * @throws Exception
+     */
+    public void testFootnotesLimitParameterUsingLarge() throws Exception
+    {
+        PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter();
+
+        File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile(PROBLEM_FOOTNOTES_DOCUMENT_NAME);
+        ContentReader sourceReader = new FileContentReader(sourceFile);
+        sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING); // read the problem document as an OOXML word-processing file

        // Just let the extractor do the job...
-        configureExtractorLimits(extractor, ALL_MIMETYPES_FILTER, IGNORABLE_TIMEOUT);
        extractor.setPoiFootnotesLimit(LARGE_FOOTNOTES_LIMIT);
        extractor.afterPropertiesSet();
-        properties = new HashMap<QName, Serializable>();
-        startTime = System.currentTimeMillis();
+        Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
+        long startTime = System.currentTimeMillis();
        extractor.extract(sourceReader, properties);
-        long extractionTimeWithLargeFootnotesLimit = System.currentTimeMillis() - startTime;
+        extractionTimeWithLargeFootnotesLimit = System.currentTimeMillis() - startTime; // elapsed wall-clock milliseconds of the extract() call

        assertExtractedProperties(properties);
-        assertTrue("The second metadata extraction operation must be longer!", extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit);
-        assertFalse("Reader was not closed", sourceReader.isChannelOpen());
+        if (extractionTimeWithDefaultFootnotesLimit != null) // compare only when a default-limit timing is available
+        {
+            assertTrue("The second metadata extraction operation must be longer!", extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit);
+        }
    }

    /**