Alan Davis 862e07f3e2 Merged HEAD-BUG-FIX (5.0/Cloud) to HEAD (5.0/Cloud)
84058: Merged V4.2-BUG-FIX (4.2.4) to HEAD-BUG-FIX (5.0/Cloud)
      83799: MNT-12238: Merged DEV 4.2-BUG-FIX (4.2.4) to V4.2-BUG-FIX (4.2.4)
         MNT-12238: Merged 4.1-BUG-FIX (4.1.10) to V4.2-BUG-FIX (4.2.4)
            80291: Merged V4.1.6 (4.1.6.21) to V4.1-BUG-FIX (4.1.10)
               77378: Merged DEV PATCHES/V4.1.6 (19) to PATCHES/V4.1.6 (20)
                  76649: MNT-11823: Upload of PPTX causes very high memory usage leading to system instability
                     - Patch from MNT-577 has been combined with new changes to avoid hanging while analyzing complicated PPTX documents. The fix simply disables reading the entire contents of a complicated document. The POI metadata extractor may be switched back to standard behavior or reconfigured using the following new properties: content.transformer.Poi.poiFootnotesLimit, content.transformer.Poi.poiExtractPropertiesOnly and content-services-context.xml/extracter.Poi/poiAllowableXslfRelationshipTypes
                  77379: MNT-11823: Upload of PPTX causes very high memory usage leading to system instability
                     Test and the test data for MNT-577 have been added. Test for MNT-11823 has also been added, but this test is commented out because the test data (an appropriate PPTX document) is not currently available. Getters for POI specific properties have been added to 'PoiMetadataExtracter' for tests. Also 'afterPropertiesSet()' logic has been slightly modified to allow setting a 'false' value for the 'poiExtractPropertiesOnly' parameter
                  77561: MNT-11823: Upload of PPTX causes very high memory usage leading to system instability
                     Fix for https://bamboo.alfresco.com/bamboo/browse/HF-PATCH416-126 build failure. POI extractor and transformer properties of 'AlfrescoPoiPatchUtils' have been isolated from each other using context. Each extractor or transformer now has its own context or uses the default context. Properties of the default context allow parsing the entire contents of XSLF documents, and the footnotes limit is 50. Property names have not been changed, but currently 'content-services-context.xml/extracter.Poi/poiAllowableXslfRelationshipTypes=null' does not lead to 'content.transformer.Poi.poiExtractPropertiesOnly=false', i.e., this list may be empty. 'PoiMetadataExtracterTest' test has been modified in accordance with the introduced changes. 'poi-OOXML-3.9-beta1-20121109.jar' has been renamed to 'poi-OOXML-3.9-beta1-20121109-patched.jar'
                  79180: MNT-12043: CLONE - Upload of PPTX causes very high memory usage leading to system instability
                     Timeout mechanism has been added to content transformers. Timeout configuration options have been added. Also a mechanism to close streams after 'TimeoutException' has been added to transformers and metadata extractors. Also the timeout mechanism for input streams has been enabled in 'TikaPoweredContentTransformer'
                  79268: MNT-12043: CLONE - Upload of PPTX causes very high memory usage leading to system instability
                     Fix for the https://bamboo.alfresco.com/bamboo/browse/HF-PATCH416-133 build failure and comments of the review https://fisheye.alfresco.com/cru/CR-100#CFR-1184. The new test has been added into 'PoiOOXMLContentTransformerTest.testMnt12043()' to check out the newly added timeout mechanism
                  79290: MNT-12043: CLONE - Upload of PPTX causes very high memory usage leading to system instability
                     - Removed methods and properties that are no longer needed
                  79327: MNT-12043: CLONE - Upload of PPTX causes very high memory usage leading to system instability
                     - Increased ADDITIONAL_PROCESSING_TIME to 1500ms to try and avoid a new intermittent test failure.
      83885: MNT-12238 Bring Maven POM file in sync with latest patched version of poi-ooxml


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@84627 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
2014-09-18 17:23:49 +00:00

287 lines
12 KiB
Java

/*
* Copyright (C) 2005-2014 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.metadata;
import java.io.File;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.filestore.FileContentReader;
import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.namespace.QName;
/**
* @see org.alfresco.repo.content.metadata.PoiMetadataExtracter
*
* @author Neil McErlean
* @author Dmitry Velichkevich
*/
public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
{
    /** Minimum number of properties that {@link #assertExtractedProperties(Map)} requires. */
    private static final int MINIMAL_EXPECTED_PROPERTIES_AMOUNT = 3;

    /** Timeout passed when extraction should run to completion (presumably "no timeout" - confirm against MetadataExtracterLimits). */
    private static final int IGNORABLE_TIMEOUT = -1;

//    private static final int TIMEOUT_FOR_QUICK_EXTRACTION = 2000;

    /** Footnotes limit applied by {@link #resetPoiConfigurationToDefault()}. */
    private static final int DEFAULT_FOOTNOTES_LIMIT = 50;

    /** Footnotes limit used to make the second extraction in {@link #testFootnotesLimitParameterUsing()} do more work. */
    private static final int LARGE_FOOTNOTES_LIMIT = 25000;

    /** Mimetype filter key under which limits are registered for every mimetype. */
    private static final String ALL_MIMETYPES_FILTER = "*";

    /** Extra time (ms) tolerated on top of the configured timeout: wiggle room for logging, cleanup, etc. */
    private static final long TIMEOUT_WIGGLE_ROOM_MS = 100;

    /** Document used by {@link #testProblemFootnotes()}. */
    private static final String TIMEOUT_FOOTNOTES_DOCUMENT_NAME = "problemFootnotes.docx";

    /** Document used by {@link #testFootnotesLimitParameterUsing()}. */
    private static final String PROBLEM_FOOTNOTES_DOCUMENT_NAME = "problemFootnotes2.docx";

//    private static final String PROBLEM_SLIDE_SHOW_DOCUMENT_NAME = "problemSlideShow.pptx";

    /** Name of the bean whose allowable XSLF relationship types are copied onto the extracter under test. */
    private static final String EXTRACTOR_POI_BEAN_NAME = "extracter.Poi";

    /** Extracter under test; re-created for every test in {@link #setUp()}. */
    private PoiMetadataExtracter extracter;

    /**
     * Creates a fresh {@link PoiMetadataExtracter}, applies the default POI configuration and registers it.
     *
     * @throws Exception if the parent set-up or the extracter configuration fails
     */
    @Override
    public void setUp() throws Exception
    {
        super.setUp();

        extracter = new PoiMetadataExtracter();
        extracter.setDictionaryService(dictionaryService);
        resetPoiConfigurationToDefault();
        extracter.register();
    }

    /**
     * Restores the default POI configuration and then runs the parent tear-down.
     *
     * @throws Exception if resetting the configuration or the parent tear-down fails
     */
    @Override
    protected void tearDown() throws Exception
    {
        try
        {
            resetPoiConfigurationToDefault();
        }
        finally
        {
            // The parent clean-up must run even when the reset above fails
            super.tearDown();
        }
    }

    /**
     * Resets POI library configuration of the extracter under test to default: properties-only extraction is enabled,
     * footnotes limit is set to {@link #DEFAULT_FOOTNOTES_LIMIT} and allowable XSLF relationship types are taken from
     * the 'extracter.Poi' bean configuration
     *
     * @throws Exception if {@link PoiMetadataExtracter#afterPropertiesSet()} fails
     */
    private void resetPoiConfigurationToDefault() throws Exception
    {
        PoiMetadataExtracter configuredExtractor = (PoiMetadataExtracter) ctx.getBean(EXTRACTOR_POI_BEAN_NAME);

        extracter.setPoiExtractPropertiesOnly(true);
        extracter.setPoiFootnotesLimit(DEFAULT_FOOTNOTES_LIMIT);
        extracter.setPoiAllowableXslfRelationshipTypes(configuredExtractor.getPoiAllowableXslfRelationshipTypes());
        extracter.afterPropertiesSet();
    }

    /**
     * @return the {@link PoiMetadataExtracter} instance created in {@link #setUp()}
     */
    @Override
    protected MetadataExtracter getExtracter()
    {
        return extracter;
    }

    /**
     * Checks that every mimetype listed in {@link PoiMetadataExtracter#SUPPORTED_MIMETYPES} is reported as supported.
     *
     * @throws Exception on unexpected failure
     */
    public void testSupports() throws Exception
    {
        for (String mimetype : PoiMetadataExtracter.SUPPORTED_MIMETYPES)
        {
            boolean supports = extracter.isSupported(mimetype);
            assertTrue("Mimetype should be supported: " + mimetype, supports);
        }
    }

    /**
     * Runs the inherited extraction check for every supported mimetype.
     *
     * @throws Exception on unexpected failure
     */
    public void testOffice2007Extraction() throws Exception
    {
        for (String mimetype : PoiMetadataExtracter.SUPPORTED_MIMETYPES)
        {
            testExtractFromMimetype(mimetype);
        }
    }

    @Override
    protected boolean skipDescriptionCheck(String mimetype)
    {
        // Our 3 Office 2007 (OOXML) quick files have no description properties.
        return true;
    }

    @Override
    protected void testFileSpecificMetadata(String mimetype,
            Map<QName, Serializable> properties)
    {
        // This test class is testing 3 files: quick.docx, quick.xlsx & quick.pptx.
        // Their created times are hard-coded here for checking.
        // Of course this means that if the files are updated, the test will break
        // but those files are rarely modified - only added to.

        if (MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING.equals(mimetype))
        {
            checkFileCreationDate(mimetype, properties, "2010-01-06T17:32:00.000Z");
        }
        else if (MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET.equals(mimetype))
        {
            checkFileCreationDate(mimetype, properties, "1996-10-14T23:33:28.000Z");
        }
        else if (MimetypeMap.MIMETYPE_OPENXML_PRESENTATION.equals(mimetype))
        {
            // Extraordinary! This document predates Isaac Newton's Principia Mathematica by almost a century. ;)
            checkFileCreationDate(mimetype, properties, "1601-01-01T00:00:00.000Z");
        }
    }

    /**
     * Asserts that the extracted {@link ContentModel#PROP_CREATED} value, converted to {@link String}, equals <code>date</code>
     *
     * @param mimetype - {@link String} mimetype of the checked document (used in the failure message only)
     * @param properties - extracted properties
     * @param date - {@link String} expected creation date
     */
    private void checkFileCreationDate(String mimetype, Map<QName, Serializable> properties, String date)
    {
        assertEquals("Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype, date,
                DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATED)));
    }

    /**
     * Tests that metadata extraction from a somewhat corrupt file with several
     * thousand footnotes times out properly.
     *
     * @throws Exception on unexpected failure
     */
    public void testProblemFootnotes() throws Exception
    {
        long timeoutMs = 2000;

        configureExtractorLimits((PoiMetadataExtracter) getExtracter(), ALL_MIMETYPES_FILTER, timeoutMs);

        File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile(TIMEOUT_FOOTNOTES_DOCUMENT_NAME);

        Map<QName, Serializable> properties = new HashMap<QName, Serializable>();

        // construct a reader onto the source file
        ContentReader sourceReader = new FileContentReader(sourceFile);
        sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING);

        long startTime = System.currentTimeMillis();

        getExtracter().extract(sourceReader, properties);

        long extractionTime = System.currentTimeMillis() - startTime;

        assertTrue("Metadata extraction took (" + extractionTime + "ms) " +
                "but should have failed with a timeout at " + timeoutMs + "ms",
                extractionTime < (timeoutMs + TIMEOUT_WIGGLE_ROOM_MS));

        assertFalse("Reader was not closed", sourceReader.isChannelOpen());
    }

    // NOTE: the test below is disabled. According to the change history of this file the appropriate PPTX test
    // document is not currently available; re-enable it (together with the commented-out constants above) once
    // the document has been added to the test resources.
//    /**
//     * Test for MNT-11823: Upload of PPTX causes very high memory usage leading to system instability
//     *
//     * @throws Exception
//     */
//    public void testProblemSlideShow() throws Exception
//    {
//        PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter();
//        configureExtractorLimits(extractor, ALL_MIMETYPES_FILTER, TIMEOUT_FOR_QUICK_EXTRACTION);
//
//        File problemSlideShowFile = AbstractContentTransformerTest.loadNamedQuickTestFile(PROBLEM_SLIDE_SHOW_DOCUMENT_NAME);
//        ContentReader sourceReader = new FileContentReader(problemSlideShowFile);
//        sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION);
//
//        Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
//        extractor.extract(sourceReader, properties);
//
//        assertExtractedProperties(properties);
//        assertFalse("Reader was not closed", sourceReader.isChannelOpen());
//
//        extractor.setPoiExtractPropertiesOnly(false);
//        extractor.afterPropertiesSet();
//        properties = new HashMap<QName, Serializable>();
//        extractor.extract(sourceReader, properties);
//
//        assertFalse("Reader was not closed", sourceReader.isChannelOpen());
//        assertTrue(("Extraction completed successfully but failure is expected! Invalid properties are: " + properties), (null == properties) || properties.isEmpty());
//    }

    /**
     * Configures timeout for given <code>extractor</code> and <code>mimetypeFilter</code>. Any limits previously set
     * on the extractor through {@link PoiMetadataExtracter#setMimetypeLimits(Map)} are replaced by a single entry
     *
     * @param extractor - {@link PoiMetadataExtracter} instance
     * @param mimetypeFilter - {@link String} value which specifies mimetype filter for which timeout should be applied
     * @param timeout - <code>long</code> value which specifies timeout, in milliseconds, for <code>mimetypeFilter</code>
     */
    private void configureExtractorLimits(PoiMetadataExtracter extractor, String mimetypeFilter, long timeout)
    {
        MetadataExtracterLimits limits = new MetadataExtracterLimits();
        limits.setTimeoutMs(timeout);

        HashMap<String, MetadataExtracterLimits> mimetypeLimits = new HashMap<String, MetadataExtracterLimits>(1);
        mimetypeLimits.put(mimetypeFilter, limits);

        extractor.setMimetypeLimits(mimetypeLimits);
    }

    /**
     * Test for MNT-577: Alfresco is running 100% CPU for over 10 minutes while extracting metadata for Word office document.
     * <p>
     * Extracts the same document twice: first with the default footnotes limit, then with
     * {@link #LARGE_FOOTNOTES_LIMIT} and an ignorable timeout, and expects the second run to take longer.
     *
     * @throws Exception on unexpected failure
     */
    public void testFootnotesLimitParameterUsing() throws Exception
    {
        PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter();

        File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile(PROBLEM_FOOTNOTES_DOCUMENT_NAME);
        ContentReader sourceReader = new FileContentReader(sourceFile);
        sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING);

        Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
        long startTime = System.currentTimeMillis();
        extractor.extract(sourceReader, properties);
        long extractionTimeWithDefaultFootnotesLimit = System.currentTimeMillis() - startTime;

        assertExtractedProperties(properties);
        assertFalse("Reader was not closed", sourceReader.isChannelOpen());

        // Just let the extractor do the job...
        configureExtractorLimits(extractor, ALL_MIMETYPES_FILTER, IGNORABLE_TIMEOUT);
        extractor.setPoiFootnotesLimit(LARGE_FOOTNOTES_LIMIT);
        extractor.afterPropertiesSet();
        properties = new HashMap<QName, Serializable>();

        // NOTE(review): the same reader instance is reused for the second extraction - confirm that the
        // extracter supports re-reading from a reader which has already been consumed
        startTime = System.currentTimeMillis();
        extractor.extract(sourceReader, properties);
        long extractionTimeWithLargeFootnotesLimit = System.currentTimeMillis() - startTime;

        assertExtractedProperties(properties);
        // NOTE(review): wall-clock comparison - may be sensitive to machine load
        assertTrue("The second metadata extraction operation must be longer!", extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit);
        assertFalse("Reader was not closed", sourceReader.isChannelOpen());
    }

    /**
     * Asserts extracted <code>properties</code>. At least {@link PoiMetadataExtracterTest#MINIMAL_EXPECTED_PROPERTIES_AMOUNT} properties are expected:
     * {@link ContentModel#PROP_TITLE}, {@link ContentModel#PROP_AUTHOR} and {@link ContentModel#PROP_CREATED}
     *
     * @param properties - {@link Map}&lt;{@link QName}, {@link Serializable}&gt; instance which contains all extracted properties
     */
    private void assertExtractedProperties(Map<QName, Serializable> properties)
    {
        assertNotNull("Properties were not extracted at all!", properties);
        assertFalse("Extracted properties are empty!", properties.isEmpty());
        assertTrue(("Expected at least " + MINIMAL_EXPECTED_PROPERTIES_AMOUNT + " extracted properties but only " + properties.size() + " have been extracted!"),
                properties.size() >= MINIMAL_EXPECTED_PROPERTIES_AMOUNT);
        assertTrue(("'" + ContentModel.PROP_TITLE + "' property is missing!"), properties.containsKey(ContentModel.PROP_TITLE));
        assertTrue(("'" + ContentModel.PROP_AUTHOR + "' property is missing!"), properties.containsKey(ContentModel.PROP_AUTHOR));
        assertTrue(("'" + ContentModel.PROP_CREATED + "' property is missing!"), properties.containsKey(ContentModel.PROP_CREATED));
    }
}