mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-10-08 14:51:49 +00:00
- added concurrent extraction limit - added max document size limit git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/BRANCHES/DEV/5.2.N/root@131709 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
314 lines
14 KiB
Java
314 lines
14 KiB
Java
/*
 * #%L
 * Alfresco Repository
 * %%
 * Copyright (C) 2005 - 2016 Alfresco Software Limited
 * %%
 * This file is part of the Alfresco software.
 * If the software was purchased under a paid Alfresco license, the terms of
 * the paid license agreement will prevail. Otherwise, the software is
 * provided under the following open source license terms:
 *
 * Alfresco is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Alfresco is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
 * #L%
 */
|
|
package org.alfresco.repo.content.metadata;
|
|
|
|
import java.io.File;
|
|
import java.io.Serializable;
|
|
import java.util.HashMap;
|
|
import java.util.Map;
|
|
|
|
import org.alfresco.model.ContentModel;
|
|
import org.alfresco.repo.content.MimetypeMap;
|
|
import org.alfresco.repo.content.filestore.FileContentReader;
|
|
import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
|
|
import org.alfresco.service.cmr.repository.ContentReader;
|
|
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
|
|
import org.alfresco.service.namespace.QName;
|
|
|
|
/**
|
|
* @see org.alfresco.repo.content.metadata.PoiMetadataExtracter
|
|
*
|
|
* @author Neil McErlean
|
|
* @author Dmitry Velichkevich
|
|
*/
|
|
public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
|
|
{
|
|
private static final int MINIMAL_EXPECTED_PROPERTIES_AMOUNT = 3;
|
|
|
|
// private static final int TIMEOUT_FOR_QUICK_EXTRACTION = 2000;
|
|
|
|
private static final int DEFAULT_FOOTNOTES_LIMIT = 50;
|
|
|
|
private static final int LARGE_FOOTNOTES_LIMIT = 25000;
|
|
|
|
|
|
private static final String ALL_MIMETYPES_FILTER = "*";
|
|
|
|
private static final String PROBLEM_FOOTNOTES_DOCUMENT_NAME = "problemFootnotes2.docx";
|
|
|
|
// private static final String PROBLEM_SLIDE_SHOW_DOCUMENT_NAME = "problemSlideShow.pptx";
|
|
|
|
private static final String EXTRACTOR_POI_BEAN_NAME = "extracter.Poi";
|
|
|
|
|
|
private PoiMetadataExtracter extracter;
|
|
|
|
private Long extractionTimeWithDefaultFootnotesLimit;
|
|
private Long extractionTimeWithLargeFootnotesLimit;
|
|
|
|
@Override
|
|
public void setUp() throws Exception
|
|
{
|
|
super.setUp();
|
|
extracter = new PoiMetadataExtracter();
|
|
extracter.setDictionaryService(dictionaryService);
|
|
resetPoiConfigurationToDefault();
|
|
extracter.register();
|
|
}
|
|
|
|
@Override
|
|
protected void tearDown() throws Exception
|
|
{
|
|
resetPoiConfigurationToDefault();
|
|
super.tearDown();
|
|
}
|
|
|
|
/**
|
|
* Resets POI library configuration to default. Sets allowable XSLF relationship types and footnotes limit as per 'extracter.Poi' bean configuration
|
|
*
|
|
* @throws Exception
|
|
*/
|
|
private void resetPoiConfigurationToDefault() throws Exception
|
|
{
|
|
PoiMetadataExtracter configuredExtractor = (PoiMetadataExtracter) ctx.getBean(EXTRACTOR_POI_BEAN_NAME);
|
|
extracter.setPoiExtractPropertiesOnly(true);
|
|
extracter.setPoiFootnotesLimit(DEFAULT_FOOTNOTES_LIMIT);
|
|
extracter.setPoiAllowableXslfRelationshipTypes(configuredExtractor.getPoiAllowableXslfRelationshipTypes());
|
|
extracter.afterPropertiesSet();
|
|
}
|
|
|
|
@Override
|
|
protected MetadataExtracter getExtracter()
|
|
{
|
|
return extracter;
|
|
}
|
|
|
|
public void testSupports() throws Exception
|
|
{
|
|
for (String mimetype : PoiMetadataExtracter.SUPPORTED_MIMETYPES)
|
|
{
|
|
boolean supports = extracter.isSupported(mimetype);
|
|
assertTrue("Mimetype should be supported: " + mimetype, supports);
|
|
}
|
|
}
|
|
|
|
public void testOffice2007Extraction() throws Exception
|
|
{
|
|
for (String mimetype : PoiMetadataExtracter.SUPPORTED_MIMETYPES)
|
|
{
|
|
testExtractFromMimetype(mimetype);
|
|
}
|
|
}
|
|
|
|
@Override
|
|
protected boolean skipDescriptionCheck(String mimetype)
|
|
{
|
|
// Our 3 OpenOffice 07 quick files have no description properties.
|
|
return true;
|
|
}
|
|
|
|
|
|
@Override
|
|
protected void testFileSpecificMetadata(String mimetype,
|
|
Map<QName, Serializable> properties)
|
|
{
|
|
// This test class is testing 3 files: quick.docx, quick.xlsx & quick.pptx.
|
|
// Their created times are hard-coded here for checking.
|
|
// Of course this means that if the files are updated, the test will break
|
|
// but those files are rarely modified - only added to.
|
|
if (MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING.equals(mimetype))
|
|
{
|
|
checkFileCreationDate(mimetype, properties, "2010-01-06T17:32:00.000Z");
|
|
}
|
|
else if (MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET.equals(mimetype))
|
|
{
|
|
checkFileCreationDate(mimetype, properties, "1996-10-14T23:33:28.000Z");
|
|
}
|
|
else if (MimetypeMap.MIMETYPE_OPENXML_PRESENTATION.equals(mimetype))
|
|
{
|
|
// Extraordinary! This document predates Isaac Newton's Principia Mathematica by almost a century. ;)
|
|
checkFileCreationDate(mimetype, properties, "1601-01-01T00:00:00.000Z");
|
|
}
|
|
}
|
|
|
|
private void checkFileCreationDate(String mimetype, Map<QName, Serializable> properties, String date)
|
|
{
|
|
assertEquals("Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype, date,
|
|
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATED)));
|
|
}
|
|
|
|
/**
|
|
* Tests that metadata extraction from a somewhat corrupt file with several
|
|
* thousand footnotes times out properly.
|
|
*
|
|
* @throws Exception
|
|
*/
|
|
public void testProblemFootnotes() throws Exception
|
|
{
|
|
long timeoutMs = 2000;
|
|
|
|
MetadataExtracterLimits limits = new MetadataExtracterLimits();
|
|
limits.setTimeoutMs(timeoutMs);
|
|
HashMap<String, MetadataExtracterLimits> mimetypeLimits =
|
|
new HashMap<String, MetadataExtracterLimits>(1);
|
|
mimetypeLimits.put(ALL_MIMETYPES_FILTER, limits);
|
|
((PoiMetadataExtracter) getExtracter()).setMimetypeLimits(mimetypeLimits);
|
|
|
|
File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile("problemFootnotes.docx");
|
|
|
|
Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
|
|
// construct a reader onto the source file
|
|
ContentReader sourceReader = new FileContentReader(sourceFile);
|
|
sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING);
|
|
|
|
long startTime = System.currentTimeMillis();
|
|
|
|
getExtracter().extract(sourceReader, properties);
|
|
|
|
long extractionTime = System.currentTimeMillis() - startTime;
|
|
|
|
assertTrue("Metadata extraction took (" + extractionTime + "ms) " +
|
|
"but should have failed with a timeout at " + timeoutMs + "ms",
|
|
extractionTime < (timeoutMs + 100)); // bit of wiggle room for logging, cleanup, etc.
|
|
assertFalse("Reader was not closed", sourceReader.isChannelOpen());
|
|
}
|
|
|
|
// /**
|
|
// * Test for MNT-11823: Upload of PPTX causes very high memory usage leading to system instability
|
|
// *
|
|
// * @throws Exception
|
|
// */
|
|
// public void testProblemSlideShow() throws Exception
|
|
// {
|
|
// PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter();
|
|
// configureExtractorLimits(extractor, ALL_MIMETYPES_FILTER, TIMEOUT_FOR_QUICK_EXTRACTION);
|
|
//
|
|
// File problemSlideShowFile = AbstractContentTransformerTest.loadNamedQuickTestFile(PROBLEM_SLIDE_SHOW_DOCUMENT_NAME);
|
|
// ContentReader sourceReader = new FileContentReader(problemSlideShowFile);
|
|
// sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION);
|
|
//
|
|
// Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
|
|
// extractor.extract(sourceReader, properties);
|
|
//
|
|
// assertExtractedProperties(properties);
|
|
// assertFalse("Reader was not closed", sourceReader.isChannelOpen());
|
|
//
|
|
// extractor.setPoiExtractPropertiesOnly(false);
|
|
// extractor.afterPropertiesSet();
|
|
// properties = new HashMap<QName, Serializable>();
|
|
// extractor.extract(sourceReader, properties);
|
|
//
|
|
// assertFalse("Reader was not closed", sourceReader.isChannelOpen());
|
|
// assertTrue(("Extraction completed successfully but failure is expected! Invalid properties are: " + properties), (null == properties) || properties.isEmpty());
|
|
// }
|
|
|
|
/**
|
|
* Configures timeout for given <code>extractor</code> and <code>mimetypeFilter</code>
|
|
*
|
|
* @param extractor - {@link PoiMetadataExtracter} instance
|
|
* @param mimetypeFilter - {@link String} value which specifies mimetype filter for which timeout should be applied
|
|
* @param timeout - {@link Long} value which specifies timeout for <code>mimetypeFilter</code>
|
|
*/
|
|
private void configureExtractorLimits(PoiMetadataExtracter extractor, String mimetypeFilter, long timeout)
|
|
{
|
|
MetadataExtracterLimits limits = new MetadataExtracterLimits();
|
|
limits.setTimeoutMs(timeout);
|
|
HashMap<String, MetadataExtracterLimits> mimetypeLimits = new HashMap<String, MetadataExtracterLimits>(1);
|
|
mimetypeLimits.put(mimetypeFilter, limits);
|
|
extractor.setMimetypeLimits(mimetypeLimits);
|
|
}
|
|
|
|
/**
|
|
* Test for MNT-577: Alfresco is running 100% CPU for over 10 minutes while extracting metadata for Word office document
|
|
*
|
|
* @throws Exception
|
|
*/
|
|
public void testFootnotesLimitParameterUsingDefault() throws Exception
|
|
{
|
|
PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter();
|
|
|
|
File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile(PROBLEM_FOOTNOTES_DOCUMENT_NAME);
|
|
ContentReader sourceReader = new FileContentReader(sourceFile);
|
|
sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING);
|
|
|
|
Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
|
|
long startTime = System.currentTimeMillis();
|
|
extractor.extract(sourceReader, properties);
|
|
extractionTimeWithDefaultFootnotesLimit = System.currentTimeMillis() - startTime;
|
|
|
|
assertExtractedProperties(properties);
|
|
if (extractionTimeWithLargeFootnotesLimit != null)
|
|
{
|
|
assertTrue("The second metadata extraction operation must be longer!", extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* Test for MNT-577: Alfresco is running 100% CPU for over 10 minutes while extracting metadata for Word office document
|
|
*
|
|
* @throws Exception
|
|
*/
|
|
public void testFootnotesLimitParameterUsingLarge() throws Exception
|
|
{
|
|
PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter();
|
|
|
|
File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile(PROBLEM_FOOTNOTES_DOCUMENT_NAME);
|
|
ContentReader sourceReader = new FileContentReader(sourceFile);
|
|
sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING);
|
|
|
|
// Just let the extractor do the job...
|
|
extractor.setPoiFootnotesLimit(LARGE_FOOTNOTES_LIMIT);
|
|
extractor.afterPropertiesSet();
|
|
Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
|
|
long startTime = System.currentTimeMillis();
|
|
extractor.extract(sourceReader, properties);
|
|
extractionTimeWithLargeFootnotesLimit = System.currentTimeMillis() - startTime;
|
|
|
|
assertExtractedProperties(properties);
|
|
if (extractionTimeWithDefaultFootnotesLimit != null)
|
|
{
|
|
assertTrue("The second metadata extraction operation must be longer!", extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Asserts extracted <code>properties</code>. At least {@link PoiMetadataExtracterTest#MINIMAL_EXPECTED_PROPERTIES_AMOUNT} properties are expected:
|
|
* {@link ContentModel#PROP_TITLE}, {@link ContentModel#PROP_AUTHOR} and {@link ContentModel#PROP_CREATED}
|
|
*
|
|
* @param properties - {@link Map}<{@link QName}, {@link Serializable}> instance which contains all extracted properties
|
|
*/
|
|
private void assertExtractedProperties(Map<QName, Serializable> properties)
|
|
{
|
|
assertNotNull("Properties were not extracted at all!", properties);
|
|
assertFalse("Extracted properties are empty!", properties.isEmpty());
|
|
assertTrue(("Expected 3 extracted properties but only " + properties.size() + " have been extracted!"), properties.size() >= MINIMAL_EXPECTED_PROPERTIES_AMOUNT);
|
|
assertTrue(("'" + ContentModel.PROP_TITLE + "' property is missing!"), properties.containsKey(ContentModel.PROP_TITLE));
|
|
assertTrue(("'" + ContentModel.PROP_AUTHOR + "' property is missing!"), properties.containsKey(ContentModel.PROP_AUTHOR));
|
|
assertTrue(("'" + ContentModel.PROP_CREATED + "' property is missing!"), properties.containsKey(ContentModel.PROP_CREATED));
|
|
}
|
|
}
|