mirror of
				https://github.com/Alfresco/alfresco-community-repo.git
				synced 2025-10-29 15:21:53 +00:00 
			
		
		
		
	- added concurrent extraction limit - added max document size limit git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/BRANCHES/DEV/5.2.N/root@131709 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
		
			
				
	
	
		
			314 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Java
		
	
	
	
	
	
			
		
		
	
	
			314 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Java
		
	
	
	
	
	
/*
 * #%L
 * Alfresco Repository
 * %%
 * Copyright (C) 2005 - 2016 Alfresco Software Limited
 * %%
 * This file is part of the Alfresco software.
 * If the software was purchased under a paid Alfresco license, the terms of
 * the paid license agreement will prevail.  Otherwise, the software is
 * provided under the following open source license terms:
 *
 * Alfresco is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Alfresco is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
 * #L%
 */
 | |
| package org.alfresco.repo.content.metadata;
 | |
| 
 | |
| import java.io.File;
 | |
| import java.io.Serializable;
 | |
| import java.util.HashMap;
 | |
| import java.util.Map;
 | |
| 
 | |
| import org.alfresco.model.ContentModel;
 | |
| import org.alfresco.repo.content.MimetypeMap;
 | |
| import org.alfresco.repo.content.filestore.FileContentReader;
 | |
| import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
 | |
| import org.alfresco.service.cmr.repository.ContentReader;
 | |
| import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
 | |
| import org.alfresco.service.namespace.QName;
 | |
| 
 | |
| /**
 | |
|  * @see org.alfresco.repo.content.metadata.PoiMetadataExtracter
 | |
|  * 
 | |
|  * @author Neil McErlean
 | |
|  * @author Dmitry Velichkevich
 | |
|  */
 | |
| public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
 | |
| {
 | |
|     private static final int MINIMAL_EXPECTED_PROPERTIES_AMOUNT = 3;
 | |
| 
 | |
|     // private static final int TIMEOUT_FOR_QUICK_EXTRACTION = 2000;
 | |
| 
 | |
|     private static final int DEFAULT_FOOTNOTES_LIMIT = 50;
 | |
| 
 | |
|     private static final int LARGE_FOOTNOTES_LIMIT = 25000;
 | |
| 
 | |
| 
 | |
|     private static final String ALL_MIMETYPES_FILTER = "*";
 | |
| 
 | |
|     private static final String PROBLEM_FOOTNOTES_DOCUMENT_NAME = "problemFootnotes2.docx";
 | |
| 
 | |
|     // private static final String PROBLEM_SLIDE_SHOW_DOCUMENT_NAME = "problemSlideShow.pptx";
 | |
| 
 | |
|     private static final String EXTRACTOR_POI_BEAN_NAME = "extracter.Poi";
 | |
| 
 | |
| 
 | |
|     private PoiMetadataExtracter extracter;
 | |
|     
 | |
|     private Long extractionTimeWithDefaultFootnotesLimit;
 | |
|     private Long extractionTimeWithLargeFootnotesLimit;
 | |
| 
 | |
|     @Override
 | |
|     public void setUp() throws Exception
 | |
|     {
 | |
|         super.setUp();
 | |
|         extracter = new PoiMetadataExtracter();
 | |
|         extracter.setDictionaryService(dictionaryService);
 | |
|         resetPoiConfigurationToDefault();
 | |
|         extracter.register();
 | |
|     }
 | |
| 
 | |
|     @Override
 | |
|     protected void tearDown() throws Exception
 | |
|     {
 | |
|         resetPoiConfigurationToDefault();
 | |
|         super.tearDown();
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Resets POI library configuration to default. Sets allowable XSLF relationship types and footnotes limit as per 'extracter.Poi' bean configuration
 | |
|      * 
 | |
|      * @throws Exception
 | |
|      */
 | |
|     private void resetPoiConfigurationToDefault() throws Exception
 | |
|     {
 | |
|         PoiMetadataExtracter configuredExtractor = (PoiMetadataExtracter) ctx.getBean(EXTRACTOR_POI_BEAN_NAME);
 | |
|         extracter.setPoiExtractPropertiesOnly(true);
 | |
|         extracter.setPoiFootnotesLimit(DEFAULT_FOOTNOTES_LIMIT);
 | |
|         extracter.setPoiAllowableXslfRelationshipTypes(configuredExtractor.getPoiAllowableXslfRelationshipTypes());
 | |
|         extracter.afterPropertiesSet();
 | |
|     }
 | |
| 
 | |
|     @Override
 | |
|     protected MetadataExtracter getExtracter()
 | |
|     {
 | |
|         return extracter;
 | |
|     }
 | |
| 
 | |
|     public void testSupports() throws Exception
 | |
|     {
 | |
|         for (String mimetype : PoiMetadataExtracter.SUPPORTED_MIMETYPES)
 | |
|         {
 | |
|             boolean supports = extracter.isSupported(mimetype);
 | |
|             assertTrue("Mimetype should be supported: " + mimetype, supports);
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     public void testOffice2007Extraction() throws Exception
 | |
|     {
 | |
|         for (String mimetype : PoiMetadataExtracter.SUPPORTED_MIMETYPES)
 | |
|         {
 | |
|             testExtractFromMimetype(mimetype);
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     @Override
 | |
|     protected boolean skipDescriptionCheck(String mimetype) 
 | |
|     {
 | |
|         // Our 3 OpenOffice 07 quick files have no description properties.
 | |
|         return true;
 | |
|     }
 | |
| 
 | |
| 
 | |
|     @Override
 | |
|     protected void testFileSpecificMetadata(String mimetype,
 | |
|          Map<QName, Serializable> properties) 
 | |
|     {
 | |
|         // This test class is testing 3 files: quick.docx, quick.xlsx & quick.pptx.
 | |
|         // Their created times are hard-coded here for checking.
 | |
|         // Of course this means that if the files are updated, the test will break
 | |
|         // but those files are rarely modified - only added to.
 | |
|         if (MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING.equals(mimetype))
 | |
|         {
 | |
|             checkFileCreationDate(mimetype, properties, "2010-01-06T17:32:00.000Z");
 | |
|         }
 | |
|         else if (MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET.equals(mimetype))
 | |
|         {
 | |
|             checkFileCreationDate(mimetype, properties, "1996-10-14T23:33:28.000Z");
 | |
|         }
 | |
|         else if (MimetypeMap.MIMETYPE_OPENXML_PRESENTATION.equals(mimetype))
 | |
|         {
 | |
|             // Extraordinary! This document predates Isaac Newton's Principia Mathematica by almost a century. ;)
 | |
|             checkFileCreationDate(mimetype, properties, "1601-01-01T00:00:00.000Z");
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     private void checkFileCreationDate(String mimetype, Map<QName, Serializable> properties, String date)
 | |
|     {
 | |
|         assertEquals("Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype, date,
 | |
|                 DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATED)));
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * Tests that metadata extraction from a somewhat corrupt file with several
 | |
|      * thousand footnotes times out properly.
 | |
|      * 
 | |
|      * @throws Exception
 | |
|      */
 | |
|     public void testProblemFootnotes() throws Exception
 | |
|     {
 | |
|         long timeoutMs = 2000;
 | |
|         
 | |
|         MetadataExtracterLimits limits = new MetadataExtracterLimits();
 | |
|         limits.setTimeoutMs(timeoutMs);
 | |
|         HashMap<String, MetadataExtracterLimits> mimetypeLimits =
 | |
|                 new HashMap<String, MetadataExtracterLimits>(1);
 | |
|         mimetypeLimits.put(ALL_MIMETYPES_FILTER, limits);
 | |
|         ((PoiMetadataExtracter) getExtracter()).setMimetypeLimits(mimetypeLimits);
 | |
|         
 | |
|         File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile("problemFootnotes.docx");
 | |
|         
 | |
|         Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
 | |
|         // construct a reader onto the source file
 | |
|         ContentReader sourceReader = new FileContentReader(sourceFile);
 | |
|         sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING);
 | |
|         
 | |
|         long startTime = System.currentTimeMillis();
 | |
| 
 | |
|         getExtracter().extract(sourceReader, properties);
 | |
|         
 | |
|         long extractionTime = System.currentTimeMillis() - startTime;
 | |
|         
 | |
|         assertTrue("Metadata extraction took (" + extractionTime + "ms) " +
 | |
|                 "but should have failed with a timeout at " + timeoutMs + "ms", 
 | |
|                 extractionTime < (timeoutMs + 100)); // bit of wiggle room for logging, cleanup, etc.
 | |
|         assertFalse("Reader was not closed", sourceReader.isChannelOpen());
 | |
|     }
 | |
| 
 | |
| //    /**
 | |
| //     * Test for MNT-11823: Upload of PPTX causes very high memory usage leading to system instability
 | |
| //     * 
 | |
| //     * @throws Exception
 | |
| //     */
 | |
| //    public void testProblemSlideShow() throws Exception
 | |
| //    {
 | |
| //        PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter();
 | |
| //        configureExtractorLimits(extractor, ALL_MIMETYPES_FILTER, TIMEOUT_FOR_QUICK_EXTRACTION);
 | |
| //
 | |
| //        File problemSlideShowFile = AbstractContentTransformerTest.loadNamedQuickTestFile(PROBLEM_SLIDE_SHOW_DOCUMENT_NAME);
 | |
| //        ContentReader sourceReader = new FileContentReader(problemSlideShowFile);
 | |
| //        sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION);
 | |
| //
 | |
| //        Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
 | |
| //        extractor.extract(sourceReader, properties);
 | |
| //
 | |
| //        assertExtractedProperties(properties);
 | |
| //        assertFalse("Reader was not closed", sourceReader.isChannelOpen());
 | |
| //
 | |
| //        extractor.setPoiExtractPropertiesOnly(false);
 | |
| //        extractor.afterPropertiesSet();
 | |
| //        properties = new HashMap<QName, Serializable>();
 | |
| //        extractor.extract(sourceReader, properties);
 | |
| //
 | |
| //        assertFalse("Reader was not closed", sourceReader.isChannelOpen());
 | |
| //        assertTrue(("Extraction completed successfully but failure is expected! Invalid properties are: " + properties), (null == properties) || properties.isEmpty());
 | |
| //    }
 | |
| 
 | |
|     /**
 | |
|      * Configures timeout for given <code>extractor</code> and <code>mimetypeFilter</code>
 | |
|      * 
 | |
|      * @param extractor - {@link PoiMetadataExtracter} instance
 | |
|      * @param mimetypeFilter - {@link String} value which specifies mimetype filter for which timeout should be applied
 | |
|      * @param timeout - {@link Long} value which specifies timeout for <code>mimetypeFilter</code>
 | |
|      */
 | |
|     private void configureExtractorLimits(PoiMetadataExtracter extractor, String mimetypeFilter, long timeout)
 | |
|     {
 | |
|         MetadataExtracterLimits limits = new MetadataExtracterLimits();
 | |
|         limits.setTimeoutMs(timeout);
 | |
|         HashMap<String, MetadataExtracterLimits> mimetypeLimits = new HashMap<String, MetadataExtracterLimits>(1);
 | |
|         mimetypeLimits.put(mimetypeFilter, limits);
 | |
|         extractor.setMimetypeLimits(mimetypeLimits);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Test for MNT-577: Alfresco is running 100% CPU for over 10 minutes while extracting metadata for Word office document
 | |
|      * 
 | |
|      * @throws Exception
 | |
|      */
 | |
|     public void testFootnotesLimitParameterUsingDefault() throws Exception
 | |
|     {
 | |
|         PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter();
 | |
| 
 | |
|         File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile(PROBLEM_FOOTNOTES_DOCUMENT_NAME);
 | |
|         ContentReader sourceReader = new FileContentReader(sourceFile);
 | |
|         sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING);
 | |
| 
 | |
|         Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
 | |
|         long startTime = System.currentTimeMillis();
 | |
|         extractor.extract(sourceReader, properties);
 | |
|         extractionTimeWithDefaultFootnotesLimit = System.currentTimeMillis() - startTime;
 | |
| 
 | |
|         assertExtractedProperties(properties);
 | |
|         if (extractionTimeWithLargeFootnotesLimit != null)
 | |
|         {
 | |
|             assertTrue("The second metadata extraction operation must be longer!", extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit);
 | |
|         }
 | |
|     }
 | |
|     
 | |
|     
 | |
|     /**
 | |
|      * Test for MNT-577: Alfresco is running 100% CPU for over 10 minutes while extracting metadata for Word office document
 | |
|      * 
 | |
|      * @throws Exception
 | |
|      */
 | |
|     public void testFootnotesLimitParameterUsingLarge() throws Exception
 | |
|     {
 | |
|         PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter();
 | |
| 
 | |
|         File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile(PROBLEM_FOOTNOTES_DOCUMENT_NAME);
 | |
|         ContentReader sourceReader = new FileContentReader(sourceFile);
 | |
|         sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING);
 | |
| 
 | |
|         // Just let the extractor do the job...
 | |
|         extractor.setPoiFootnotesLimit(LARGE_FOOTNOTES_LIMIT);
 | |
|         extractor.afterPropertiesSet();
 | |
|         Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
 | |
|         long startTime = System.currentTimeMillis();
 | |
|         extractor.extract(sourceReader, properties);
 | |
|         extractionTimeWithLargeFootnotesLimit = System.currentTimeMillis() - startTime;
 | |
| 
 | |
|         assertExtractedProperties(properties);
 | |
|         if (extractionTimeWithDefaultFootnotesLimit != null)
 | |
|         {
 | |
|             assertTrue("The second metadata extraction operation must be longer!", extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit);
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Asserts extracted <code>properties</code>. At least {@link PoiMetadataExtracterTest#MINIMAL_EXPECTED_PROPERTIES_AMOUNT} properties are expected:
 | |
|      * {@link ContentModel#PROP_TITLE}, {@link ContentModel#PROP_AUTHOR} and {@link ContentModel#PROP_CREATED}
 | |
|      * 
 | |
|      * @param properties - {@link Map}<{@link QName}, {@link Serializable}> instance which contains all extracted properties
 | |
|      */
 | |
|     private void assertExtractedProperties(Map<QName, Serializable> properties)
 | |
|     {
 | |
|         assertNotNull("Properties were not extracted at all!", properties);
 | |
|         assertFalse("Extracted properties are empty!", properties.isEmpty());
 | |
|         assertTrue(("Expected 3 extracted properties but only " + properties.size() + " have been extracted!"), properties.size() >= MINIMAL_EXPECTED_PROPERTIES_AMOUNT);
 | |
|         assertTrue(("'" + ContentModel.PROP_TITLE + "' property is missing!"), properties.containsKey(ContentModel.PROP_TITLE));
 | |
|         assertTrue(("'" + ContentModel.PROP_AUTHOR + "' property is missing!"), properties.containsKey(ContentModel.PROP_AUTHOR));
 | |
|         assertTrue(("'" + ContentModel.PROP_CREATED + "' property is missing!"), properties.containsKey(ContentModel.PROP_CREATED));
 | |
|     }
 | |
| }
 |