/*
 * Copyright (C) 2005-2014 Alfresco Software Limited.
 *
 * This file is part of Alfresco
 *
 * Alfresco is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Alfresco is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
 */
package org.alfresco.repo.content.metadata;

import java.io.File;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;

import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.filestore.FileContentReader;
import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.namespace.QName;

/**
 * Tests for the POI-based metadata extracter.
 *
 * @see org.alfresco.repo.content.metadata.PoiMetadataExtracter
 *
 * @author Neil McErlean
 * @author Dmitry Velichkevich
 */
public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
{
    /** Minimum number of properties {@link #assertExtractedProperties(Map)} expects to find. */
    private static final int MINIMAL_EXPECTED_PROPERTIES_AMOUNT = 3;

    /** Timeout value passed to the limits when the extraction should be left to run. */
    private static final int IGNORABLE_TIMEOUT = -1;

    // Only referenced by the disabled testProblemSlideShow() below.
    // private static final int TIMEOUT_FOR_QUICK_EXTRACTION = 2000;

    /** Footnotes limit applied by {@link #resetPoiConfigurationToDefault()}. */
    private static final int DEFAULT_FOOTNOTES_LIMIT = 50;

    /** Footnotes limit used for the second pass of {@link #testFootnotesLimitParameterUsing()}. */
    private static final int LARGE_FOOTNOTES_LIMIT = 25000;

    /** Conversion factor used to report {@link System#nanoTime()} differences in milliseconds. */
    private static final long NANOS_PER_MILLISECOND = 1000000L;

    private static final String ALL_MIMETYPES_FILTER = "*";

    private static final String PROBLEM_FOOTNOTES_DOCUMENT_NAME = "problemFootnotes2.docx";

    // Only referenced by the disabled testProblemSlideShow() below.
    // private static final String PROBLEM_SLIDE_SHOW_DOCUMENT_NAME = "problemSlideShow.pptx";

    private static final String EXTRACTOR_POI_BEAN_NAME = "extracter.Poi";

    private PoiMetadataExtracter extracter;

    /**
     * Creates a fresh extracter, applies the default POI configuration and registers it.
     *
     * @throws Exception if the parent set-up or the extracter initialisation fails
     */
    @Override
    public void setUp() throws Exception
    {
        super.setUp();

        extracter = new PoiMetadataExtracter();
        extracter.setDictionaryService(dictionaryService);
        resetPoiConfigurationToDefault();
        extracter.register();
    }

    /**
     * Restores the default POI configuration before delegating to the parent tear-down.
     *
     * @throws Exception if the reset or the parent tear-down fails
     */
    @Override
    protected void tearDown() throws Exception
    {
        resetPoiConfigurationToDefault();
        super.tearDown();
    }

    /**
     * Resets the POI configuration of the extracter under test to its defaults: properties-only extraction is
     * switched on, the footnotes limit is set to {@link #DEFAULT_FOOTNOTES_LIMIT} and the allowable XSLF
     * relationship types are copied from the 'extracter.Poi' bean.
     *
     * @throws Exception if the extracter cannot be re-initialised
     */
    private void resetPoiConfigurationToDefault() throws Exception
    {
        PoiMetadataExtracter configuredExtractor = (PoiMetadataExtracter) ctx.getBean(EXTRACTOR_POI_BEAN_NAME);

        extracter.setPoiExtractPropertiesOnly(true);
        extracter.setPoiFootnotesLimit(DEFAULT_FOOTNOTES_LIMIT);
        extracter.setPoiAllowableXslfRelationshipTypes(configuredExtractor.getPoiAllowableXslfRelationshipTypes());
        extracter.afterPropertiesSet();
    }

    /**
     * @return the {@link PoiMetadataExtracter} instance created in {@link #setUp()}
     */
    @Override
    protected MetadataExtracter getExtracter()
    {
        return extracter;
    }

    /**
     * Checks that every mimetype listed in {@link PoiMetadataExtracter#SUPPORTED_MIMETYPES} is reported as supported.
     *
     * @throws Exception on unexpected failure
     */
    public void testSupports() throws Exception
    {
        for (String mimetype : PoiMetadataExtracter.SUPPORTED_MIMETYPES)
        {
            boolean supports = extracter.isSupported(mimetype);
            assertTrue("Mimetype should be supported: " + mimetype, supports);
        }
    }

    /**
     * Runs the inherited extraction test for every mimetype listed in
     * {@link PoiMetadataExtracter#SUPPORTED_MIMETYPES}.
     *
     * @throws Exception on unexpected failure
     */
    public void testOffice2007Extraction() throws Exception
    {
        for (String mimetype : PoiMetadataExtracter.SUPPORTED_MIMETYPES)
        {
            testExtractFromMimetype(mimetype);
        }
    }

    @Override
    protected boolean skipDescriptionCheck(String mimetype)
    {
        // Our 3 Office 2007 (OOXML) quick files have no description properties.
        return true;
    }

    @Override
    protected void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties)
    {
        // This test class is testing 3 files: quick.docx, quick.xlsx & quick.pptx.
        // Their created times are hard-coded here for checking.
        // Of course this means that if the files are updated, the test will break
        // but those files are rarely modified - only added to.
        if (MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING.equals(mimetype))
        {
            checkFileCreationDate(mimetype, properties, "2010-01-06T17:32:00.000Z");
        }
        else if (MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET.equals(mimetype))
        {
            checkFileCreationDate(mimetype, properties, "1996-10-14T23:33:28.000Z");
        }
        else if (MimetypeMap.MIMETYPE_OPENXML_PRESENTATION.equals(mimetype))
        {
            // Extraordinary! This document predates Isaac Newton's Principia Mathematica by almost a century. ;)
            checkFileCreationDate(mimetype, properties, "1601-01-01T00:00:00.000Z");
        }
    }

    /**
     * Asserts that the extracted {@link ContentModel#PROP_CREATED} value, converted to a string, equals the
     * expected date.
     *
     * @param mimetype - mimetype of the document being checked (only used in the failure message)
     * @param properties - the extracted properties
     * @param date - the expected creation date in string form
     */
    private void checkFileCreationDate(String mimetype, Map<QName, Serializable> properties, String date)
    {
        assertEquals("Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype,
                date,
                DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATED)));
    }

    /**
     * Tests that metadata extraction from a somewhat corrupt file with several
     * thousand footnotes times out properly.
     *
     * @throws Exception on unexpected failure
     */
    public void testProblemFootnotes() throws Exception
    {
        long timeoutMs = 2000;
        configureExtractorLimits(extracter, ALL_MIMETYPES_FILTER, timeoutMs);

        File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile("problemFootnotes.docx");

        Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
        // construct a reader onto the source file
        ContentReader sourceReader = new FileContentReader(sourceFile);
        sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING);

        // nanoTime() is monotonic, so the measurement is not disturbed by wall-clock adjustments
        long startNanos = System.nanoTime();

        getExtracter().extract(sourceReader, properties);

        long extractionTime = (System.nanoTime() - startNanos) / NANOS_PER_MILLISECOND;

        assertTrue("Metadata extraction took (" + extractionTime + "ms) " +
                "but should have failed with a timeout at " + timeoutMs + "ms",
                extractionTime < (timeoutMs + 100)); // bit of wiggle room for logging, cleanup, etc.
        assertFalse("Reader was not closed", sourceReader.isChannelOpen());
    }

//    /**
//     * Test for MNT-11823: Upload of PPTX causes very high memory usage leading to system instability
//     *
//     * @throws Exception
//     */
//    public void testProblemSlideShow() throws Exception
//    {
//        PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter();
//        configureExtractorLimits(extractor, ALL_MIMETYPES_FILTER, TIMEOUT_FOR_QUICK_EXTRACTION);
//
//        File problemSlideShowFile = AbstractContentTransformerTest.loadNamedQuickTestFile(PROBLEM_SLIDE_SHOW_DOCUMENT_NAME);
//        ContentReader sourceReader = new FileContentReader(problemSlideShowFile);
//        sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION);
//
//        Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
//        extractor.extract(sourceReader, properties);
//
//        assertExtractedProperties(properties);
//        assertFalse("Reader was not closed", sourceReader.isChannelOpen());
//
//        extractor.setPoiExtractPropertiesOnly(false);
//        extractor.afterPropertiesSet();
//        properties = new HashMap<QName, Serializable>();
//        extractor.extract(sourceReader, properties);
//
//        assertFalse("Reader was not closed", sourceReader.isChannelOpen());
//        assertTrue(("Extraction completed successfully but failure is expected! Invalid properties are: " + properties), (null == properties) || properties.isEmpty());
//    }

    /**
     * Configures timeout for given extractor and mimetypeFilter
     *
     * @param extractor - {@link PoiMetadataExtracter} instance
     * @param mimetypeFilter - {@link String} value which specifies mimetype filter for which timeout should be applied
     * @param timeout - <code>long</code> value which specifies timeout (in milliseconds) for mimetypeFilter
     */
    private void configureExtractorLimits(PoiMetadataExtracter extractor, String mimetypeFilter, long timeout)
    {
        MetadataExtracterLimits limits = new MetadataExtracterLimits();
        limits.setTimeoutMs(timeout);

        HashMap<String, MetadataExtracterLimits> mimetypeLimits = new HashMap<String, MetadataExtracterLimits>(1);
        mimetypeLimits.put(mimetypeFilter, limits);
        extractor.setMimetypeLimits(mimetypeLimits);
    }

    /**
     * Test for MNT-577: Alfresco is running 100% CPU for over 10 minutes while extracting metadata for Word office document
     * <p>
     * Extracts the same document twice, first with the default footnotes limit and then with
     * {@link #LARGE_FOOTNOTES_LIMIT} and no effective timeout, and expects the second run to take longer.
     *
     * @throws Exception on unexpected failure
     */
    public void testFootnotesLimitParameterUsing() throws Exception
    {
        PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter();

        File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile(PROBLEM_FOOTNOTES_DOCUMENT_NAME);
        ContentReader sourceReader = new FileContentReader(sourceFile);
        sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING);

        Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
        // Durations are kept in nanoseconds: monotonic and fine-grained enough for the strict comparison below
        long startNanos = System.nanoTime();
        extractor.extract(sourceReader, properties);
        long extractionTimeWithDefaultFootnotesLimit = System.nanoTime() - startNanos;

        assertExtractedProperties(properties);
        assertFalse("Reader was not closed", sourceReader.isChannelOpen());

        // Just let the extractor do the job...
        configureExtractorLimits(extractor, ALL_MIMETYPES_FILTER, IGNORABLE_TIMEOUT);
        extractor.setPoiFootnotesLimit(LARGE_FOOTNOTES_LIMIT);
        extractor.afterPropertiesSet();
        properties = new HashMap<QName, Serializable>();
        startNanos = System.nanoTime();
        extractor.extract(sourceReader, properties);
        long extractionTimeWithLargeFootnotesLimit = System.nanoTime() - startNanos;

        assertExtractedProperties(properties);
        assertTrue("The second metadata extraction operation must be longer!",
                extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit);
        assertFalse("Reader was not closed", sourceReader.isChannelOpen());
    }

    /**
     * Asserts extracted properties. At least {@link PoiMetadataExtracterTest#MINIMAL_EXPECTED_PROPERTIES_AMOUNT} properties are expected:
     * {@link ContentModel#PROP_TITLE}, {@link ContentModel#PROP_AUTHOR} and {@link ContentModel#PROP_CREATED}
     *
     * @param properties - {@code Map<QName, Serializable>} instance which contains all extracted properties
     */
    private void assertExtractedProperties(Map<QName, Serializable> properties)
    {
        assertNotNull("Properties were not extracted at all!", properties);
        assertFalse("Extracted properties are empty!", properties.isEmpty());
        assertTrue(("Expected " + MINIMAL_EXPECTED_PROPERTIES_AMOUNT + " extracted properties but only "
                + properties.size() + " have been extracted!"),
                properties.size() >= MINIMAL_EXPECTED_PROPERTIES_AMOUNT);
        assertTrue(("'" + ContentModel.PROP_TITLE + "' property is missing!"),
                properties.containsKey(ContentModel.PROP_TITLE));
        assertTrue(("'" + ContentModel.PROP_AUTHOR + "' property is missing!"),
                properties.containsKey(ContentModel.PROP_AUTHOR));
        assertTrue(("'" + ContentModel.PROP_CREATED + "' property is missing!"),
                properties.containsKey(ContentModel.PROP_CREATED));
    }
}