mirror of
				https://github.com/Alfresco/alfresco-community-repo.git
				synced 2025-10-22 15:12:38 +00:00 
			
		
		
		
	84058: Merged V4.2-BUG-FIX (4.2.4) to HEAD-BUG-FIX (5.0/Cloud)
      83799: MNT-12238: Merged DEV 4.2-BUG-FIX (4.2.4) to V4.2-BUG-FIX (4.2.4)
         MNT-12238: Merged 4.1-BUG-FIX (4.1.10) to V4.2-BUG-FIX (4.2.4)
            80291: Merged V4.1.6 (4.1.6.21) to V4.1-BUG-FIX (4.1.10)
               77378: Merged DEV PATCHES/V4.1.6 (19) to PATCHES/V4.1.6 (20)
                  76649: MNT-11823: Upload of PPTX causes very high memory usage leading to system instability
                     - Patch from MNT-577 has been combined with new changes to avoid hanging of analyzing complicated PPTX documents. The fix just disables reading the entire contents of the complicated document. POI metadata extractor may be switched to standard behavior or reconfigured, using the following new properties: content.transformer.Poi.poiFootnotesLimit, content.transformer.Poi.poiExtractPropertiesOnly and content-services-context.xml/extracter.Poi/poiAllowableXslfRelationshipTypes
                  77379: MNT-11823: Upload of PPTX causes very high memory usage leading to system instability
                     Test and test data for MNT-577 have been added. A test for MNT-11823 has also been added, but it is commented out because the test data (an appropriate PPTX document) is not currently available. Getters for POI-specific properties have been added to 'PoiMetadataExtracter' for tests. The 'afterPropertiesSet()' logic has also been slightly modified to allow setting a 'false' value for the 'poiExtractPropertiesOnly' parameter
                  77561: MNT-11823: Upload of PPTX causes very high memory usage leading to system instability
                     Fix for https://bamboo.alfresco.com/bamboo/browse/HF-PATCH416-126 build failure. POI extractor and transformer properties of 'AlfrescoPoiPatchUtils' have been isolated from each other using a context. Each extractor or transformer now has its own context or uses the default context. Properties of the default context allow parsing the entire contents of XSLF documents, and the footnotes limit is 50. Property names have not been changed, but currently 'content-services-context.xml/extracter.Poi/poiAllowableXslfRelationshipTypes=null' does not lead to 'content.transformer.Poi.poiExtractPropertiesOnly=false'. I.e., this list may be empty. The 'PoiMetadataExtracterTest' test has been modified in accordance with the introduced changes. 'poi-OOXML-3.9-beta1-20121109.jar' has been renamed to 'poi-OOXML-3.9-beta1-20121109-patched.jar'
                  79180: MNT-12043: CLONE - Upload of PPTX causes very high memory usage leading to system instability
                     A timeout mechanism has been added to content transformers. Timeout configuration options have been added. A mechanism to close streams after a 'TimeoutException' has also been added to transformers and metadata extractors. In addition, the timeout mechanism for input streams has been enabled in 'TikaPoweredContentTransformer'
                  79268: MNT-12043: CLONE - Upload of PPTX causes very high memory usage leading to system instability
                     Fix for the https://bamboo.alfresco.com/bamboo/browse/HF-PATCH416-133 build failure and comments of the review https://fisheye.alfresco.com/cru/CR-100#CFR-1184. The new test has been added into 'PoiOOXMLContentTransformerTest.testMnt12043()' to check out the newly added timeout mechanism
                  79290: MNT-12043: CLONE - Upload of PPTX causes very high memory usage leading to system instability
                     - Removed methods and properties that are no longer needed
                  79327: MNT-12043: CLONE - Upload of PPTX causes very high memory usage leading to system instability
                     - Increased ADDITIONAL_PROCESSING_TIME to 1500ms to try and avoid a new intermittent test failure.
      83885: MNT-12238 Bring Maven POM file in sync with latest patched version of poi-ooxml
git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@84627 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
		
	
		
			
				
	
	
		
			152 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			Java
		
	
	
	
	
	
			
		
		
	
	
			152 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			Java
		
	
	
	
	
	
| /*
 | |
|  * Copyright (C) 2005-2014 Alfresco Software Limited.
 | |
|  *
 | |
|  * This file is part of Alfresco
 | |
|  *
 | |
|  * Alfresco is free software: you can redistribute it and/or modify
 | |
|  * it under the terms of the GNU Lesser General Public License as published by
 | |
|  * the Free Software Foundation, either version 3 of the License, or
 | |
|  * (at your option) any later version.
 | |
|  *
 | |
|  * Alfresco is distributed in the hope that it will be useful,
 | |
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|  * GNU Lesser General Public License for more details.
 | |
|  *
 | |
|  * You should have received a copy of the GNU Lesser General Public License
 | |
|  * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
 | |
|  */
 | |
| package org.alfresco.repo.content.metadata;
 | |
| 
 | |
| import java.util.ArrayList;
 | |
| import java.util.Set;
 | |
| 
 | |
| import org.alfresco.repo.content.MimetypeMap;
 | |
| import org.apache.commons.logging.Log;
 | |
| import org.apache.commons.logging.LogFactory;
 | |
| import org.apache.poi.patch.AlfrescoPoiPatchUtils;
 | |
| import org.apache.tika.parser.Parser;
 | |
| import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
 | |
| import org.springframework.beans.factory.InitializingBean;
 | |
| 
 | |
| /**
 | |
|  * POI-based metadata extractor for Office 07 documents.
 | |
|  * See http://poi.apache.org/ for information on POI.
 | |
|  * <pre>
 | |
|  *   <b>author:</b>                 --      cm:author
 | |
|  *   <b>title:</b>                  --      cm:title
 | |
|  *   <b>subject:</b>                --      cm:description
 | |
|  *   <b>created:</b>                --      cm:created
 | |
|  *   <b>Any custom property:</b>    --      [not mapped]
 | |
|  * </pre>
 | |
|  * 
 | |
|  * Uses Apache Tika<br />
 | |
|  * <br />
 | |
|  * Configures {@link AlfrescoPoiPatchUtils} to resolve the following issues:
 | |
|  * <ul>
 | |
|  * <li><a href="https://issues.alfresco.com/jira/browse/MNT-577">MNT-577</a></li>
 | |
|  * <li><a href="https://issues.alfresco.com/jira/browse/MNT-11823">MNT-11823</a></li>
 | |
|  * </ul>
 | |
|  * 
 | |
|  * @author Nick Burch
 | |
|  * @author Neil McErlean
 | |
|  * @author Dmitry Velichkevich
 | |
|  */
 | |
| public class PoiMetadataExtracter extends TikaPoweredMetadataExtracter implements InitializingBean
 | |
| {
 | |
|     protected static Log logger = LogFactory.getLog(PoiMetadataExtracter.class);
 | |
| 
 | |
|     public static ArrayList<String> SUPPORTED_MIMETYPES = buildSupportedMimetypes( 
 | |
|        new String[] {MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING,
 | |
|     	               MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET,
 | |
|     	               MimetypeMap.MIMETYPE_OPENXML_PRESENTATION},
 | |
|     	 new OOXMLParser() 
 | |
|     );
 | |
| 
 | |
|     private Integer poiFootnotesLimit;
 | |
| 
 | |
|     private Boolean poiExtractPropertiesOnly = false;
 | |
| 
 | |
|     private Set<String> poiAllowableXslfRelationshipTypes;
 | |
| 
 | |
|     public PoiMetadataExtracter()
 | |
|     {
 | |
|         super(PoiMetadataExtracter.class.getName(), SUPPORTED_MIMETYPES);
 | |
|     }
 | |
| 
 | |
|     @Override
 | |
|     protected Parser getParser() 
 | |
|     {
 | |
|         return new OOXMLParser();
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * MNT-577: Alfresco is running 100% CPU for over 10 minutes while extracting metadata for Word office document <br />
 | |
|      * <br />
 | |
|      * 
 | |
|      * @param poiFootnotesLimit - {@link Integer} value which specifies limit of amount of footnotes of XWPF documents
 | |
|      */
 | |
|     public void setPoiFootnotesLimit(Integer poiFootnotesLimit)
 | |
|     {
 | |
|         this.poiFootnotesLimit = poiFootnotesLimit;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * MNT-11823: Upload of PPTX causes very high memory usage leading to system instability<br />
 | |
|      * <br />
 | |
|      * 
 | |
|      * @param poiExtractPropertiesOnly - {@link Boolean} value which indicates that POI extractor must avoid building of the full document parts hierarchy and reading content of
 | |
|      *        the parts
 | |
|      */
 | |
|     public void setPoiExtractPropertiesOnly(Boolean poiExtractPropertiesOnly)
 | |
|     {
 | |
|         this.poiExtractPropertiesOnly = poiExtractPropertiesOnly;
 | |
|     }
 | |
| 
 | |
|     public Boolean isPoiExtractPropertiesOnly()
 | |
|     {
 | |
|         return (poiExtractPropertiesOnly == null) ? (false) : (poiExtractPropertiesOnly);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * MNT-11823: Upload of PPTX causes very high memory usage leading to system instability<br />
 | |
|      * <br />
 | |
|      * 
 | |
|      * @param poiAllowableXslfRelationshipTypes - {@link Set}<{@link String}> instance which determines the list of allowable relationship types for traversing during
 | |
|      *        analyzing of XSLF document
 | |
|      */
 | |
|     public void setPoiAllowableXslfRelationshipTypes(Set<String> poiAllowableXslfRelationshipTypes)
 | |
|     {
 | |
|         this.poiAllowableXslfRelationshipTypes = poiAllowableXslfRelationshipTypes;
 | |
|     }
 | |
| 
 | |
|     public Set<String> getPoiAllowableXslfRelationshipTypes()
 | |
|     {
 | |
|         return poiAllowableXslfRelationshipTypes;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * MNT-11823: Upload of PPTX causes very high memory usage leading to system instability<br />
 | |
|      * <br />
 | |
|      * Initialization of {@link AlfrescoPoiPatchUtils} properties for {@link PoiMetadataExtracter#getExtractorContext()} context
 | |
|      */
 | |
|     @Override
 | |
|     public void afterPropertiesSet() throws Exception
 | |
|     {
 | |
|         if (null == poiExtractPropertiesOnly)
 | |
|         {
 | |
|             poiExtractPropertiesOnly = false;
 | |
|         }
 | |
| 
 | |
|         String context = getExtractorContext();
 | |
| 
 | |
|         if (null != poiFootnotesLimit)
 | |
|         {
 | |
|             AlfrescoPoiPatchUtils.setPoiFootnotesLimit(context, poiFootnotesLimit);
 | |
|         }
 | |
| 
 | |
|         AlfrescoPoiPatchUtils.setPoiExtractPropertiesOnly(context, poiExtractPropertiesOnly);
 | |
|         AlfrescoPoiPatchUtils.setPoiAllowableXslfRelationshipTypes(context, poiAllowableXslfRelationshipTypes);
 | |
|     }
 | |
| }
 |