mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Merged HEAD-BUG-FIX (5.0/Cloud) to HEAD (5.0/Cloud)
84058: Merged V4.2-BUG-FIX (4.2.4) to HEAD-BUG-FIX (5.0/Cloud) 83799: MNT-12238: Merged DEV 4.2-BUG-FIX (4.2.4) to V4.2-BUG-FIX (4.2.4) MNT-12238: Merged 4.1-BUG-FIX (4.1.10) to V4.2-BUG-FIX (4.2.4) 80291: Merged V4.1.6 (4.1.6.21) to V4.1-BUG-FIX (4.1.10) 77378: Merged DEV PATCHES/V4.1.6 (19) to PATCHES/V4.1.6 (20) 76649: MNT-11823: Upload of PPTX causes very high memory usage leading to system instability - Patch from MNT-577 has been combined with new changes to avoid hanging of analyzing complicated PPTX documents. The fix just disables reading the entire contents of the complicated document. POI metadata extractor may be switched to standard behavior or reconfigured, using the following new properties: content.transformer.Poi.poiFootnotesLimit, content.transformer.Poi.poiExtractPropertiesOnly and content-services-context.xml/extracter.Poi/poiAllowableXslfRelationshipTypes 77379: MNT-11823: Upload of PPTX causes very high memory usage leading to system instability Test and the test data for MNT-577 have been added. Test for MNT-11823 has also been added. But this test is commented because the test data (appropriate PPTX document) is not currently available. Getters for POI specific properties have been added to 'PoiMetadataExtracter' for tests. Also 'afterPropertiesSet()' logic has been a bit modified to allow setting 'false' value for 'poiExtractPropertiesOnly' parameter 77561: MNT-11823: Upload of PPTX causes very high memory usage leading to system instability Fix for https://bamboo.alfresco.com/bamboo/browse/HF-PATCH416-126 build failure. POI extractor and transformer properties of 'AlfrescoPoiPatchUtils' have been isolated from each other using context. Each extractor or transformer now has its own context or uses the default context. Properties of the default context allow parsing the entire contents of XLSF documents. And footnotes limit is 50. 
Property names have not been changed, but currently 'content-services-context.xml/extracter.Poi/poiAllowableXslfRelationshipTypes=null' does not lead to 'content.transformer.Poi.poiExtractPropertiesOnly=false'. I. e., this list may be empty. 'PoiMetadataExtracterTest' test has been modified in accordance with the introduced changes. 'poi-OOXML-3.9-beta1-20121109.jar' has been renamed to 'poi-OOXML-3.9-beta1-20121109-patched.jar' 79180: MNT-12043: CLONE - Upload of PPTX causes very high memory usage leading to system instability Timeout mechanism has been added to content transformers. Timeout configuration options have been added. Also mechanism to close streams after 'TimoutException' has been added to transformers and metadata extractors. Also timeout mechanism for input streams has been enabled in 'TikaPoweredContentTransformer' 79268: MNT-12043: CLONE - Upload of PPTX causes very high memory usage leading to system instability Fix for the https://bamboo.alfresco.com/bamboo/browse/HF-PATCH416-133 build failure and comments of the review https://fisheye.alfresco.com/cru/CR-100#CFR-1184. The new test has been added into 'PoiOOXMLContentTransformerTest.testMnt12043()' to check out the newly added timeout mechanism 79290: MNT-12043: CLONE - Upload of PPTX causes very high memory usage leading to system instability - Removed methods and properties that are no longer needed 79327: MNT-12043: CLONE - Upload of PPTX causes very high memory usage leading to system instability - Increased ADDITIONAL_PROCESSING_TIME to 1500ms to try and avoid a new intermittent test failure. 83885: MNT-12238 Bring Maven POM file in sync with latest patched version of poi-ooxml git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@84627 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2013 Alfresco Software Limited.
|
||||
* Copyright (C) 2005-2014 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
@@ -19,12 +19,24 @@
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
|
||||
import org.alfresco.error.AlfrescoRuntimeException;
|
||||
import org.alfresco.repo.content.AbstractStreamAwareProxy;
|
||||
import org.alfresco.repo.content.StreamAwareContentReaderProxy;
|
||||
import org.alfresco.repo.content.StreamAwareContentWriterProxy;
|
||||
import org.alfresco.repo.content.metadata.AbstractMappingMetadataExtracter;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentServiceTransientException;
|
||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||
import org.alfresco.service.cmr.repository.TransformationOptionLimits;
|
||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
@@ -42,11 +54,27 @@ import org.apache.commons.logging.LogFactory;
|
||||
public abstract class AbstractContentTransformer2 extends AbstractContentTransformerLimits
|
||||
{
|
||||
private static final Log logger = LogFactory.getLog(AbstractContentTransformer2.class);
|
||||
|
||||
|
||||
private ExecutorService executorService;
|
||||
|
||||
private ContentTransformerRegistry registry;
|
||||
private boolean registerTransformer;
|
||||
private boolean retryTransformOnDifferentMimeType;
|
||||
|
||||
/**
|
||||
* A flag that indicates that the transformer should be started in its own Thread so
|
||||
* that it may be interrupted rather than using the timeout in the Reader.
|
||||
* Need only be set for transformers that read their source data quickly but then
|
||||
* take a long time to process the data (such as {@link PoiOOXMLContentTransformer}).
|
||||
*/
|
||||
private Boolean useTimeoutThread = false;
|
||||
|
||||
/**
|
||||
* Extra time added to the timeout when using a Thread for the transformation so that
|
||||
* a timeout from the Reader has a chance to happen first.
|
||||
*/
|
||||
private long additionalThreadTimout = 2000;
|
||||
|
||||
private static ThreadLocal<Integer> depth = new ThreadLocal<Integer>()
|
||||
{
|
||||
@Override
|
||||
@@ -209,8 +237,49 @@ public abstract class AbstractContentTransformer2 extends AbstractContentTransfo
|
||||
setReaderLimits(reader, writer, options);
|
||||
|
||||
// Transform
|
||||
transformInternal(reader, writer, options);
|
||||
|
||||
// MNT-12238: CLONE - CLONE - Upload of PPTX causes very high memory usage leading to system instability
|
||||
// Limiting transformation up to configured amount of milliseconds to avoid very high RAM consumption
|
||||
// and OOM during transforming problematic documents
|
||||
TransformationOptionLimits limits = getLimits(reader.getMimetype(), writer.getMimetype(), options);
|
||||
|
||||
long timeoutMs = limits.getTimeoutMs();
|
||||
if (!useTimeoutThread || (null == limits) || (-1 == timeoutMs))
|
||||
{
|
||||
transformInternal(reader, writer, options);
|
||||
}
|
||||
else
|
||||
{
|
||||
Future<?> submittedTask = null;
|
||||
StreamAwareContentReaderProxy proxiedReader = new StreamAwareContentReaderProxy(reader);
|
||||
StreamAwareContentWriterProxy proxiedWriter = new StreamAwareContentWriterProxy(writer);
|
||||
|
||||
try
|
||||
{
|
||||
submittedTask = getExecutorService().submit(new TransformInternalCallable(proxiedReader, proxiedWriter, options));
|
||||
submittedTask.get(timeoutMs + additionalThreadTimout, TimeUnit.MILLISECONDS);
|
||||
}
|
||||
catch (TimeoutException e)
|
||||
{
|
||||
releaseResources(submittedTask, proxiedReader, proxiedWriter);
|
||||
throw new TimeoutException("Transformation failed due to timeout limit");
|
||||
}
|
||||
catch (InterruptedException e)
|
||||
{
|
||||
releaseResources(submittedTask, proxiedReader, proxiedWriter);
|
||||
throw new InterruptedException("Transformation failed, because the thread of the transformation was interrupted");
|
||||
}
|
||||
catch (ExecutionException e)
|
||||
{
|
||||
Throwable cause = e.getCause();
|
||||
if (cause instanceof TransformInternalCallableException)
|
||||
{
|
||||
cause = ((TransformInternalCallableException) cause).getCause();
|
||||
}
|
||||
|
||||
throw cause;
|
||||
}
|
||||
}
|
||||
|
||||
// record time
|
||||
long after = System.currentTimeMillis();
|
||||
recordTime(sourceMimetype, targetMimetype, after - before);
|
||||
@@ -345,6 +414,31 @@ public abstract class AbstractContentTransformer2 extends AbstractContentTransfo
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Cancels <code>task</code> and closes content accessors
|
||||
*
|
||||
* @param task - {@link Future} task instance which specifies a transformation action
|
||||
* @param proxiedReader - {@link AbstractStreamAwareProxy} instance which represents channel closing mechanism for content reader
|
||||
* @param proxiedWriter - {@link AbstractStreamAwareProxy} instance which represents channel closing mechanism for content writer
|
||||
*/
|
||||
private void releaseResources(Future<?> task, AbstractStreamAwareProxy proxiedReader, AbstractStreamAwareProxy proxiedWriter)
|
||||
{
|
||||
if (null != task)
|
||||
{
|
||||
task.cancel(true);
|
||||
}
|
||||
|
||||
if (null != proxiedReader)
|
||||
{
|
||||
proxiedReader.release();
|
||||
}
|
||||
|
||||
if (null != proxiedWriter)
|
||||
{
|
||||
proxiedWriter.release();
|
||||
}
|
||||
}
|
||||
|
||||
public final void transform(
|
||||
ContentReader reader,
|
||||
ContentWriter writer,
|
||||
@@ -399,7 +493,104 @@ public abstract class AbstractContentTransformer2 extends AbstractContentTransfo
|
||||
transformerConfig.getStatistics(null, sourceMimetype, targetMimetype, true).recordTime(transformationTime);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the <code>ExecutorService</code> to be used for timeout-aware extraction.
|
||||
* <p>
|
||||
* If no <code>ExecutorService</code> has been defined a default of <code>Executors.newCachedThreadPool()</code> is used during {@link AbstractMappingMetadataExtracter#init()}.
|
||||
*
|
||||
* @return the defined or default <code>ExecutorService</code>
|
||||
*/
|
||||
protected ExecutorService getExecutorService()
|
||||
{
|
||||
if (null == executorService)
|
||||
{
|
||||
executorService = Executors.newCachedThreadPool();
|
||||
}
|
||||
|
||||
return executorService;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the <code>ExecutorService</code> to be used for timeout-aware transformation.
|
||||
*
|
||||
* @param executorService - {@link ExecutorService} instance for timeouts
|
||||
*/
|
||||
public void setExecutorService(ExecutorService executorService)
|
||||
{
|
||||
this.executorService = executorService;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@link Callable} wrapper for the {@link AbstractContentTransformer2#transformInternal(ContentReader, ContentWriter, TransformationOptions)} method to handle timeouts.
|
||||
*/
|
||||
private class TransformInternalCallable implements Callable<Void>
|
||||
{
|
||||
private ContentReader reader;
|
||||
|
||||
private ContentWriter writer;
|
||||
|
||||
private TransformationOptions options;
|
||||
|
||||
public TransformInternalCallable(ContentReader reader, ContentWriter writer, TransformationOptions options)
|
||||
{
|
||||
this.reader = reader;
|
||||
this.writer = writer;
|
||||
this.options = options;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Void call() throws Exception
|
||||
{
|
||||
try
|
||||
{
|
||||
transformInternal(reader, writer, options);
|
||||
return null;
|
||||
}
|
||||
catch (Throwable e)
|
||||
{
|
||||
throw new TransformInternalCallableException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Exception wrapper to handle any {@link Throwable} from {@link AbstractContentTransformer2#transformInternal(ContentReader, ContentWriter, TransformationOptions)}
|
||||
*/
|
||||
private class TransformInternalCallableException extends Exception
|
||||
{
|
||||
private static final long serialVersionUID = 7740560508772740658L;
|
||||
|
||||
public TransformInternalCallableException(Throwable cause)
|
||||
{
|
||||
super(cause);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param useTimeoutThread - {@link Boolean} value which specifies timeout limiting mechanism for the current transformer
|
||||
* @see AbstractContentTransformer2#useTimeoutThread
|
||||
*/
|
||||
public void setUseTimeoutThread(Boolean useTimeoutThread)
|
||||
{
|
||||
if (null == useTimeoutThread)
|
||||
{
|
||||
useTimeoutThread = true;
|
||||
}
|
||||
|
||||
this.useTimeoutThread = useTimeoutThread;
|
||||
}
|
||||
|
||||
public void setAdditionalThreadTimout(long additionalThreadTimout)
|
||||
{
|
||||
this.additionalThreadTimout = additionalThreadTimout;
|
||||
}
|
||||
|
||||
public Boolean isTransformationLimitedInternally()
|
||||
{
|
||||
return useTimeoutThread;
|
||||
}
|
||||
|
||||
/**
|
||||
* Records an error and updates the average time as if the transformation took a
|
||||
* long time, so that it is less likely to be called again.
|
||||
|
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||
* Copyright (C) 2005-2014 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
@@ -48,6 +48,7 @@ public class PoiOOXMLContentTransformer extends TikaPoweredContentTransformer
|
||||
|
||||
/**
 * Creates the transformer for the mimetypes in <code>SUPPORTED_MIMETYPES</code> and
 * turns on the timeout-thread flag for it.
 */
public PoiOOXMLContentTransformer()
{
    super(SUPPORTED_MIMETYPES);
    // Request the thread-based timeout (setter presumably inherited from AbstractContentTransformer2)
    setUseTimeoutThread(true);
}
|
||||
|
||||
@Override
|
||||
|
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||
* Copyright (C) 2005-2014 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
@@ -108,6 +108,7 @@ public class TikaAutoContentTransformer extends TikaPoweredContentTransformer
|
||||
/**
 * Creates the transformer for the mimetypes derived from the given Tika configuration and
 * turns on the timeout-thread flag for it.
 *
 * @param tikaConfig - configuration handed to <code>buildMimeTypes</code> to obtain the source mimetypes
 */
public TikaAutoContentTransformer(TikaConfig tikaConfig)
{
    super(buildMimeTypes(tikaConfig));
    // Request the thread-based timeout (setter presumably inherited from AbstractContentTransformer2)
    setUseTimeoutThread(true);
}
|
||||
|
||||
/**
|
||||
|
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2012 Alfresco Software Limited.
|
||||
* Copyright (C) 2005-2014 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
@@ -32,14 +32,12 @@ import javax.xml.transform.sax.TransformerHandler;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.repo.content.filestore.FileContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.io.TikaInputStream;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
@@ -69,6 +67,14 @@ public abstract class TikaPoweredContentTransformer extends AbstractContentTrans
|
||||
MimetypeMap.MIMETYPE_XHTML,
|
||||
MimetypeMap.MIMETYPE_XML});
|
||||
|
||||
private static final double MEGABYTES = 1024.0 * 1024.0;
|
||||
|
||||
private static final String USAGE_PATTERN = "Content transformation has completed:\n" +
|
||||
" Transformer: %s\n" +
|
||||
" Content Reader: %s\n" +
|
||||
" Memory (MB): Used/Total/Maximum - %f/%f/%f\n" +
|
||||
" Time Spent: %d ms";
|
||||
|
||||
protected List<String> sourceMimeTypes;
|
||||
protected DocumentSelector documentSelector;
|
||||
|
||||
@@ -225,22 +231,24 @@ public abstract class TikaPoweredContentTransformer extends AbstractContentTrans
|
||||
);
|
||||
}
|
||||
|
||||
// Prefer the File if available - it takes less memory to process
|
||||
InputStream is;
|
||||
if(reader instanceof FileContentReader)
|
||||
{
|
||||
is = TikaInputStream.get( ((FileContentReader)reader).getFile(), metadata );
|
||||
}
|
||||
else
|
||||
{
|
||||
is = reader.getContentInputStream();
|
||||
}
|
||||
|
||||
InputStream is = reader.getContentInputStream();
|
||||
|
||||
long startTime = 0;
|
||||
try {
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
startTime = System.currentTimeMillis();
|
||||
}
|
||||
|
||||
parser.parse(is, handler, metadata, context);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if(logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug(calculateMemoryAndTimeUsage(reader, startTime));
|
||||
}
|
||||
|
||||
if (is != null)
|
||||
{
|
||||
try { is.close(); } catch (Throwable e) {}
|
||||
@@ -255,4 +263,13 @@ public abstract class TikaPoweredContentTransformer extends AbstractContentTrans
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private String calculateMemoryAndTimeUsage(ContentReader reader, long startTime)
|
||||
{
|
||||
long endTime = System.currentTimeMillis();
|
||||
Runtime runtime = Runtime.getRuntime();
|
||||
long totalMemory = runtime.totalMemory();
|
||||
return String.format(USAGE_PATTERN, this.getClass().getName(), reader, (totalMemory - runtime.freeMemory()) / MEGABYTES, totalMemory / MEGABYTES, runtime.maxMemory()
|
||||
/ MEGABYTES, (endTime - startTime));
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user