MNT-16709 : Metadata extraction on 200MB PDF file causes large heap utilization

- added concurrent extraction limit
- added max document size limit

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/BRANCHES/DEV/5.2.N/root@131709 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Andreea Dragoi
2016-10-24 12:13:50 +00:00
parent 2b4db66fbe
commit c95cbaccd9
8 changed files with 282 additions and 94 deletions

View File

@@ -45,13 +45,16 @@ import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
import org.alfresco.api.AlfrescoPublicApi;
import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.model.ContentModel;
import javax.activation.MimeType;
import org.alfresco.api.AlfrescoPublicApi;
import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.StreamAwareContentReaderProxy;
import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
import org.alfresco.service.cmr.dictionary.DictionaryService;
@@ -123,7 +126,8 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
private static final String PROP_DEFAULT_TIMEOUT = "content.metadataExtracter.default.timeoutMs";
public static final String PROPERTY_PREFIX_METADATA = "metadata.";
public static final String PROPERTY_COMPONENT_EXTRACT = ".extract.";
public static final String PROPERTY_COMPONENT_EMBED = ".embed.";
public static final String PROPERTY_COMPONENT_EMBED = ".embed.";
public static final int MEGABYTE_SIZE = 1048576;
protected static Log logger = LogFactory.getLog(AbstractMappingMetadataExtracter.class);
@@ -147,7 +151,9 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
private Properties properties;
private Map<String, MetadataExtracterLimits> mimetypeLimits;
private ExecutorService executorService;
protected MetadataExtracterConfig metadataExtracterConfig;
protected MetadataExtracterConfig metadataExtracterConfig;
private static final AtomicInteger CONCURRENT_EXTRACTIONS_COUNT = new AtomicInteger(0);
/**
* Default constructor. If this is called, then {@link #isSupported(String)} should
@@ -1219,6 +1225,12 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
logger.debug("Extracted Metadata from " + reader + "\n Found: " +
rawMetadata + "\n Mapped and Accepted: " + changedProperties);
}
}
catch (LimitExceededException e)
{
logger.warn("Metadata extraction rejected: \n" +
" Extracter: " + this + "\n" +
" Reason: " + e.getMessage());
}
catch (Throwable e)
{
@@ -1968,23 +1980,29 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
* Gets the metadata extracter limits for the given mimetype.
* <p>
* A specific match for the given mimetype is tried first and
* if none is found a wildcard of "*" is tried; if still not found,
* default values will be used
*
* @param mimetype String
* @return the found limits or default values
*/
protected MetadataExtracterLimits getLimits(String mimetype)
{
if (mimetypeLimits == null)
{
return null;
return new MetadataExtracterLimits();
}
MetadataExtracterLimits limits = null;
limits = mimetypeLimits.get(mimetype);
if (limits == null)
{
limits = mimetypeLimits.get("*");
}
}
if (limits == null)
{
limits = new MetadataExtracterLimits();
}
return limits;
}
@@ -2027,6 +2045,19 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
{
super(cause);
}
}
/**
* Exception wrapper to handle exceeded limits imposed by {@link MetadataExtracterLimits}
* {@link AbstractMappingMetadataExtracter#extractRaw(ContentReader, MetadataExtracterLimits)}
*/
private class LimitExceededException extends Exception
{
private static final long serialVersionUID = 702554119174770130L;
public LimitExceededException(String message)
{
super(message);
}
}
/**
@@ -2049,12 +2080,34 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
private Map<String, Serializable> extractRaw(
ContentReader reader, MetadataExtracterLimits limits) throws Throwable
{
if (limits == null || limits.getTimeoutMs() == -1)
{
return extractRaw(reader);
}
FutureTask<Map<String, Serializable>> task = null;
StreamAwareContentReaderProxy proxiedReader = null;
StreamAwareContentReaderProxy proxiedReader = null;
if (reader.getSize() > limits.getMaxDocumentSizeMB() * MEGABYTE_SIZE)
{
throw new LimitExceededException("Max doc size exceeded " + limits.getMaxDocumentSizeMB() + " MB");
}
synchronized (CONCURRENT_EXTRACTIONS_COUNT)
{
if (logger.isDebugEnabled())
{
logger.debug("Concurrent extractions : " + CONCURRENT_EXTRACTIONS_COUNT.get());
}
if (CONCURRENT_EXTRACTIONS_COUNT.get() < limits.getMaxConcurrentExtractionsCount())
{
int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.incrementAndGet();
if (logger.isDebugEnabled())
{
logger.debug("New extraction accepted. Concurrent extractions : " + totalDocCount);
}
}
else
{
throw new LimitExceededException("Reached concurrent extractions limit - " + limits.getMaxConcurrentExtractionsCount());
}
}
try
{
proxiedReader = new StreamAwareContentReaderProxy(reader);
@@ -2087,6 +2140,14 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
}
throw cause;
}
finally
{
int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.decrementAndGet();
if (logger.isDebugEnabled())
{
logger.debug("Extraction finalized. Remaining concurrent extraction : " + totalDocCount);
}
}
}
/**

View File

@@ -29,15 +29,17 @@ import org.alfresco.api.AlfrescoPublicApi;
/**
* Represents maximum values (that result in exceptions if exceeded) or
* limits on values (that result in EOF (End Of File) being returned early).
* The current options are elapsed time, document size and concurrent extractions limit.
*
* @author Ray Gauss II
*/
@AlfrescoPublicApi
public class MetadataExtracterLimits
{
private long timeoutMs = -1;
private long timeoutMs = Long.MAX_VALUE;
private double maxDocumentSizeMB = Double.MAX_VALUE;
private int maxConcurrentExtractionsCount = Integer.MAX_VALUE;
/**
* Gets the time in milliseconds after which the metadata extracter will be stopped.
@@ -57,6 +59,44 @@ public class MetadataExtracterLimits
/**
 * Sets the time in milliseconds after which the metadata extracter will be stopped.
 * Defaults to {@link Long#MAX_VALUE}, i.e. effectively no timeout.
 *
 * @param timeoutMs the timeout in milliseconds
 */
public void setTimeoutMs(long timeoutMs)
{
this.timeoutMs = timeoutMs;
}
/**
 * Gets the maximum document size (in MB) allowed for a metadata extraction.
 * Defaults to {@link Double#MAX_VALUE}, i.e. effectively unlimited.
 *
 * @return maximum document size in megabytes
 */
public double getMaxDocumentSizeMB()
{
return maxDocumentSizeMB;
}
/**
 * Sets the maximum document size (in MB) allowed for a metadata extraction;
 * larger documents are rejected rather than extracted.
 *
 * @param maxDocumentSizeMB maximum document size in megabytes
 */
public void setMaxDocumentSizeMB(double maxDocumentSizeMB)
{
this.maxDocumentSizeMB = maxDocumentSizeMB;
}
/**
 * Sets the maximum number of metadata extractions allowed to run concurrently;
 * extractions requested beyond this count are rejected.
 *
 * @param maxConcurrentExtractionsCount maximum number of concurrent extractions
 */
public void setMaxConcurrentExtractionsCount(int maxConcurrentExtractionsCount)
{
this.maxConcurrentExtractionsCount = maxConcurrentExtractionsCount;
}
/**
 * Gets the maximum number of metadata extractions allowed to run concurrently.
 * Defaults to {@link Integer#MAX_VALUE}, i.e. effectively unlimited.
 *
 * @return maximum number of concurrent extractions
 */
public int getMaxConcurrentExtractionsCount()
{
return maxConcurrentExtractionsCount;
}
}