MNT-15024. Implemented the content.metadataExtracter.pdf.overwritePolicy property and got rid of a redundant setter for the overwritePolicy which caused an ambiguous warning.

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/BRANCHES/DEV/5.2.N/root@131900 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Martin Muller
2016-10-31 15:41:32 +00:00
parent 308da0f956
commit 94cebbefbc
4 changed files with 168 additions and 174 deletions

View File

@@ -341,6 +341,9 @@
</entry> </entry>
</map> </map>
</property> </property>
<property name="overwritePolicy">
<value>${content.metadataExtracter.pdf.overwritePolicy}</value>
</property>
</bean> </bean>
<bean id="extracter.Poi" class="org.alfresco.repo.content.metadata.PoiMetadataExtracter" parent="baseMetadataExtracter"> <bean id="extracter.Poi" class="org.alfresco.repo.content.metadata.PoiMetadataExtracter" parent="baseMetadataExtracter">
<property name="poiFootnotesLimit" value="${content.transformer.Poi.poiFootnotesLimit}" /> <property name="poiFootnotesLimit" value="${content.transformer.Poi.poiFootnotesLimit}" />

View File

@@ -644,6 +644,9 @@ content.metadataExtracter.parseShapes=false
content.metadataExtracter.pdf.maxDocumentSizeMB=10 content.metadataExtracter.pdf.maxDocumentSizeMB=10
content.metadataExtracter.pdf.maxConcurrentExtractionsCount=5 content.metadataExtracter.pdf.maxConcurrentExtractionsCount=5
# The default overwrite policy for PdfBoxMetadataExtracter
content.metadataExtracter.pdf.overwritePolicy=PRAGMATIC
# Property to enable upgrade from 2.1-A # Property to enable upgrade from 2.1-A
V2.1-A.fixes.to.schema=0 V2.1-A.fixes.to.schema=0
#V2.1-A.fixes.to.schema=82 #V2.1-A.fixes.to.schema=82

View File

@@ -1,82 +1,82 @@
/* /*
* #%L * #%L
* Alfresco Repository * Alfresco Repository
* %% * %%
* Copyright (C) 2005 - 2016 Alfresco Software Limited * Copyright (C) 2005 - 2016 Alfresco Software Limited
* %% * %%
* This file is part of the Alfresco software. * This file is part of the Alfresco software.
* If the software was purchased under a paid Alfresco license, the terms of * If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is * the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms: * provided under the following open source license terms:
* *
* Alfresco is free software: you can redistribute it and/or modify * Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by * it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version. * (at your option) any later version.
* *
* Alfresco is distributed in the hope that it will be useful, * Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details. * GNU Lesser General Public License for more details.
* *
* You should have received a copy of the GNU Lesser General Public License * You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>. * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L% * #L%
*/ */
package org.alfresco.repo.content.metadata; package org.alfresco.repo.content.metadata;
import java.io.InputStream; import java.io.InputStream;
import java.io.Serializable; import java.io.Serializable;
import java.lang.reflect.Array; import java.lang.reflect.Array;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.Collections; import java.util.Collections;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.Properties; import java.util.Properties;
import java.util.Set; import java.util.Set;
import java.util.StringTokenizer; import java.util.StringTokenizer;
import java.util.concurrent.Callable; import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
import java.util.concurrent.FutureTask; import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import javax.activation.MimeType; import javax.activation.MimeType;
import org.alfresco.api.AlfrescoPublicApi; import org.alfresco.api.AlfrescoPublicApi;
import org.alfresco.error.AlfrescoRuntimeException; import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.model.ContentModel; import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.StreamAwareContentReaderProxy; import org.alfresco.repo.content.StreamAwareContentReaderProxy;
import org.alfresco.service.cmr.dictionary.DataTypeDefinition; import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
import org.alfresco.service.cmr.dictionary.DictionaryService; import org.alfresco.service.cmr.dictionary.DictionaryService;
import org.alfresco.service.cmr.dictionary.PropertyDefinition; import org.alfresco.service.cmr.dictionary.PropertyDefinition;
import org.alfresco.service.cmr.repository.ContentIOException; import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.MalformedNodeRefException; import org.alfresco.service.cmr.repository.MalformedNodeRefException;
import org.alfresco.service.cmr.repository.MimetypeService; import org.alfresco.service.cmr.repository.MimetypeService;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.cmr.repository.datatype.TypeConversionException; import org.alfresco.service.cmr.repository.datatype.TypeConversionException;
import org.alfresco.service.namespace.InvalidQNameException; import org.alfresco.service.namespace.InvalidQNameException;
import org.alfresco.service.namespace.QName; import org.alfresco.service.namespace.QName;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.joda.time.DateTime; import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter; import org.joda.time.format.DateTimeFormatter;
import org.springframework.beans.factory.BeanNameAware; import org.springframework.beans.factory.BeanNameAware;
import org.springframework.context.ApplicationContext; import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware; import org.springframework.context.ApplicationContextAware;
import org.springframework.extensions.surf.util.ISO8601DateFormat; import org.springframework.extensions.surf.util.ISO8601DateFormat;
/** /**
* Support class for metadata extracters that support dynamic and config-driven * Support class for metadata extracters that support dynamic and config-driven
@@ -126,7 +126,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
private static final String PROP_DEFAULT_TIMEOUT = "content.metadataExtracter.default.timeoutMs"; private static final String PROP_DEFAULT_TIMEOUT = "content.metadataExtracter.default.timeoutMs";
public static final String PROPERTY_PREFIX_METADATA = "metadata."; public static final String PROPERTY_PREFIX_METADATA = "metadata.";
public static final String PROPERTY_COMPONENT_EXTRACT = ".extract."; public static final String PROPERTY_COMPONENT_EXTRACT = ".extract.";
public static final String PROPERTY_COMPONENT_EMBED = ".embed."; public static final String PROPERTY_COMPONENT_EMBED = ".embed.";
public static final int MEGABYTE_SIZE = 1048576; public static final int MEGABYTE_SIZE = 1048576;
protected static Log logger = LogFactory.getLog(AbstractMappingMetadataExtracter.class); protected static Log logger = LogFactory.getLog(AbstractMappingMetadataExtracter.class);
@@ -151,8 +151,8 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
private Properties properties; private Properties properties;
private Map<String, MetadataExtracterLimits> mimetypeLimits; private Map<String, MetadataExtracterLimits> mimetypeLimits;
private ExecutorService executorService; private ExecutorService executorService;
protected MetadataExtracterConfig metadataExtracterConfig; protected MetadataExtracterConfig metadataExtracterConfig;
private static final AtomicInteger CONCURRENT_EXTRACTIONS_COUNT = new AtomicInteger(0); private static final AtomicInteger CONCURRENT_EXTRACTIONS_COUNT = new AtomicInteger(0);
/** /**
@@ -259,7 +259,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
* {@inheritDoc} * {@inheritDoc}
* *
* @see #setSupportedMimetypes(Collection) * @see #setSupportedMimetypes(Collection)
*/ */
@Override @Override
public boolean isSupported(String sourceMimetype) public boolean isSupported(String sourceMimetype)
{ {
@@ -271,7 +271,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
* *
* @see #setSupportedEmbedMimetypes(Collection) * @see #setSupportedEmbedMimetypes(Collection)
*/ */
@Override @Override
public boolean isEmbeddingSupported(String sourceMimetype) public boolean isEmbeddingSupported(String sourceMimetype)
{ {
if (supportedEmbedMimetypes == null) if (supportedEmbedMimetypes == null)
@@ -314,18 +314,6 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
this.overwritePolicy = overwritePolicy; this.overwritePolicy = overwritePolicy;
} }
/**
* Set the policy to use when existing values are encountered. Depending on how the extractor
* is called, this may not be relevant, i.e an empty map of existing properties may be passed
* in by the client code, which may follow its own overwrite strategy.
*
* @param overwritePolicyStr the policy to apply when there are existing system properties
*/
public void setOverwritePolicy(String overwritePolicyStr)
{
this.overwritePolicy = OverwritePolicy.valueOf(overwritePolicyStr);
}
/** /**
* Set whether the extractor should discard metadata that fails to convert to the target type * Set whether the extractor should discard metadata that fails to convert to the target type
* defined in the data dictionary model. This is <tt>true</tt> by default i.e. if the data * defined in the data dictionary model. This is <tt>true</tt> by default i.e. if the data
@@ -1152,7 +1140,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
/** /**
* {@inheritDoc} * {@inheritDoc}
*/ */
@Override @Override
public final Map<QName, Serializable> extract(ContentReader reader, Map<QName, Serializable> destination) public final Map<QName, Serializable> extract(ContentReader reader, Map<QName, Serializable> destination)
{ {
return extract(reader, this.overwritePolicy, destination, this.mapping); return extract(reader, this.overwritePolicy, destination, this.mapping);
@@ -1161,7 +1149,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
/** /**
* {@inheritDoc} * {@inheritDoc}
*/ */
@Override @Override
public final Map<QName, Serializable> extract( public final Map<QName, Serializable> extract(
ContentReader reader, ContentReader reader,
OverwritePolicy overwritePolicy, OverwritePolicy overwritePolicy,
@@ -1173,7 +1161,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
/** /**
* {@inheritDoc} * {@inheritDoc}
*/ */
@Override @Override
public Map<QName, Serializable> extract( public Map<QName, Serializable> extract(
ContentReader reader, ContentReader reader,
OverwritePolicy overwritePolicy, OverwritePolicy overwritePolicy,
@@ -1225,12 +1213,12 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
logger.debug("Extracted Metadata from " + reader + "\n Found: " + logger.debug("Extracted Metadata from " + reader + "\n Found: " +
rawMetadata + "\n Mapped and Accepted: " + changedProperties); rawMetadata + "\n Mapped and Accepted: " + changedProperties);
} }
} }
catch (LimitExceededException e) catch (LimitExceededException e)
{ {
logger.warn("Metadata extraction rejected: \n" + logger.warn("Metadata extraction rejected: \n" +
" Extracter: " + this + "\n" + " Extracter: " + this + "\n" +
" Reason: " + e.getMessage()); " Reason: " + e.getMessage());
} }
catch (Throwable e) catch (Throwable e)
{ {
@@ -1303,7 +1291,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
/** /**
* {@inheritDoc} * {@inheritDoc}
*/ */
@Override @Override
public final void embed( public final void embed(
Map<QName, Serializable> properties, Map<QName, Serializable> properties,
ContentReader reader, ContentReader reader,
@@ -1980,7 +1968,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
* Gets the metadata extracter limits for the given mimetype. * Gets the metadata extracter limits for the given mimetype.
* <p> * <p>
* A specific match for the given mimetype is tried first and * A specific match for the given mimetype is tried first and
* if none is found a wildcard of "*" is tried, if still not found * if none is found a wildcard of "*" is tried, if still not found
* defaults value will be used * defaults value will be used
* *
* @param mimetype String * @param mimetype String
@@ -1997,11 +1985,11 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
if (limits == null) if (limits == null)
{ {
limits = mimetypeLimits.get("*"); limits = mimetypeLimits.get("*");
} }
if (limits == null) if (limits == null)
{ {
limits = new MetadataExtracterLimits(); limits = new MetadataExtracterLimits();
} }
return limits; return limits;
} }
@@ -2045,19 +2033,19 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
{ {
super(cause); super(cause);
} }
} }
/** /**
* Exception wrapper to handle exceeded limits imposed by {@link MetadataExtracterLimits} * Exception wrapper to handle exceeded limits imposed by {@link MetadataExtracterLimits}
* {@link AbstractMappingMetadataExtracter#extractRaw(ContentReader, MetadataExtracterLimits)} * {@link AbstractMappingMetadataExtracter#extractRaw(ContentReader, MetadataExtracterLimits)}
*/ */
private class LimitExceededException extends Exception private class LimitExceededException extends Exception
{ {
private static final long serialVersionUID = 702554119174770130L; private static final long serialVersionUID = 702554119174770130L;
public LimitExceededException(String message) public LimitExceededException(String message)
{ {
super(message); super(message);
} }
} }
/** /**
@@ -2081,32 +2069,32 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
ContentReader reader, MetadataExtracterLimits limits) throws Throwable ContentReader reader, MetadataExtracterLimits limits) throws Throwable
{ {
FutureTask<Map<String, Serializable>> task = null; FutureTask<Map<String, Serializable>> task = null;
StreamAwareContentReaderProxy proxiedReader = null; StreamAwareContentReaderProxy proxiedReader = null;
if (reader.getSize() > limits.getMaxDocumentSizeMB() * MEGABYTE_SIZE) if (reader.getSize() > limits.getMaxDocumentSizeMB() * MEGABYTE_SIZE)
{ {
throw new LimitExceededException("Max doc size exceeded " + limits.getMaxDocumentSizeMB() + " MB"); throw new LimitExceededException("Max doc size exceeded " + limits.getMaxDocumentSizeMB() + " MB");
} }
synchronized (CONCURRENT_EXTRACTIONS_COUNT) synchronized (CONCURRENT_EXTRACTIONS_COUNT)
{ {
if (logger.isDebugEnabled()) if (logger.isDebugEnabled())
{ {
logger.debug("Concurrent extractions : " + CONCURRENT_EXTRACTIONS_COUNT.get()); logger.debug("Concurrent extractions : " + CONCURRENT_EXTRACTIONS_COUNT.get());
} }
if (CONCURRENT_EXTRACTIONS_COUNT.get() < limits.getMaxConcurrentExtractionsCount()) if (CONCURRENT_EXTRACTIONS_COUNT.get() < limits.getMaxConcurrentExtractionsCount())
{ {
int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.incrementAndGet(); int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.incrementAndGet();
if (logger.isDebugEnabled()) if (logger.isDebugEnabled())
{ {
logger.debug("New extraction accepted. Concurrent extractions : " + totalDocCount); logger.debug("New extraction accepted. Concurrent extractions : " + totalDocCount);
} }
} }
else else
{ {
throw new LimitExceededException("Reached concurrent extractions limit - " + limits.getMaxConcurrentExtractionsCount()); throw new LimitExceededException("Reached concurrent extractions limit - " + limits.getMaxConcurrentExtractionsCount());
} }
} }
try try
{ {
@@ -2140,13 +2128,13 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
} }
throw cause; throw cause;
} }
finally finally
{ {
int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.decrementAndGet(); int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.decrementAndGet();
if (logger.isDebugEnabled()) if (logger.isDebugEnabled())
{ {
logger.debug("Extraction finalized. Remaining concurrent extraction : " + totalDocCount); logger.debug("Extraction finalized. Remaining concurrent extraction : " + totalDocCount);
} }
} }
} }

View File

@@ -1,28 +1,28 @@
/* /*
* #%L * #%L
* Alfresco Repository * Alfresco Repository
* %% * %%
* Copyright (C) 2005 - 2016 Alfresco Software Limited * Copyright (C) 2005 - 2016 Alfresco Software Limited
* %% * %%
* This file is part of the Alfresco software. * This file is part of the Alfresco software.
* If the software was purchased under a paid Alfresco license, the terms of * If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is * the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms: * provided under the following open source license terms:
* *
* Alfresco is free software: you can redistribute it and/or modify * Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by * it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version. * (at your option) any later version.
* *
* Alfresco is distributed in the hope that it will be useful, * Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details. * GNU Lesser General Public License for more details.
* *
* You should have received a copy of the GNU Lesser General Public License * You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>. * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L% * #L%
*/ */
package org.alfresco.repo.content.metadata.xml; package org.alfresco.repo.content.metadata.xml;
@@ -51,7 +51,7 @@ import org.alfresco.util.PropertyCheck;
* most appropriate of a given XML document. The chosen extracter is then asked * most appropriate of a given XML document. The chosen extracter is then asked
* to extract the values, passing through the * to extract the values, passing through the
* {@code MetadataExtracter.OverwritePolicy} as * {@code MetadataExtracter.OverwritePolicy} as
* {@linkplain #setOverwritePolicy(String)} on this instance. The overwrite * {@linkplain #setOverwritePolicy(org.alfresco.repo.content.metadata.MetadataExtracter.OverwritePolicy)} on this instance. The overwrite
* policy of the embedded extracters is not relevant unless they are used * policy of the embedded extracters is not relevant unless they are used
* separately in another context. * separately in another context.
* *