diff --git a/config/alfresco/content-services-context.xml b/config/alfresco/content-services-context.xml index 76a838bb52..b619f79fc6 100644 --- a/config/alfresco/content-services-context.xml +++ b/config/alfresco/content-services-context.xml @@ -341,6 +341,9 @@ + + ${content.metadataExtracter.pdf.overwritePolicy} + diff --git a/config/alfresco/repository.properties b/config/alfresco/repository.properties index 3570218307..0101978710 100644 --- a/config/alfresco/repository.properties +++ b/config/alfresco/repository.properties @@ -644,6 +644,9 @@ content.metadataExtracter.parseShapes=false content.metadataExtracter.pdf.maxDocumentSizeMB=10 content.metadataExtracter.pdf.maxConcurrentExtractionsCount=5 +# The default overwrite policy for PdfBoxMetadataExtracter +content.metadataExtracter.pdf.overwritePolicy=PRAGMATIC + # Property to enable upgrade from 2.1-A V2.1-A.fixes.to.schema=0 #V2.1-A.fixes.to.schema=82 diff --git a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java index 24963733f6..4dce2ab19e 100644 --- a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java @@ -1,82 +1,82 @@ -/* - * #%L - * Alfresco Repository - * %% - * Copyright (C) 2005 - 2016 Alfresco Software Limited - * %% - * This file is part of the Alfresco software. - * If the software was purchased under a paid Alfresco license, the terms of - * the paid license agreement will prevail. Otherwise, the software is - * provided under the following open source license terms: - * - * Alfresco is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Alfresco is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with Alfresco. If not, see . - * #L% - */ +/* + * #%L + * Alfresco Repository + * %% + * Copyright (C) 2005 - 2016 Alfresco Software Limited + * %% + * This file is part of the Alfresco software. + * If the software was purchased under a paid Alfresco license, the terms of + * the paid license agreement will prevail. Otherwise, the software is + * provided under the following open source license terms: + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see . + * #L% + */ package org.alfresco.repo.content.metadata; -import java.io.InputStream; -import java.io.Serializable; -import java.lang.reflect.Array; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Date; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Properties; -import java.util.Set; -import java.util.StringTokenizer; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; +import java.io.InputStream; +import java.io.Serializable; +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Date; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Properties; +import java.util.Set; +import java.util.StringTokenizer; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import java.util.concurrent.FutureTask; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; -import java.util.concurrent.atomic.AtomicInteger; - -import javax.activation.MimeType; - +import java.util.concurrent.atomic.AtomicInteger; + +import javax.activation.MimeType; + import org.alfresco.api.AlfrescoPublicApi; import org.alfresco.error.AlfrescoRuntimeException; import org.alfresco.model.ContentModel; -import org.alfresco.repo.content.StreamAwareContentReaderProxy; -import org.alfresco.service.cmr.dictionary.DataTypeDefinition; -import org.alfresco.service.cmr.dictionary.DictionaryService; -import org.alfresco.service.cmr.dictionary.PropertyDefinition; -import org.alfresco.service.cmr.repository.ContentIOException; -import org.alfresco.service.cmr.repository.ContentReader; -import org.alfresco.service.cmr.repository.ContentWriter; -import org.alfresco.service.cmr.repository.MalformedNodeRefException; -import org.alfresco.service.cmr.repository.MimetypeService; -import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; -import org.alfresco.service.cmr.repository.datatype.TypeConversionException; -import org.alfresco.service.namespace.InvalidQNameException; -import org.alfresco.service.namespace.QName; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.joda.time.DateTime; -import org.joda.time.format.DateTimeFormat; -import org.joda.time.format.DateTimeFormatter; -import org.springframework.beans.factory.BeanNameAware; -import org.springframework.context.ApplicationContext; -import org.springframework.context.ApplicationContextAware; -import org.springframework.extensions.surf.util.ISO8601DateFormat; +import org.alfresco.repo.content.StreamAwareContentReaderProxy; +import org.alfresco.service.cmr.dictionary.DataTypeDefinition; +import org.alfresco.service.cmr.dictionary.DictionaryService; +import org.alfresco.service.cmr.dictionary.PropertyDefinition; +import org.alfresco.service.cmr.repository.ContentIOException; +import org.alfresco.service.cmr.repository.ContentReader; +import org.alfresco.service.cmr.repository.ContentWriter; +import org.alfresco.service.cmr.repository.MalformedNodeRefException; +import org.alfresco.service.cmr.repository.MimetypeService; +import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; +import org.alfresco.service.cmr.repository.datatype.TypeConversionException; +import org.alfresco.service.namespace.InvalidQNameException; +import org.alfresco.service.namespace.QName; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.joda.time.DateTime; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; +import org.springframework.beans.factory.BeanNameAware; +import org.springframework.context.ApplicationContext; +import org.springframework.context.ApplicationContextAware; +import org.springframework.extensions.surf.util.ISO8601DateFormat; /** * Support class for metadata extracters that support dynamic and config-driven @@ -126,7 +126,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac private static final String PROP_DEFAULT_TIMEOUT = "content.metadataExtracter.default.timeoutMs"; public static final String PROPERTY_PREFIX_METADATA = "metadata."; public static final String PROPERTY_COMPONENT_EXTRACT = ".extract."; - public static final String PROPERTY_COMPONENT_EMBED = ".embed."; + public static final String PROPERTY_COMPONENT_EMBED = ".embed."; public static final int MEGABYTE_SIZE = 1048576; protected static Log logger = LogFactory.getLog(AbstractMappingMetadataExtracter.class); @@ -151,8 +151,8 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac private Properties properties; private Map mimetypeLimits; private ExecutorService executorService; - protected MetadataExtracterConfig metadataExtracterConfig; - + protected MetadataExtracterConfig metadataExtracterConfig; + private static final AtomicInteger CONCURRENT_EXTRACTIONS_COUNT = new AtomicInteger(0); /** @@ -259,7 +259,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac * {@inheritDoc} * * @see #setSupportedMimetypes(Collection) - */ + */ @Override public boolean isSupported(String sourceMimetype) { @@ -271,7 +271,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac * * @see #setSupportedEmbedMimetypes(Collection) */ - @Override + @Override public boolean isEmbeddingSupported(String sourceMimetype) { if (supportedEmbedMimetypes == null) @@ -314,18 +314,6 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac this.overwritePolicy = overwritePolicy; } - /** - * Set the policy to use when existing values are encountered. Depending on how the extractor - * is called, this may not be relevant, i.e an empty map of existing properties may be passed - * in by the client code, which may follow its own overwrite strategy. - * - * @param overwritePolicyStr the policy to apply when there are existing system properties - */ - public void setOverwritePolicy(String overwritePolicyStr) - { - this.overwritePolicy = OverwritePolicy.valueOf(overwritePolicyStr); - } - /** * Set whether the extractor should discard metadata that fails to convert to the target type * defined in the data dictionary model. This is true by default i.e. if the data @@ -1152,7 +1140,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac /** * {@inheritDoc} */ - @Override + @Override public final Map extract(ContentReader reader, Map destination) { return extract(reader, this.overwritePolicy, destination, this.mapping); @@ -1161,7 +1149,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac /** * {@inheritDoc} */ - @Override + @Override public final Map extract( ContentReader reader, OverwritePolicy overwritePolicy, @@ -1173,7 +1161,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac /** * {@inheritDoc} */ - @Override + @Override public Map extract( ContentReader reader, OverwritePolicy overwritePolicy, @@ -1225,12 +1213,12 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac logger.debug("Extracted Metadata from " + reader + "\n Found: " + rawMetadata + "\n Mapped and Accepted: " + changedProperties); } - } - catch (LimitExceededException e) - { - logger.warn("Metadata extraction rejected: \n" + - " Extracter: " + this + "\n" + - " Reason: " + e.getMessage()); + } + catch (LimitExceededException e) + { + logger.warn("Metadata extraction rejected: \n" + + " Extracter: " + this + "\n" + + " Reason: " + e.getMessage()); } catch (Throwable e) { @@ -1303,7 +1291,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac /** * {@inheritDoc} */ - @Override + @Override public final void embed( Map properties, ContentReader reader, @@ -1980,7 +1968,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac * Gets the metadata extracter limits for the given mimetype. *

* A specific match for the given mimetype is tried first and - * if none is found a wildcard of "*" is tried, if still not found + * if none is found a wildcard of "*" is tried, if still not found * defaults value will be used * * @param mimetype String @@ -1997,11 +1985,11 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac if (limits == null) { limits = mimetypeLimits.get("*"); - } - if (limits == null) - { - limits = new MetadataExtracterLimits(); - } + } + if (limits == null) + { + limits = new MetadataExtracterLimits(); + } return limits; } @@ -2045,19 +2033,19 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac { super(cause); } - } + } /** - * Exception wrapper to handle exceeded limits imposed by {@link MetadataExtracterLimits} - * {@link AbstractMappingMetadataExtracter#extractRaw(ContentReader, MetadataExtracterLimits)} - */ - private class LimitExceededException extends Exception - { - private static final long serialVersionUID = 702554119174770130L; - public LimitExceededException(String message) - { - super(message); - } + * Exception wrapper to handle exceeded limits imposed by {@link MetadataExtracterLimits} + * {@link AbstractMappingMetadataExtracter#extractRaw(ContentReader, MetadataExtracterLimits)} + */ + private class LimitExceededException extends Exception + { + private static final long serialVersionUID = 702554119174770130L; + public LimitExceededException(String message) + { + super(message); + } } /** @@ -2081,32 +2069,32 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac ContentReader reader, MetadataExtracterLimits limits) throws Throwable { FutureTask> task = null; - StreamAwareContentReaderProxy proxiedReader = null; - - if (reader.getSize() > limits.getMaxDocumentSizeMB() * MEGABYTE_SIZE) - { - throw new LimitExceededException("Max doc size exceeded " + limits.getMaxDocumentSizeMB() + " MB"); - } - - synchronized (CONCURRENT_EXTRACTIONS_COUNT) - { - if (logger.isDebugEnabled()) - { - logger.debug("Concurrent extractions : " + CONCURRENT_EXTRACTIONS_COUNT.get()); - } - if (CONCURRENT_EXTRACTIONS_COUNT.get() < limits.getMaxConcurrentExtractionsCount()) - { - int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.incrementAndGet(); - if (logger.isDebugEnabled()) - { - logger.debug("New extraction accepted. Concurrent extractions : " + totalDocCount); - } - } - else - { - throw new LimitExceededException("Reached concurrent extractions limit - " + limits.getMaxConcurrentExtractionsCount()); - } - } + StreamAwareContentReaderProxy proxiedReader = null; + + if (reader.getSize() > limits.getMaxDocumentSizeMB() * MEGABYTE_SIZE) + { + throw new LimitExceededException("Max doc size exceeded " + limits.getMaxDocumentSizeMB() + " MB"); + } + + synchronized (CONCURRENT_EXTRACTIONS_COUNT) + { + if (logger.isDebugEnabled()) + { + logger.debug("Concurrent extractions : " + CONCURRENT_EXTRACTIONS_COUNT.get()); + } + if (CONCURRENT_EXTRACTIONS_COUNT.get() < limits.getMaxConcurrentExtractionsCount()) + { + int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.incrementAndGet(); + if (logger.isDebugEnabled()) + { + logger.debug("New extraction accepted. Concurrent extractions : " + totalDocCount); + } + } + else + { + throw new LimitExceededException("Reached concurrent extractions limit - " + limits.getMaxConcurrentExtractionsCount()); + } + } try { @@ -2140,13 +2128,13 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac } throw cause; } - finally - { - int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.decrementAndGet(); - if (logger.isDebugEnabled()) - { - logger.debug("Extraction finalized. Remaining concurrent extraction : " + totalDocCount); - } + finally + { + int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.decrementAndGet(); + if (logger.isDebugEnabled()) + { + logger.debug("Extraction finalized. Remaining concurrent extraction : " + totalDocCount); + } } } diff --git a/source/java/org/alfresco/repo/content/metadata/xml/XmlMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/xml/XmlMetadataExtracter.java index 4f307717d7..775e6ee9f6 100644 --- a/source/java/org/alfresco/repo/content/metadata/xml/XmlMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/xml/XmlMetadataExtracter.java @@ -1,28 +1,28 @@ -/* - * #%L - * Alfresco Repository - * %% - * Copyright (C) 2005 - 2016 Alfresco Software Limited - * %% - * This file is part of the Alfresco software. - * If the software was purchased under a paid Alfresco license, the terms of - * the paid license agreement will prevail. Otherwise, the software is - * provided under the following open source license terms: - * - * Alfresco is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Alfresco is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with Alfresco. If not, see . - * #L% - */ +/* + * #%L + * Alfresco Repository + * %% + * Copyright (C) 2005 - 2016 Alfresco Software Limited + * %% + * This file is part of the Alfresco software. + * If the software was purchased under a paid Alfresco license, the terms of + * the paid license agreement will prevail. Otherwise, the software is + * provided under the following open source license terms: + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see . + * #L% + */ package org.alfresco.repo.content.metadata.xml; @@ -51,7 +51,7 @@ import org.alfresco.util.PropertyCheck; * most appropriate of a given XML document. The chosen extracter is then asked * to extract the values, passing through the * {@code MetadataExtracter.OverwritePolicy} as - * {@linkplain #setOverwritePolicy(String)} on this instance. The overwrite + * {@linkplain #setOverwritePolicy(org.alfresco.repo.content.metadata.MetadataExtracter.OverwritePolicy)} on this instance. The overwrite * policy of the embedded extracters is not relevant unless they are used * separately in another context. *