mirror of
				https://github.com/Alfresco/alfresco-community-repo.git
				synced 2025-10-22 15:12:38 +00:00 
			
		
		
		
	125842 rmunteanu: Merged V4.2-BUG-FIX (4.2.7) to 5.0.N (5.0.4) (PARTIAL MERGE)
      125700 adavis: Merged V4.2.5 (4.2.5.7) to V4.2-BUG-FIX (4.2.7)
         125698: Merged DEV to V4.2.5 (4.2.5.7)
            125677 arebegea: MNT-15219 : Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may cause OutOfMemory in Tika
               - Should not have updated version.properties as the original commit needs to be merged forwards.,
         125696: Merged DEV to V4.2.5 (4.2.5.7)
            125677 arebegea: MNT-15219 : Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may cause OutOfMemory in Tika
               - Modified tika parser and tika core jars to allow some configuration parameters to be sent from Alfresco side using the metadata map parameter
               - Excluded by default the parsing of drawings/shapes xmls because there was little valuable data that could be extracted from those xmls
git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/BRANCHES/DEV/5.1.N/root@125892 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
		
	
		
			
				
	
	
		
			2164 lines
		
	
	
		
			86 KiB
		
	
	
	
		
			Java
		
	
	
	
	
	
			
		
		
	
	
			2164 lines
		
	
	
		
			86 KiB
		
	
	
	
		
			Java
		
	
	
	
	
	
| /*
 | |
|  * #%L
 | |
|  * Alfresco Repository
 | |
|  * %%
 | |
|  * Copyright (C) 2005 - 2016 Alfresco Software Limited
 | |
|  * %%
 | |
|  * This file is part of the Alfresco software. 
 | |
|  * If the software was purchased under a paid Alfresco license, the terms of 
 | |
|  * the paid license agreement will prevail.  Otherwise, the software is 
 | |
|  * provided under the following open source license terms:
 | |
|  * 
 | |
|  * Alfresco is free software: you can redistribute it and/or modify
 | |
|  * it under the terms of the GNU Lesser General Public License as published by
 | |
|  * the Free Software Foundation, either version 3 of the License, or
 | |
|  * (at your option) any later version.
 | |
|  * 
 | |
|  * Alfresco is distributed in the hope that it will be useful,
 | |
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|  * GNU Lesser General Public License for more details.
 | |
|  * 
 | |
|  * You should have received a copy of the GNU Lesser General Public License
 | |
|  * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
 | |
|  * #L%
 | |
|  */
 | |
| package org.alfresco.repo.content.metadata;
 | |
| 
 | |
| import java.io.InputStream;
 | |
| import java.io.Serializable;
 | |
| import java.lang.reflect.Array;
 | |
| import java.util.ArrayList;
 | |
| import java.util.Collection;
 | |
| import java.util.Collections;
 | |
| import java.util.Date;
 | |
| import java.util.HashMap;
 | |
| import java.util.HashSet;
 | |
| import java.util.List;
 | |
| import java.util.Locale;
 | |
| import java.util.Map;
 | |
| import java.util.Map.Entry;
 | |
| import java.util.Properties;
 | |
| import java.util.Set;
 | |
| import java.util.StringTokenizer;
 | |
| import java.util.concurrent.Callable;
 | |
| import java.util.concurrent.ExecutionException;
 | |
| import java.util.concurrent.ExecutorService;
 | |
| import java.util.concurrent.Executors;
 | |
| import java.util.concurrent.FutureTask;
 | |
| import java.util.concurrent.TimeUnit;
 | |
| import java.util.concurrent.TimeoutException;
 | |
| 
 | |
| import org.alfresco.api.AlfrescoPublicApi;     
 | |
| import org.alfresco.error.AlfrescoRuntimeException;
 | |
| import org.alfresco.model.ContentModel;
 | |
| import org.alfresco.repo.content.StreamAwareContentReaderProxy;
 | |
| import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
 | |
| import org.alfresco.service.cmr.dictionary.DictionaryService;
 | |
| import org.alfresco.service.cmr.dictionary.PropertyDefinition;
 | |
| import org.alfresco.service.cmr.repository.ContentIOException;
 | |
| import org.alfresco.service.cmr.repository.ContentReader;
 | |
| import org.alfresco.service.cmr.repository.ContentWriter;
 | |
| import org.alfresco.service.cmr.repository.MalformedNodeRefException;
 | |
| import org.alfresco.service.cmr.repository.MimetypeService;
 | |
| import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
 | |
| import org.alfresco.service.cmr.repository.datatype.TypeConversionException;
 | |
| import org.alfresco.service.namespace.InvalidQNameException;
 | |
| import org.alfresco.service.namespace.QName;
 | |
| import org.apache.commons.logging.Log;
 | |
| import org.apache.commons.logging.LogFactory;
 | |
| import org.apache.xmlbeans.impl.xb.xsdschema.All;
 | |
| import org.joda.time.DateTime;
 | |
| import org.joda.time.format.DateTimeFormat;
 | |
| import org.joda.time.format.DateTimeFormatter;
 | |
| import org.springframework.beans.factory.BeanNameAware;
 | |
| import org.springframework.context.ApplicationContext;
 | |
| import org.springframework.context.ApplicationContextAware;
 | |
| import org.springframework.extensions.surf.util.ISO8601DateFormat;
 | |
| 
 | |
| /**
 | |
|  * Support class for metadata extracters that support dynamic and config-driven
 | |
|  * mapping between extracted values and model properties.  Extraction is broken
 | |
|  * up into two phases:
 | |
|  * <ul>
 | |
|  *   <li>Extract ALL available metadata from the document.</li>
 | |
|  *   <li>Translate the metadata into system properties.</li>
 | |
|  * </ul>
 | |
|  * <p>
 | |
|  * Migrating an existing extracter to use this class is straightforward:
 | |
|  * <ul>
 | |
|  *   <li>
 | |
|  *   Construct the extracter providing a default set of supported mimetypes to this
 | |
|  *   implementation.  This can be overwritten with configurations.
 | |
|  *   </li>
 | |
|  *   <li>
 | |
|  *   Implement the {@link #extract} method.  This now returns a raw map of extracted
 | |
|  *   values keyed by document-specific property names.  The <b>trimPut</b> method has
 | |
|  *   been replaced with an equivalent {@link #putRawValue(String, Serializable, Map)}.
 | |
|  *   </li>
 | |
|  *   <li>
 | |
|  *   Provide the default mapping of the document-specific properties to system-specific
 | |
|  *   properties as described by the {@link #getDefaultMapping()} method.  The simplest
 | |
|  *   is to provide the default mapping in a correlated <i>.properties</i> file.
 | |
|  *   </li>
 | |
|  *   <li>
 | |
|  *   Document, in the class-level javadoc, all the available properties that are extracted
 | |
|  *   along with their approximate meanings.  Add to this, the default mappings.
 | |
|  *   </li>
 | |
|  * </ul>
 | |
|  * 
 | |
|  * @see #getDefaultMapping()
 | |
|  * @see #extractRaw(ContentReader)
 | |
|  * @see #setMapping(Map)
 | |
|  * 
 | |
|  * @since 2.1
 | |
|  * 
 | |
|  * @author Jesper Steen Møller
 | |
|  * @author Derek Hulley
 | |
|  */
 | |
| @AlfrescoPublicApi
 | |
| abstract public class AbstractMappingMetadataExtracter implements MetadataExtracter, MetadataEmbedder, BeanNameAware, ApplicationContextAware
 | |
| {
 | |
|     public static final String NAMESPACE_PROPERTY_PREFIX = "namespace.prefix.";   // key prefix of the namespace declarations in mapping properties
 | |
|     private static final String ERR_TYPE_CONVERSION = "metadata.extraction.err.type_conversion";   // NOTE(review): presumably a message key; its use is outside this view
 | |
|     private static final String PROP_DEFAULT_TIMEOUT = "content.metadataExtracter.default.timeoutMs";   // NOTE(review): presumably a property name for a default timeout in ms; its use is outside this view
 | |
|     public static final String PROPERTY_PREFIX_METADATA = "metadata.";   // first part of the global property prefix built in getRelevantGlobalProperties
 | |
|     public static final String PROPERTY_COMPONENT_EXTRACT = ".extract.";   // NOTE(review): presumably the propertyComponent for extract mappings - confirm against callers
 | |
|     public static final String PROPERTY_COMPONENT_EMBED = ".embed.";   // NOTE(review): presumably the propertyComponent for embed mappings - confirm against callers
 | |
|     
 | |
|     protected static Log logger = LogFactory.getLog(AbstractMappingMetadataExtracter.class);   // logger shared with subclasses
 | |
|     
 | |
|     private MetadataExtracterRegistry registry;   // optional; see setRegistry
 | |
|     private MimetypeService mimetypeService;   // optional; isEnabled needs it to resolve file extensions
 | |
|     private DictionaryService dictionaryService;   // see setDictionaryService
 | |
|     private boolean initialized;   // false after construction; getMapping/getEmbedMapping refuse to answer while false
 | |
|     
 | |
|     private Set<String> supportedMimetypes;   // mimetypes supported for extraction
 | |
|     private Set<String> supportedEmbedMimetypes;   // may be null, in which case isEmbeddingSupported returns false
 | |
|     private OverwritePolicy overwritePolicy;   // PRAGMATIC unless changed
 | |
|     private boolean failOnTypeConversion;   // true unless changed
 | |
|     private Set<DateTimeFormatter> supportedDateFormatters;   // built by setSupportedDateFormats
 | |
|     private Map<String, Set<QName>> mapping;   // document property -> system properties; null until configured
 | |
|     private Map<QName, Set<String>> embedMapping;   // system property -> content file metadata keys; null until configured
 | |
|     private boolean inheritDefaultMapping;   // false unless changed
 | |
|     private boolean inheritDefaultEmbedMapping;   // false unless changed
 | |
|     private boolean enableStringTagging;   // not set by the constructors, so false unless changed
 | |
|     private String beanName;   // see setBeanName; also used to build property names in isEnabled
 | |
|     private ApplicationContext applicationContext;   // used to look up the "global-properties" bean
 | |
|     private Properties properties;   // the Alfresco global properties, see setProperties
 | |
|     private Map<String, MetadataExtracterLimits> mimetypeLimits;   // source mimetype -> extracter limits
 | |
|     private ExecutorService executorService;   // used for timeout-aware extraction
 | |
|     protected MetadataExtracterConfig metadataExtracterConfig;   // see setMetadataExtracterConfig
 | |
| 
 | |
/**
 * Default constructor, for use when the list of supported mimetypes is not known
 * when the instance is constructed.  Subclasses using it should either implement
 * {@link #isSupported(String)} or, once the set becomes known, call
 * {@link #setSupportedMimetypes(Collection)}.
 * <p>
 * The instance starts with an empty but <b>mutable</b> set of supported mimetypes,
 * so that populating it later cannot fail with an
 * <code>UnsupportedOperationException</code>, as it would on the immutable
 * <code>Collections.emptySet()</code>.
 *
 * @see #isSupported(String)
 * @see #setSupportedMimetypes(Collection)
 */
protected AbstractMappingMetadataExtracter()
{
    this(new HashSet<String>());
}
 | |
| 
 | |
/**
 * Creates an extracter for a set of mimetypes that is known up front and applies the
 * default configuration: pragmatic overwrite policy, failure on type conversion
 * problems, no explicit mappings and no inheritance of the default mappings.
 *
 * @param supportedMimetypes    the set of mimetypes supported by default; the set is
 *                              held by reference and not copied
 */
protected AbstractMappingMetadataExtracter(Set<String> supportedMimetypes)
{
    // Not yet initialized
    this.initialized = false;
    this.supportedMimetypes = supportedMimetypes;

    // Policy defaults
    this.overwritePolicy = OverwritePolicy.PRAGMATIC;
    this.failOnTypeConversion = true;

    // No explicit mappings: the default will be fetched
    this.mapping = null;
    this.embedMapping = null;

    // Any overrides are complete, i.e. they replace the defaults
    this.inheritDefaultMapping = false;
    this.inheritDefaultEmbedMapping = false;
}
 | |
| 
 | |
/**
 * Creates an extracter when both the extract and the embed mimetypes are known up front.
 * The extraction mimetypes and all defaults are handled by the single-argument
 * constructor; this one only adds the embed mimetypes.
 *
 * @param supportedMimetypes         the set of mimetypes supported for extraction by default
 * @param supportedEmbedMimetypes    the set of mimetypes supported for embedding by default;
 *                                   held by reference and not copied
 */
protected AbstractMappingMetadataExtracter(Set<String> supportedMimetypes, Set<String> supportedEmbedMimetypes)
{
    this(supportedMimetypes);
    this.supportedEmbedMimetypes = supportedEmbedMimetypes;
}
 | |
| 
 | |
|     /**
 | |
|      * Set the registry to register with.  If this is not set, then the default
 | |
|      * initialization will not auto-register the extracter for general use.  It
 | |
|      * can still be used directly.
 | |
|      * 
 | |
|      * @param registry a metadata extracter registry
 | |
|      */
 | |
|     public void setRegistry(MetadataExtracterRegistry registry)
 | |
|     {
 | |
|         this.registry = registry;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * @param mimetypeService       the mimetype service.  Set this if required.
 | |
|      */
 | |
|     public void setMimetypeService(MimetypeService mimetypeService)
 | |
|     {
 | |
|         this.mimetypeService = mimetypeService;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * @return Returns the mimetype helper
 | |
|      */
 | |
|     protected MimetypeService getMimetypeService()
 | |
|     {
 | |
|         return mimetypeService;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * @param dictionaryService     the dictionary service to determine which data conversions are necessary
 | |
|      */
 | |
|     public void setDictionaryService(DictionaryService dictionaryService)
 | |
|     {
 | |
|         this.dictionaryService = dictionaryService;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Set the mimetypes that are supported by the extracter.
 | |
|      * 
 | |
|      * @param supportedMimetypes Collection<String>
 | |
|      */
 | |
|     public void setSupportedMimetypes(Collection<String> supportedMimetypes)
 | |
|     {
 | |
|         this.supportedMimetypes.clear();
 | |
|         this.supportedMimetypes.addAll(supportedMimetypes);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Set the mimetypes that are supported for embedding.
 | |
|      *
 | |
|      * @param supportedEmbedMimetypes Collection<String>
 | |
|      */
 | |
|     public void setSupportedEmbedMimetypes(Collection<String> supportedEmbedMimetypes)
 | |
|     {
 | |
|         this.supportedEmbedMimetypes.clear();
 | |
|         this.supportedEmbedMimetypes.addAll(supportedEmbedMimetypes);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * {@inheritDoc}
 | |
|      * 
 | |
|      * @see #setSupportedMimetypes(Collection)
 | |
|      */
 | |
|     public boolean isSupported(String sourceMimetype)
 | |
|     {
 | |
|         return supportedMimetypes.contains(sourceMimetype) && isEnabled(sourceMimetype);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * {@inheritDoc}
 | |
|      *
 | |
|      * @see #setSupportedEmbedMimetypes(Collection)
 | |
|      */
 | |
|     public boolean isEmbeddingSupported(String sourceMimetype)
 | |
|     {
 | |
|         if (supportedEmbedMimetypes == null)
 | |
|         {
 | |
|             return false;
 | |
|         }
 | |
|         return supportedEmbedMimetypes.contains(sourceMimetype);
 | |
|     }
 | |
| 
 | |
|     private boolean isEnabled(String mimetype)
 | |
|     {
 | |
|         return properties == null || mimetypeService == null ||
 | |
|                (getBooleanProperty(beanName+".enabled", true) &&
 | |
|                 getBooleanProperty(beanName+'.'+mimetypeService.getExtension(mimetype)+".enabled", true));
 | |
|     }
 | |
| 
 | |
|     private boolean getBooleanProperty(String name, boolean defaultValue)
 | |
|     {
 | |
|         boolean value = defaultValue;
 | |
|         if (properties != null)
 | |
|         {
 | |
|             String property = properties.getProperty(name);
 | |
|             if (property != null)
 | |
|             {
 | |
|                 value = property.trim().equalsIgnoreCase("true");
 | |
|             }
 | |
|         }
 | |
|         return value;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
 | |
|      * @return      Returns <code>1.0</code> if the mimetype is supported, otherwise <tt>0.0</tt>
 | |
|      * 
 | |
|      * @see #isSupported(String)
 | |
|      */
 | |
|     public double getReliability(String mimetype)
 | |
|     {
 | |
|         return isSupported(mimetype) ? 1.0D : 0.0D;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Set the policy to use when existing values are encountered.  Depending on how the extractor
 | |
|      * is called, this may not be relevant, i.e an empty map of existing properties may be passed
 | |
|      * in by the client code, which may follow its own overwrite strategy.
 | |
|      * 
 | |
|      * @param overwritePolicy       the policy to apply when there are existing system properties
 | |
|      */
 | |
|     public void setOverwritePolicy(OverwritePolicy overwritePolicy)
 | |
|     {
 | |
|         this.overwritePolicy = overwritePolicy;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Set the policy to use when existing values are encountered.  Depending on how the extractor
 | |
|      * is called, this may not be relevant, i.e an empty map of existing properties may be passed
 | |
|      * in by the client code, which may follow its own overwrite strategy.
 | |
|      * 
 | |
|      * @param overwritePolicyStr    the policy to apply when there are existing system properties
 | |
|      */
 | |
|     public void setOverwritePolicy(String overwritePolicyStr)
 | |
|     {
 | |
|         this.overwritePolicy = OverwritePolicy.valueOf(overwritePolicyStr);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Set whether the extractor should discard metadata that fails to convert to the target type
 | |
|      * defined in the data dictionary model.  This is <tt>true</tt> by default i.e. if the data
 | |
|      * extracted is not compatible with the target model then the extraction will fail.  If this is
 | |
|      * <tt>false<tt> then any extracted data that fails to convert will be discarded.
 | |
|      * 
 | |
|      * @param failOnTypeConversion      <tt>false</tt> to discard properties that can't get converted
 | |
|      *                                  to the dictionary-defined type, or <tt>true</tt> (default)
 | |
|      *                                  to fail the extraction if the type doesn't convert
 | |
|      */
 | |
|     public void setFailOnTypeConversion(boolean failOnTypeConversion)
 | |
|     {
 | |
|         this.failOnTypeConversion = failOnTypeConversion;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Set the date formats, over and above the {@link ISO8601DateFormat ISO8601 format}, that will
 | |
|      * be supported for string to date conversions.  The supported syntax is described by the
 | |
|      * <a href="http://java.sun.com/j2se/1.5.0/docs/api/java/text/SimpleDateFormat.html">SimpleDateFormat Javadocs</a>.
 | |
|      * 
 | |
|      * @param supportedDateFormats      a list of supported date formats.
 | |
|      */
 | |
|     public void setSupportedDateFormats(List<String> supportedDateFormats)
 | |
|     {
 | |
|         supportedDateFormatters = new HashSet<DateTimeFormatter>();
 | |
|         
 | |
|         // Note: The previous version attempted to create a single DateTimeFormatter from
 | |
|         // multiple DateTimeFormatters, but that does not work as the time zone part is lost.
 | |
|         // Now have a set of them.
 | |
|         for (String dateFormatStr : supportedDateFormats)
 | |
|         {
 | |
|             try
 | |
|             {
 | |
|                 supportedDateFormatters.add(DateTimeFormat.forPattern(dateFormatStr));
 | |
|             }
 | |
|             catch (Throwable e)
 | |
|             {
 | |
|                 // No good
 | |
|                 throw new AlfrescoRuntimeException("Unable to set supported date format: " + dateFormatStr, e);
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Set if the property mappings augment or override the mapping generically provided by the
 | |
|      * extracter implementation.  The default is <tt>false</tt>, i.e. any mapping set completely
 | |
|      * replaces the {@link #getDefaultMapping() default mappings}.
 | |
|      * <p>
 | |
|      * Note that even when set to <tt>true</tt> an individual property mapping entry replaces the
 | |
|      * entry provided by the extracter implementation.
 | |
|      * 
 | |
|      * @param inheritDefaultMapping <tt>true</tt> to add the configured mapping
 | |
|      *                              to the list of default mappings.
 | |
|      * 
 | |
|      * @see #getDefaultMapping()
 | |
|      * @see #setMapping(Map)
 | |
|      * @see #setMappingProperties(Properties)
 | |
|      */
 | |
|     public void setInheritDefaultMapping(boolean inheritDefaultMapping)
 | |
|     {
 | |
|         this.inheritDefaultMapping = inheritDefaultMapping;
 | |
|     }
 | |
| 
 | |
|     @Override
 | |
|     public void setBeanName(String beanName)
 | |
|     {
 | |
|         this.beanName = beanName;
 | |
|     }
 | |
|     
 | |
|     public String getBeanName()
 | |
|     {
 | |
|         return beanName;
 | |
|     }
 | |
|     
 | |
|     public void setApplicationContext(ApplicationContext applicationContext)
 | |
|     {
 | |
|         this.applicationContext = applicationContext;
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * The Alfresco global properties.
 | |
|      */
 | |
|     public void setProperties(Properties properties)
 | |
|     {
 | |
|         this.properties = properties;
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * The metadata extracter config.
 | |
|      */
 | |
|     public void setMetadataExtracterConfig(MetadataExtracterConfig metadataExtracterConfig)
 | |
|     {
 | |
|         this.metadataExtracterConfig = metadataExtracterConfig;
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * Whether or not to enable the pass through of simple strings to cm:taggable tags
 | |
|      * 
 | |
|      * @param enableStringTagging       <tt>true</tt> find or create tags for each string 
 | |
|      *                                  mapped to cm:taggable.  <tt>false</tt> (default) 
 | |
|      *                                  ignore mapping strings to tags.
 | |
|      */
 | |
|     public void setEnableStringTagging(boolean enableStringTagging)
 | |
|     {
 | |
|         this.enableStringTagging = enableStringTagging;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Set if the embed property mappings augment or override the mapping generically provided by the
 | |
|      * extracter implementation.  The default is <tt>false</tt>, i.e. any mapping set completely
 | |
|      * replaces the {@link #getDefaultEmbedMapping() default mappings}.
 | |
|      * <p>
 | |
|      * Note that even when set to <tt>true</tt> an individual property mapping entry replaces the
 | |
|      * entry provided by the extracter implementation.
 | |
|      *
 | |
|      * @param inheritDefaultEmbedMapping <tt>true</tt> to add the configured embed mapping
 | |
|      *                              to the list of default embed mappings.
 | |
|      *
 | |
|      * @see #getDefaultEmbedMapping()
 | |
|      * @see #setEmbedMapping(Map)
 | |
|      * @see #setEmbedMappingProperties(Properties)
 | |
|      */
 | |
|     public void setInheritDefaultEmbedMapping(boolean inheritDefaultEmbedMapping)
 | |
|     {
 | |
|         this.inheritDefaultEmbedMapping = inheritDefaultEmbedMapping;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Sets the map of source mimetypes to metadata extracter limits.
 | |
|      * 
 | |
|      * @param mimetypeLimits Map<String, MetadataExtracterLimits>
 | |
|      */
 | |
|     public void setMimetypeLimits(Map<String, MetadataExtracterLimits> mimetypeLimits)
 | |
|     {
 | |
|         this.mimetypeLimits = mimetypeLimits;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Gets the <code>ExecutorService</code> to be used for timeout-aware
 | |
|      * extraction.
 | |
|      * <p>
 | |
|      * If no <code>ExecutorService</code> has been defined a default
 | |
|      * of <code>Executors.newCachedThreadPool()</code> is used during
 | |
|      * {@link AbstractMappingMetadataExtracter#init()}.
 | |
|      * 
 | |
|      * @return the defined or default <code>ExecutorService</code>
 | |
|      */
 | |
|     protected ExecutorService getExecutorService()
 | |
|     {
 | |
|         return executorService;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Sets the <code>ExecutorService</code> to be used for timeout-aware
 | |
|      * extraction.
 | |
|      * 
 | |
|      * @param executorService the <code>ExecutorService</code> for timeouts
 | |
|      */
 | |
|     public void setExecutorService(ExecutorService executorService)
 | |
|     {
 | |
|         this.executorService = executorService;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Set the mapping from document metadata to system metadata.  It is possible to direct
 | |
|      * an extracted document property to several system properties.  The conversion between
 | |
|      * the document property types and the system property types will be done by the
 | |
|      * {@link org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter default converter}.
 | |
|      * 
 | |
|      * @param mapping       a mapping from document metadata to system metadata
 | |
|      */
 | |
|     public void setMapping(Map<String, Set<QName>> mapping)
 | |
|     {
 | |
|         this.mapping = mapping;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Set the embed mapping from document metadata to system metadata.  It is possible to direct
 | |
|      * an model properties to several content file metadata keys.  The conversion between
 | |
|      * the model property types and the content file metadata keys types will be done by the
 | |
|      * {@link org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter default converter}.
 | |
|      *
 | |
|      * @param embedMapping       an embed mapping from model properties to content file metadata keys
 | |
|      */
 | |
|     public void setEmbedMapping(Map<QName, Set<String>> embedMapping)
 | |
|     {
 | |
|         this.embedMapping = embedMapping;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Set the properties that contain the mapping from document metadata to system metadata.
 | |
|      * This is an alternative to the {@link #setMapping(Map)} method.  Any mappings already
 | |
|      * present will be cleared out.
 | |
|      * 
 | |
|      * The property mapping is of the form:
 | |
|      * <pre>
 | |
|      * # Namespaces prefixes
 | |
|      * namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
 | |
|      * namespace.prefix.my=http://www....com/alfresco/1.0
 | |
|      * 
 | |
|      * # Mapping
 | |
|      * editor=cm:author, my:editor
 | |
|      * title=cm:title
 | |
|      * user1=cm:summary
 | |
|      * user2=cm:description
 | |
|      * </pre>
 | |
|      * The mapping can therefore be from a single document property onto several system properties.
 | |
|      * 
 | |
|      * @param mappingProperties     the properties that map document properties to system properties
 | |
|      */
 | |
|     public void setMappingProperties(Properties mappingProperties)
 | |
|     {
 | |
|         mapping = readMappingProperties(mappingProperties);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Set the properties that contain the embed mapping from model properties to content file metadata.
 | |
|      * This is an alternative to the {@link #setEmbedMapping(Map)} method.  Any mappings already
 | |
|      * present will be cleared out.
 | |
|      *
 | |
|      * The property mapping is of the form:
 | |
|      * <pre>
 | |
|      * # Namespaces prefixes
 | |
|      * namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
 | |
|      * namespace.prefix.my=http://www....com/alfresco/1.0
 | |
|      *
 | |
|      * # Mapping
 | |
|      * cm\:author=editor
 | |
|      * cm\:title=title
 | |
|      * cm\:summary=user1
 | |
|      * cm\:description=description,user2
 | |
|      * </pre>
 | |
|      * The embed mapping can therefore be from a model property onto several content file metadata properties.
 | |
|      *
 | |
|      * @param embedMappingProperties     the properties that map model properties to content file metadata properties
 | |
|      */
 | |
|     public void setEmbedMappingProperties(Properties embedMappingProperties)
 | |
|     {
 | |
|         embedMapping = readEmbedMappingProperties(embedMappingProperties);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Helper method for derived classes to obtain the mappings that will be applied to raw
 | |
|      * values.  This should be called after initialization in order to guarantee the complete
 | |
|      * map is given.
 | |
|      * <p>
 | |
|      * Normally, the list of properties that can be extracted from a document is fixed and
 | |
|      * well-known - in that case, just extract everything.  But Some implementations may have
 | |
|      * an extra, indeterminate set of values available for extraction.  If the extraction of
 | |
|      * these runtime parameters is expensive, then the keys provided by the return value can
 | |
|      * be used to extract values from the documents.  The metadata extraction becomes fully
 | |
|      * configuration-driven, i.e. declaring further mappings will result in more values being
 | |
|      * extracted from the documents.
 | |
|      * <p>
 | |
|      * Most extractors will not be using this method.  For an example of its use, see the
 | |
|      * {@linkplain OpenDocumentMetadataExtracter OpenDocument extractor}, which uses the mapping
 | |
|      * to select specific user properties from a document.
 | |
|      */
 | |
|     protected final Map<String, Set<QName>> getMapping()
 | |
|     {
 | |
|         if (!initialized)
 | |
|         {
 | |
|             throw new UnsupportedOperationException("The complete mapping is only available after initialization.");
 | |
|         }
 | |
|         return Collections.unmodifiableMap(mapping);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Helper method for derived classes to obtain the embed mappings.
 | |
|      * This should be called after initialization in order to guarantee the complete
 | |
|      * map is given.
 | |
|      * <p>
 | |
|      * Normally, the list of properties that can be embedded in a document is fixed and
 | |
|      * well-known..  But some implementations may have
 | |
|      * an extra, indeterminate set of values available for embedding.  If the embedding of
 | |
|      * these runtime parameters is expensive, then the keys provided by the return value can
 | |
|      * be used to embed values in the documents.  The metadata embedding becomes fully
 | |
|      * configuration-driven, i.e. declaring further mappings will result in more values being
 | |
|      * embedded in the documents.
 | |
|      */
 | |
|     protected final Map<QName, Set<String>> getEmbedMapping()
 | |
|     {
 | |
|         if (!initialized)
 | |
|         {
 | |
|             throw new UnsupportedOperationException("The complete embed mapping is only available after initialization.");
 | |
|         }
 | |
|         return Collections.unmodifiableMap(embedMapping);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * A utility method to read mapping properties from a resource file and convert to the map form.
 | |
|      * 
 | |
|      * @param propertiesUrl     A standard Properties file URL location
 | |
|      * 
 | |
|      * @see #setMappingProperties(Properties)
 | |
|      */
 | |
|     protected Map<String, Set<QName>> readMappingProperties(String propertiesUrl)
 | |
|     {
 | |
|         InputStream is = null;
 | |
|         try
 | |
|         {
 | |
|             is = getClass().getClassLoader().getResourceAsStream(propertiesUrl);
 | |
|             if(is == null)
 | |
|             {
 | |
|                 throw new AlfrescoRuntimeException(
 | |
|                         "Metadata Extracter mapping properties not found: \n" +
 | |
|                         "   Extracter:  " + this + "\n" +
 | |
|                         "   Bundle:     " + propertiesUrl);
 | |
|             }
 | |
|             Properties props = new Properties();
 | |
|             props.load(is);
 | |
|             // Process it
 | |
|             Map<String, Set<QName>> map = readMappingProperties(props);
 | |
|             // Done
 | |
|             if (logger.isDebugEnabled())
 | |
|             {
 | |
|                 logger.debug("Loaded mapping properties from resource: " + propertiesUrl);
 | |
|             }
 | |
|             return map;
 | |
|         }
 | |
|         catch (Throwable e)
 | |
|         {
 | |
|             throw new AlfrescoRuntimeException(
 | |
|                     "Unable to load properties file to read extracter mapping properties: \n" +
 | |
|                     "   Extracter:  " + this + "\n" +
 | |
|                     "   Bundle:     " + propertiesUrl,
 | |
|                     e);
 | |
|         }
 | |
|         finally
 | |
|         {
 | |
|             if (is != null)
 | |
|             {
 | |
|                 try { is.close(); } catch (Throwable e) {}
 | |
|             }
 | |
|         }
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * A utility method to convert global properties to the Map form for the given
 | |
|      * propertyComponent.
 | |
|      * <p>
 | |
|      * Mappings can be specified using the same method defined for
 | |
|      * normal mapping properties files but with a prefix of
 | |
|      * <code>metadata.extracter</code>, the extracter bean name, and the propertyComponent.
 | |
|      * For example:
 | |
|      * 
 | |
|      *     metadata.extracter.TikaAuto.extract.namespace.prefix.my=http://DummyMappingMetadataExtracter
 | |
|      *     metadata.extracter.TikaAuto.extract.namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
 | |
|      *     metadata.extracter.TikaAuto.extract.dc\:description=cm:description, my:customDescription
 | |
|      * 
 | |
|      */
 | |
|     private Map<Object, Object> getRelevantGlobalProperties(String propertyComponent)
 | |
|     {
 | |
|         if (applicationContext == null)
 | |
|         {
 | |
|             logger.info("ApplicationContext not set");
 | |
|             return null;
 | |
|         }
 | |
|         Properties globalProperties = (Properties) applicationContext.getBean("global-properties");
 | |
|         if (globalProperties == null)
 | |
|         {
 | |
|             logger.info("Could not get global-properties");
 | |
|             return null;
 | |
|         }
 | |
|         Map<Object, Object> relevantGlobalPropertiesMap = 
 | |
|                 new HashMap<Object, Object>();
 | |
|         String propertyPrefix = PROPERTY_PREFIX_METADATA + beanName + propertyComponent;
 | |
|         for (Entry<Object, Object> globalEntry : globalProperties.entrySet())
 | |
|         {
 | |
|             if (((String) globalEntry.getKey()).startsWith(propertyPrefix))
 | |
|             {
 | |
|                 relevantGlobalPropertiesMap.put(
 | |
|                         ((String) globalEntry.getKey()).replace(propertyPrefix, ""),
 | |
|                         globalEntry.getValue());
 | |
|             }
 | |
|         }
 | |
|         return relevantGlobalPropertiesMap;
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * A utility method to convert global properties to the Map form for the given
 | |
|      * propertyComponent.
 | |
|      * <p>
 | |
|      * Mappings can be specified using the same method defined for
 | |
|      * normal mapping properties files but with a prefix of
 | |
|      * <code>metadata.extracter</code>, the extracter bean name, and the extract component.
 | |
|      * For example:
 | |
|      * 
 | |
|      *     metadata.extracter.TikaAuto.extract.namespace.prefix.my=http://DummyMappingMetadataExtracter
 | |
|      *     metadata.extracter.TikaAuto.extract.namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
 | |
|      *     metadata.extracter.TikaAuto.extract.dc\:description=cm:description, my:customDescription
 | |
|      * 
 | |
|      */
 | |
|     protected Map<String, Set<QName>> readGlobalExtractMappingProperties()
 | |
|     {
 | |
|         Map<Object, Object> relevantGlobalPropertiesMap = getRelevantGlobalProperties(PROPERTY_COMPONENT_EXTRACT);
 | |
|         if (relevantGlobalPropertiesMap == null)
 | |
|         {
 | |
|             return null;
 | |
|         }
 | |
|         return readMappingProperties(relevantGlobalPropertiesMap.entrySet());
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * A utility method to convert mapping properties to the Map form.
 | |
|      * 
 | |
|      * @see #setMappingProperties(Properties)
 | |
|      */
 | |
|     protected Map<String, Set<QName>> readMappingProperties(Properties mappingProperties)
 | |
|     {
 | |
|         return readMappingProperties(mappingProperties.entrySet());
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * A utility method to convert mapping properties entries to the Map form.
 | |
|      * 
 | |
|      * @see #setMappingProperties(Properties)
 | |
|      */
 | |
|     private Map<String, Set<QName>> readMappingProperties(Set<Entry<Object, Object>> mappingPropertiesEntries)
 | |
|     {
 | |
|         Map<String, String> namespacesByPrefix = new HashMap<String, String>(5);
 | |
|         // Get the namespaces
 | |
|         for (Map.Entry<Object, Object> entry : mappingPropertiesEntries)
 | |
|         {
 | |
|             String propertyName = (String) entry.getKey();
 | |
|             if (propertyName.startsWith(NAMESPACE_PROPERTY_PREFIX))
 | |
|             {
 | |
|                 String prefix = propertyName.substring(17);
 | |
|                 String namespace = (String) entry.getValue();
 | |
|                 namespacesByPrefix.put(prefix, namespace);
 | |
|             }
 | |
|         }
 | |
|         // Create the mapping
 | |
|         Map<String, Set<QName>> convertedMapping = new HashMap<String, Set<QName>>(17);
 | |
|         for (Map.Entry<Object, Object> entry : mappingPropertiesEntries)
 | |
|         {
 | |
|             String documentProperty = (String) entry.getKey();
 | |
|             String qnamesStr = (String) entry.getValue();
 | |
|             if (documentProperty.startsWith(NAMESPACE_PROPERTY_PREFIX))
 | |
|             {
 | |
|                 // Ignore these now
 | |
|                 continue;
 | |
|             }
 | |
|             // Create the entry
 | |
|             Set<QName> qnames = new HashSet<QName>(3);
 | |
|             convertedMapping.put(documentProperty, qnames);
 | |
|             // The to value can be a list of QNames
 | |
|             StringTokenizer tokenizer = new StringTokenizer(qnamesStr, ",");
 | |
|             while (tokenizer.hasMoreTokens())
 | |
|             {
 | |
|                 String qnameStr = tokenizer.nextToken().trim();
 | |
|                 // Check if we need to resolve a namespace reference
 | |
|                 int index = qnameStr.indexOf(QName.NAMESPACE_PREFIX);
 | |
|                 if (index > -1 && qnameStr.charAt(0) != QName.NAMESPACE_BEGIN)
 | |
|                 {
 | |
|                     String prefix = qnameStr.substring(0, index);
 | |
|                     String suffix = qnameStr.substring(index + 1);
 | |
|                     // It is prefixed
 | |
|                     String uri = namespacesByPrefix.get(prefix);
 | |
|                     if (uri == null)
 | |
|                     {
 | |
|                         throw new AlfrescoRuntimeException(
 | |
|                                 "No prefix mapping for extracter property mapping: \n" +
 | |
|                                 "   Extracter: " + this + "\n" +
 | |
|                                 "   Mapping: " + entry);
 | |
|                     }
 | |
|                     qnameStr = QName.NAMESPACE_BEGIN + uri + QName.NAMESPACE_END + suffix;
 | |
|                 }
 | |
|                 try
 | |
|                 {
 | |
|                     QName qname = QName.createQName(qnameStr);
 | |
|                     // Add it to the mapping
 | |
|                     qnames.add(qname);
 | |
|                 }
 | |
|                 catch (InvalidQNameException e)
 | |
|                 {
 | |
|                     throw new AlfrescoRuntimeException(
 | |
|                             "Can't create metadata extracter property mapping: \n" +
 | |
|                             "   Extracter: " + this + "\n" +
 | |
|                             "   Mapping: " + entry);
 | |
|                 }
 | |
|             }
 | |
|             if (logger.isTraceEnabled())
 | |
|             {
 | |
|                 logger.trace("Added mapping from " + documentProperty + " to " + qnames);
 | |
|             }
 | |
|         }
 | |
|         // Done
 | |
|         return convertedMapping;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * A utility method to read embed mapping properties from a resource file and convert to the map form.
 | |
|      *
 | |
|      * @param propertiesUrl     A standard Properties file URL location
 | |
|      *
 | |
|      * @see #setEmbedMappingProperties(Properties)
 | |
|      */
 | |
|     protected Map<QName, Set<String>> readEmbedMappingProperties(String propertiesUrl)
 | |
|     {
 | |
|         InputStream is = null;
 | |
|         try
 | |
|         {
 | |
|             is = getClass().getClassLoader().getResourceAsStream(propertiesUrl);
 | |
|             if(is == null)
 | |
|             {
 | |
|                 return null;
 | |
|             }
 | |
|             Properties props = new Properties();
 | |
|             props.load(is);
 | |
|             // Process it
 | |
|             Map<QName, Set<String>> map = readEmbedMappingProperties(props);
 | |
|             // Done
 | |
|             if (logger.isDebugEnabled())
 | |
|             {
 | |
|                 logger.debug("Loaded embed mapping properties from resource: " + propertiesUrl);
 | |
|             }
 | |
|             return map;
 | |
|         }
 | |
|         catch (Throwable e)
 | |
|         {
 | |
|             throw new AlfrescoRuntimeException(
 | |
|                     "Unable to load properties file to read extracter embed mapping properties: \n" +
 | |
|                     "   Extracter:  " + this + "\n" +
 | |
|                     "   Bundle:     " + propertiesUrl,
 | |
|                     e);
 | |
|         }
 | |
|         finally
 | |
|         {
 | |
|             if (is != null)
 | |
|             {
 | |
|                 try { is.close(); } catch (Throwable e) {}
 | |
|             }
 | |
|         }
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * A utility method to convert global mapping properties to the Map form.
 | |
|      * <p>
 | |
|      * Different from readGlobalExtractMappingProperties in that keys are the Alfresco QNames
 | |
|      * and values are file metadata properties.
 | |
|      * <p>
 | |
|      * Mappings can be specified using the same method defined for
 | |
|      * normal embed mapping properties files but with a prefix of
 | |
|      * <code>metadata.extracter</code>, the extracter bean name, and the embed component.
 | |
|      * For example:
 | |
|      * 
 | |
|      *     metadata.extracter.TikaAuto.embed.namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
 | |
|      *     metadata.extracter.TikaAuto.embed.cm\:description=description
 | |
|      *
 | |
|      * @see #setMappingProperties(Properties)
 | |
|      */
 | |
|     protected Map<QName, Set<String>> readGlobalEmbedMappingProperties()
 | |
|     {
 | |
|         Map<Object, Object> relevantGlobalPropertiesMap = getRelevantGlobalProperties(PROPERTY_COMPONENT_EMBED);
 | |
|         if (relevantGlobalPropertiesMap == null)
 | |
|         {
 | |
|             return null;
 | |
|         }
 | |
|         return readEmbedMappingProperties(relevantGlobalPropertiesMap.entrySet());
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * A utility method to convert mapping properties to the Map form.
 | |
|      * <p>
 | |
|      * Different from readMappingProperties in that keys are the Alfresco QNames
 | |
|      * and values are file metadata properties.
 | |
|      *
 | |
|      * @see #setMappingProperties(Properties)
 | |
|      */
 | |
|     protected Map<QName, Set<String>> readEmbedMappingProperties(Properties mappingProperties)
 | |
|     {
 | |
|         return readEmbedMappingProperties(mappingProperties.entrySet());
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * A utility method to convert mapping properties entries to the Map form.
 | |
|      * <p>
 | |
|      * Different from readMappingProperties in that keys are the Alfresco QNames
 | |
|      * and values are file metadata properties.
 | |
|      *
 | |
|      * @see #setMappingProperties(Properties)
 | |
|      */
 | |
|     private Map<QName, Set<String>> readEmbedMappingProperties(Set<Entry<Object, Object>> mappingPropertiesEntries)
 | |
|     {
 | |
|         Map<String, String> namespacesByPrefix = new HashMap<String, String>(5);
 | |
|         // Get the namespaces
 | |
|         for (Map.Entry<Object, Object> entry : mappingPropertiesEntries)
 | |
|         {
 | |
|             String propertyName = (String) entry.getKey();
 | |
|             if (propertyName.startsWith(NAMESPACE_PROPERTY_PREFIX))
 | |
|             {
 | |
|                 String prefix = propertyName.substring(17);
 | |
|                 String namespace = (String) entry.getValue();
 | |
|                 namespacesByPrefix.put(prefix, namespace);
 | |
|             }
 | |
|         }
 | |
|         // Create the mapping
 | |
|         Map<QName, Set<String>> convertedMapping = new HashMap<QName, Set<String>>(17);
 | |
|         for (Map.Entry<Object, Object> entry : mappingPropertiesEntries)
 | |
|         {
 | |
|             String modelProperty = (String) entry.getKey();
 | |
|             String metadataKeysString = (String) entry.getValue();
 | |
|             if (modelProperty.startsWith(NAMESPACE_PROPERTY_PREFIX))
 | |
|             {
 | |
|                 // Ignore these now
 | |
|                 continue;
 | |
|             }
 | |
| 
 | |
|                 int index = modelProperty.indexOf(QName.NAMESPACE_PREFIX);
 | |
|                 if (index > -1 && modelProperty.charAt(0) != QName.NAMESPACE_BEGIN)
 | |
|                 {
 | |
|                     String prefix = modelProperty.substring(0, index);
 | |
|                     String suffix = modelProperty.substring(index + 1);
 | |
|                     // It is prefixed
 | |
|                     String uri = namespacesByPrefix.get(prefix);
 | |
|                     if (uri == null)
 | |
|                     {
 | |
|                         throw new AlfrescoRuntimeException(
 | |
|                                 "No prefix mapping for embed property mapping: \n" +
 | |
|                                 "   Extracter: " + this + "\n" +
 | |
|                                 "   Mapping: " + entry);
 | |
|                     }
 | |
|                     modelProperty = QName.NAMESPACE_BEGIN + uri + QName.NAMESPACE_END + suffix;
 | |
|                 }
 | |
|                 try
 | |
|                 {
 | |
|                     QName qname = QName.createQName(modelProperty);
 | |
|                     String[] metadataKeysArray = metadataKeysString.split(",");
 | |
|                     Set<String> metadataKeys = new HashSet<String>(metadataKeysArray.length);
 | |
|                     for (String metadataKey : metadataKeysArray) {
 | |
|                         metadataKeys.add(metadataKey.trim());
 | |
|                     }
 | |
|                     // Create the entry
 | |
|                     convertedMapping.put(qname, metadataKeys);
 | |
|                 }
 | |
|                 catch (InvalidQNameException e)
 | |
|                 {
 | |
|                     throw new AlfrescoRuntimeException(
 | |
|                             "Can't create metadata embedding property mapping: \n" +
 | |
|                             "   Extracter: " + this + "\n" +
 | |
|                             "   Mapping: " + entry);
 | |
|                 }
 | |
|             if (logger.isTraceEnabled())
 | |
|             {
 | |
|                 logger.trace("Added mapping from " + modelProperty + " to " + metadataKeysString);
 | |
|             }
 | |
|         }
 | |
|         // Done
 | |
|         return convertedMapping;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Registers this instance of the extracter with the registry.  This will call the
 | |
|      * {@link #init()} method and then register if the registry is available.
 | |
|      * 
 | |
|      * @see #setRegistry(MetadataExtracterRegistry)
 | |
|      * @see #init()
 | |
|      */
 | |
|     public final void register()
 | |
|     {
 | |
|         init();
 | |
|         
 | |
|         // Register the extracter, if necessary
 | |
|         if (registry != null)
 | |
|         {
 | |
|             registry.register(this);
 | |
|         }
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * Provides a hook point for implementations to perform initialization.  The base
 | |
|      * implementation must be invoked or the extracter will fail during extraction.
 | |
|      * The {@link #getDefaultMapping() default mappings} will be requested during
 | |
|      * initialization.
 | |
|      */
 | |
|     protected void init()
 | |
|     {
 | |
|         Map<String, Set<QName>> defaultMapping = getDefaultMapping();
 | |
|         if (defaultMapping == null)
 | |
|         {
 | |
|             throw new AlfrescoRuntimeException("The metadata extracter must provide a default mapping: " + this);
 | |
|         }
 | |
|         
 | |
|         // Was a mapping explicitly provided
 | |
|         if (mapping == null)
 | |
|         {
 | |
|             // No mapping, so use the default
 | |
|             mapping = defaultMapping;
 | |
|         }
 | |
|         else if (inheritDefaultMapping)
 | |
|         {
 | |
|             // Merge the default mapping into the configured mapping
 | |
|             for (String documentKey : defaultMapping.keySet())
 | |
|             {
 | |
|                 Set<QName> systemQNames = mapping.get(documentKey);
 | |
|                 if (systemQNames == null)
 | |
|                 {
 | |
|                     systemQNames = new HashSet<QName>(3);
 | |
|                     mapping.put(documentKey, systemQNames);
 | |
|                     Set<QName> defaultQNames = defaultMapping.get(documentKey);
 | |
|                     systemQNames.addAll(defaultQNames);
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|         
 | |
|         // Override with any extract mappings specified in global properties
 | |
|         Map<String, Set<QName>> globalExtractMapping = readGlobalExtractMappingProperties();
 | |
|         if (globalExtractMapping != null && globalExtractMapping.size() > 0)
 | |
|         {
 | |
|             for (String documentKey : globalExtractMapping.keySet())
 | |
|             {
 | |
|                 mapping.put(documentKey, globalExtractMapping.get(documentKey));
 | |
|             }
 | |
|         }
 | |
|         
 | |
|         // The configured mappings are empty, but there were default mappings
 | |
|         if (mapping.size() == 0 && defaultMapping.size() > 0)
 | |
|         {
 | |
|             logger.warn(
 | |
|                     "There are no property mappings for the metadata extracter.\n" +
 | |
|                     "  Nothing will be extracted by: " + this);
 | |
|         }
 | |
| 
 | |
|         if (executorService == null)
 | |
|         {
 | |
|             executorService = Executors.newCachedThreadPool();
 | |
|         }
 | |
|         
 | |
|         if (mimetypeLimits == null)
 | |
|         {
 | |
|             if (properties != null)
 | |
|             {
 | |
|                 String property = properties.getProperty(PROP_DEFAULT_TIMEOUT);
 | |
|                 if (property != null)
 | |
|                 {
 | |
|                     Long value = Long.parseLong(property);
 | |
|                     if (value != null)
 | |
|                     {
 | |
|                         MetadataExtracterLimits limits = new MetadataExtracterLimits();
 | |
|                         limits.setTimeoutMs(value);
 | |
|                         mimetypeLimits = new HashMap<String, MetadataExtracterLimits>(1);
 | |
|                         mimetypeLimits.put("*", limits);
 | |
|                     }
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         Map<QName, Set<String>> defaultEmbedMapping = getDefaultEmbedMapping();
 | |
| 
 | |
|         // Was a mapping explicitly provided
 | |
|         if (embedMapping == null)
 | |
|         {
 | |
|             // No mapping, so use the default
 | |
|             embedMapping = defaultEmbedMapping;
 | |
|         }
 | |
|         
 | |
|         else if (inheritDefaultEmbedMapping)
 | |
|         {
 | |
|             // Merge the default mapping into the configured mapping
 | |
|             for (QName modelProperty : defaultEmbedMapping.keySet())
 | |
|             {
 | |
|                 Set<String> metadataKeys = embedMapping.get(modelProperty);
 | |
|                 if (metadataKeys == null)
 | |
|                 {
 | |
|                     metadataKeys = new HashSet<String>(3);
 | |
|                     embedMapping.put(modelProperty, metadataKeys);
 | |
|                     Set<String> defaultMetadataKeys = defaultEmbedMapping.get(modelProperty);
 | |
|                     metadataKeys.addAll(defaultMetadataKeys);
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|         
 | |
|         // Override with any embed mappings specified in global properties
 | |
|         Map<QName, Set<String>> globalEmbedMapping = readGlobalEmbedMappingProperties();
 | |
|         if (globalEmbedMapping != null && globalEmbedMapping.size() > 0)
 | |
|         {
 | |
|             for (QName modelProperty : globalEmbedMapping.keySet())
 | |
|             {
 | |
|                 embedMapping.put(modelProperty, globalEmbedMapping.get(modelProperty));
 | |
|             }
 | |
|         }
 | |
|         // Done
 | |
|         initialized = true;
 | |
|     }
 | |
| 
 | |
|     /** {@inheritDoc} */
 | |
|     public long getExtractionTime()
 | |
|     {
 | |
|         return 1000L;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Checks if the mimetype is supported.
 | |
|      * 
 | |
|      * @param reader the reader to check
 | |
|      * @throws AlfrescoRuntimeException if the mimetype is not supported
 | |
|      */
 | |
|     protected void checkIsSupported(ContentReader reader)
 | |
|     {
 | |
|         String mimetype = reader.getMimetype();
 | |
|         if (!isSupported(mimetype))
 | |
|         {
 | |
|             throw new AlfrescoRuntimeException(
 | |
|                     "Metadata extracter does not support mimetype: " + mimetype + "\n" +
 | |
|                     "   reader: " + reader + "\n" +
 | |
|                     "   supported: " + supportedMimetypes + "\n" +
 | |
|                     "   extracter: " + this);
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Checks if embedding for the mimetype is supported.
 | |
|      *
 | |
|      * @param writer the writer to check
 | |
|      * @throws AlfrescoRuntimeException if embedding for the mimetype is not supported
 | |
|      */
 | |
|     protected void checkIsEmbedSupported(ContentWriter writer)
 | |
|     {
 | |
|         String mimetype = writer.getMimetype();
 | |
|         if (!isEmbeddingSupported(mimetype))
 | |
|         {
 | |
|             throw new AlfrescoRuntimeException(
 | |
|                     "Metadata extracter does not support embedding mimetype: \n" +
 | |
|                     "   writer: " + writer + "\n" +
 | |
|                     "   supported: " + supportedEmbedMimetypes + "\n" +
 | |
|                     "   extracter: " + this);
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * {@inheritDoc}
 | |
|      */
 | |
|     public final Map<QName, Serializable> extract(ContentReader reader, Map<QName, Serializable> destination)
 | |
|     {
 | |
|         return extract(reader, this.overwritePolicy, destination, this.mapping);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * {@inheritDoc}
 | |
|      */
 | |
|     public final Map<QName, Serializable> extract(
 | |
|             ContentReader reader,
 | |
|             OverwritePolicy overwritePolicy,
 | |
|             Map<QName, Serializable> destination)
 | |
|     {
 | |
|         return extract(reader, overwritePolicy, destination, this.mapping);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * {@inheritDoc}
 | |
|      */
 | |
|     public Map<QName, Serializable> extract(
 | |
|             ContentReader reader,
 | |
|             OverwritePolicy overwritePolicy,
 | |
|             Map<QName, Serializable> destination,
 | |
|             Map<String, Set<QName>> mapping)
 | |
|     {
 | |
|         // Done
 | |
|         if (logger.isDebugEnabled())
 | |
|         {
 | |
|             logger.debug("Starting metadata extraction: \n" +
 | |
|                     "   reader: " + reader + "\n" +
 | |
|                     "   extracter: " + this);
 | |
|         }
 | |
| 
 | |
|         if (!initialized)
 | |
|         {
 | |
|             throw new AlfrescoRuntimeException(
 | |
|                     "Metadata extracter not initialized.\n" +
 | |
|                     "  Call the 'register' method on: " + this + "\n" +
 | |
|                     "  Implementations of the 'init' method must call the base implementation.");
 | |
|         }
 | |
|         // check the reliability
 | |
|         checkIsSupported(reader);
 | |
|         
 | |
|         Map<QName, Serializable> changedProperties = null;
 | |
|         try
 | |
|         {
 | |
|             Map<String, Serializable> rawMetadata = null;
 | |
|             // Check that the content has some meat
 | |
|             if (reader.getSize() > 0 && reader.exists())
 | |
|             {
 | |
|                 rawMetadata = extractRaw(reader, getLimits(reader.getMimetype()));
 | |
|             }
 | |
|             else
 | |
|             {
 | |
|                 rawMetadata = new HashMap<String, Serializable>(1);
 | |
|             }
 | |
|             // Convert to system properties (standalone)
 | |
|             Map<QName, Serializable> systemProperties = mapRawToSystem(rawMetadata);
 | |
|             // Convert the properties according to the dictionary types
 | |
|             systemProperties = convertSystemPropertyValues(systemProperties);
 | |
|             // Last chance to filter the system properties map before applying them            
 | |
|             filterSystemProperties(systemProperties, destination);            
 | |
|             // Now use the proper overwrite policy
 | |
|             changedProperties = overwritePolicy.applyProperties(systemProperties, destination);
 | |
|             
 | |
|             if(logger.isDebugEnabled())
 | |
|             {
 | |
|                logger.debug("Extracted Metadata from " + reader + "\n  Found: " +
 | |
|                             rawMetadata + "\n  Mapped and Accepted: " + changedProperties);
 | |
|             }
 | |
|         }
 | |
|         catch (Throwable e)
 | |
|         {
 | |
|             // Ask Tika to detect the document, and report back on if
 | |
|             //  the current mime type is plausible
 | |
|             String typeErrorMessage = null;
 | |
|             String differentType = null;
 | |
|             if(mimetypeService != null)
 | |
|             {
 | |
|                differentType = mimetypeService.getMimetypeIfNotMatches(reader.getReader());
 | |
|             }
 | |
|             else
 | |
|             {
 | |
|                logger.info("Unable to verify mimetype of " + reader.getReader() + 
 | |
|                            " as no MimetypeService available to " + getClass().getName());
 | |
|             }
 | |
|             if(differentType != null)
 | |
|             {
 | |
|                typeErrorMessage = "\n" +
 | |
|                   "   claimed mime type: " + reader.getMimetype() + "\n" +
 | |
|                   "   detected mime type: " + differentType;
 | |
|             }
 | |
|            
 | |
|             if (logger.isDebugEnabled())
 | |
|             {
 | |
|                 logger.debug(
 | |
|                         "Metadata extraction failed: \n" +
 | |
|                         "   Extracter: " + this + "\n" +
 | |
|                         "   Content:   " + reader +
 | |
|                         typeErrorMessage,
 | |
|                         e);
 | |
|             }
 | |
|             else
 | |
|             {
 | |
|                 logger.warn(
 | |
|                         "Metadata extraction failed (turn on DEBUG for full error): \n" +
 | |
|                         "   Extracter: " + this + "\n" +
 | |
|                         "   Content:   " + reader + "\n" +
 | |
|                         "   Failure:   " + e.getMessage() +
 | |
|                         typeErrorMessage);
 | |
|             }
 | |
|         }
 | |
|         finally
 | |
|         {
 | |
|             // check that the reader was closed (if used)
 | |
|             if (reader.isChannelOpen())
 | |
|             {
 | |
|                 logger.error("Content reader not closed by metadata extracter: \n" +
 | |
|                         "   reader: " + reader + "\n" +
 | |
|                         "   extracter: " + this);
 | |
|             }
 | |
|             // Make sure that we have something to return
 | |
|             if (changedProperties == null)
 | |
|             {
 | |
|                 changedProperties = new HashMap<QName, Serializable>(0);
 | |
|             }
 | |
|         }
 | |
|         
 | |
|         // Done
 | |
|         if (logger.isDebugEnabled())
 | |
|         {
 | |
|             logger.debug("Completed metadata extraction: \n" +
 | |
|                     "   reader:    " + reader + "\n" +
 | |
|                     "   extracter: " + this + "\n" +
 | |
|                     "   changed:   " + changedProperties);
 | |
|         }
 | |
|         return changedProperties;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * {@inheritDoc}
 | |
|      */
 | |
|     public final void embed(
 | |
|             Map<QName, Serializable> properties,
 | |
|             ContentReader reader,
 | |
|             ContentWriter writer)
 | |
|     {
 | |
|         // Done
 | |
|         if (logger.isDebugEnabled())
 | |
|         {
 | |
|             logger.debug("Starting metadata embedding: \n" +
 | |
|                     "   reader: " + reader + "\n" +
 | |
|                     "   writer: " + writer + "\n" +
 | |
|                     "   extracter: " + this);
 | |
|         }
 | |
| 
 | |
|         if (!initialized)
 | |
|         {
 | |
|             throw new AlfrescoRuntimeException(
 | |
|                     "Metadata extracter not initialized.\n" +
 | |
|                     "  Call the 'register' method on: " + this + "\n" +
 | |
|                     "  Implementations of the 'init' method must call the base implementation.");
 | |
|         }
 | |
|         // check the reliability
 | |
|         checkIsEmbedSupported(writer);
 | |
| 
 | |
|         try
 | |
|         {
 | |
|             embedInternal(mapSystemToRaw(properties), reader, writer);
 | |
|             if(logger.isDebugEnabled())
 | |
|             {
 | |
|                logger.debug("Embedded Metadata into " + writer);
 | |
|             }
 | |
|         }
 | |
|         catch (Throwable e)
 | |
|         {
 | |
|             // Ask Tika to detect the document, and report back on if
 | |
|             //  the current mime type is plausible
 | |
|             String typeErrorMessage = "";
 | |
|             String differentType = null;
 | |
|             if(mimetypeService != null)
 | |
|             {
 | |
|                try
 | |
|                {
 | |
|                    differentType = mimetypeService.getMimetypeIfNotMatches(writer.getReader());
 | |
|                }
 | |
|                catch (ContentIOException cioe)
 | |
|                {
 | |
|                    // Embedding failed and writer is empty
 | |
|                }
 | |
|             }
 | |
|             else
 | |
|             {
 | |
|                logger.info("Unable to verify mimetype of " + writer.getReader() +
 | |
|                            " as no MimetypeService available to " + getClass().getName());
 | |
|             }
 | |
|             if(differentType != null)
 | |
|             {
 | |
|                typeErrorMessage = "\n" +
 | |
|                   "   claimed mime type: " + writer.getMimetype() + "\n" +
 | |
|                   "   detected mime type: " + differentType;
 | |
|             }
 | |
| 
 | |
|             if (logger.isDebugEnabled())
 | |
|             {
 | |
|                 logger.debug(
 | |
|                         "Metadata embedding failed: \n" +
 | |
|                         "   Extracter: " + this + "\n" +
 | |
|                         "   Content:   " + writer +
 | |
|                         typeErrorMessage,
 | |
|                         e);
 | |
|             }
 | |
|             else
 | |
|             {
 | |
|                 logger.error(
 | |
|                         "Metadata embedding failed (turn on DEBUG for full error): \n" +
 | |
|                         "   Extracter: " + this + "\n" +
 | |
|                         "   Content:   " + writer + "\n" +
 | |
|                         "   Failure:   " + e.getMessage() +
 | |
|                         typeErrorMessage);
 | |
|             }
 | |
|         }
 | |
|         finally
 | |
|         {
 | |
|             // check that the writer was closed (if used)
 | |
|             if (writer.isChannelOpen())
 | |
|             {
 | |
|                 logger.error("Content writer not closed by metadata extracter: \n" +
 | |
|                         "   writer: " + writer + "\n" +
 | |
|                         "   extracter: " + this);
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         // Done
 | |
|         if (logger.isDebugEnabled())
 | |
|         {
 | |
|             logger.debug("Completed metadata embedding: \n" +
 | |
|                     "   writer:    " + writer + "\n" +
 | |
|                     "   extracter: " + this);
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * 
 | |
|      * @param rawMetadata   Metadata keyed by document properties
 | |
|      * @return              Returns the metadata keyed by the system properties
 | |
|      */
 | |
|     private Map<QName, Serializable> mapRawToSystem(Map<String, Serializable> rawMetadata)
 | |
|     {
 | |
|         Map<QName, Serializable> systemProperties = new HashMap<QName, Serializable>(rawMetadata.size() * 2 + 1);
 | |
|         for (Map.Entry<String, Serializable> entry : rawMetadata.entrySet())
 | |
|         {
 | |
|             String documentKey = entry.getKey();
 | |
|             // Check if there is a mapping for this
 | |
|             if (!mapping.containsKey(documentKey))
 | |
|             {
 | |
|                 // No mapping - ignore
 | |
|                 continue;
 | |
|             }
 | |
|             Serializable documentValue = entry.getValue();
 | |
|             Set<QName> systemQNames = mapping.get(documentKey);
 | |
|             for (QName systemQName : systemQNames)
 | |
|             {
 | |
|                 systemProperties.put(systemQName, documentValue);                
 | |
|             }
 | |
|         }
 | |
|         // Done
 | |
|         if (logger.isDebugEnabled())
 | |
|         {
 | |
|             logger.debug(
 | |
|                     "Converted extracted raw values to system values: \n" +
 | |
|                     "   Raw Properties:    " + rawMetadata + "\n" +
 | |
|                     "   System Properties: " + systemProperties);
 | |
|         }
 | |
|         return systemProperties;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      *
 | |
|      * @param systemMetadata   Metadata keyed by system properties
 | |
|      * @return              Returns the metadata keyed by the content file metadata properties
 | |
|      */
 | |
|     private Map<String, Serializable> mapSystemToRaw(Map<QName, Serializable> systemMetadata)
 | |
|     {
 | |
|         Map<String, Serializable> metadataProperties = new HashMap<String, Serializable>(systemMetadata.size() * 2 + 1);
 | |
|         for (Map.Entry<QName, Serializable> entry : systemMetadata.entrySet())
 | |
|         {
 | |
|             QName modelProperty = entry.getKey();
 | |
|             // Check if there is a mapping for this
 | |
|             if (!embedMapping.containsKey(modelProperty))
 | |
|             {
 | |
|                 // No mapping - ignore
 | |
|                 continue;
 | |
|             }
 | |
|             Serializable documentValue = entry.getValue();
 | |
|             Set<String> metadataKeys = embedMapping.get(modelProperty);
 | |
|             for (String metadataKey : metadataKeys)
 | |
|             {
 | |
|                 metadataProperties.put(metadataKey, documentValue);
 | |
|             }
 | |
|         }
 | |
|         // Done
 | |
|         if (logger.isDebugEnabled())
 | |
|         {
 | |
|             logger.debug(
 | |
|                     "Converted system model values to metadata values: \n" +
 | |
|                     "   System Properties:    " + systemMetadata + "\n" +
 | |
|                     "   Metadata Properties: " + metadataProperties);
 | |
|         }
 | |
|         return metadataProperties;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Filters the system properties that are going to be applied.  Gives the metadata extracter an 
 | |
|      * opportunity to remove properties that may not be appropriate in a given context.
 | |
|      * 
 | |
|      * @param systemProperties  map of system properties to be applied
 | |
|      * @param targetProperties  map of target properties, may be used to provide to the context requried
 | |
|      */
 | |
|     protected void filterSystemProperties(Map<QName, Serializable> systemProperties, Map<QName, Serializable> targetProperties)
 | |
|     {
 | |
|         // Default implementation does nothing
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * Converts all values according to their dictionary-defined type.  This uses the
 | |
|      * {@link #setFailOnTypeConversion(boolean) failOnTypeConversion flag} to determine how failures
 | |
|      * are handled i.e. if values fail to convert, the process may discard the property.
 | |
|      * 
 | |
|      * @param systemProperties  the values keyed to system property names
 | |
|      * @return                  Returns a modified map of properties that have been converted.
 | |
|      */
 | |
|     @SuppressWarnings("unchecked")
 | |
|     private Map<QName, Serializable> convertSystemPropertyValues(Map<QName, Serializable> systemProperties)
 | |
|     {
 | |
|         Map<QName, Serializable> convertedProperties = new HashMap<QName, Serializable>(systemProperties.size() + 7);
 | |
|         for (Map.Entry<QName, Serializable> entry : systemProperties.entrySet())
 | |
|         {
 | |
|             QName propertyQName = entry.getKey();
 | |
|             Serializable propertyValue = entry.getValue();
 | |
|             // Get the property definition
 | |
|             PropertyDefinition propertyDef = (dictionaryService == null) ? null : dictionaryService.getProperty(propertyQName);
 | |
|             if (propertyDef == null)
 | |
|             {
 | |
|                 // There is nothing in the DD about this so just transfer it
 | |
|                 convertedProperties.put(propertyQName, propertyValue);
 | |
|                 continue;
 | |
|             }
 | |
|             // It is in the DD, so attempt the conversion
 | |
|             DataTypeDefinition propertyTypeDef = propertyDef.getDataType();
 | |
|             Serializable convertedPropertyValue = null;
 | |
|             
 | |
|             try
 | |
|             {
 | |
|                 // Attempt to make any date conversions
 | |
|                 if (propertyTypeDef.getName().equals(DataTypeDefinition.DATE) || propertyTypeDef.getName().equals(DataTypeDefinition.DATETIME))
 | |
|                 {
 | |
|                     if (propertyValue instanceof Date)
 | |
|                     {
 | |
|                         convertedPropertyValue = propertyValue;
 | |
|                     }
 | |
|                     else if (propertyValue instanceof Collection)
 | |
|                     {
 | |
|                         convertedPropertyValue = (Serializable) makeDates((Collection<String>) propertyValue);
 | |
|                     }
 | |
|                     else if (propertyValue instanceof String)
 | |
|                     {
 | |
|                         convertedPropertyValue = makeDate((String) propertyValue);
 | |
|                     }
 | |
|                     else if (propertyValue == null)
 | |
|                     {
 | |
|                         convertedPropertyValue = null;
 | |
|                     }
 | |
|                     else
 | |
|                     {
 | |
|                         if (logger.isWarnEnabled())
 | |
|                         {
 | |
|                             StringBuilder mesg = new StringBuilder();
 | |
|                             mesg.append("Unable to convert Date property: ").append(propertyQName)
 | |
|                                 .append(", value: ").append(propertyValue).append(", type: ").append(propertyTypeDef.getName());
 | |
|                             logger.warn(mesg.toString());
 | |
|                         }
 | |
|                     }
 | |
|                 }
 | |
|                 else
 | |
|                 {
 | |
|                     if (propertyValue instanceof Collection)
 | |
|                     {
 | |
|                         convertedPropertyValue = (Serializable) DefaultTypeConverter.INSTANCE.convert(
 | |
|                                 propertyTypeDef,
 | |
|                                 (Collection<?>) propertyValue);
 | |
|                     }
 | |
|                     else if (propertyValue instanceof Object[])
 | |
|                     {
 | |
|                        convertedPropertyValue = (Serializable) DefaultTypeConverter.INSTANCE.convert(
 | |
|                              propertyTypeDef,
 | |
|                              (Object[]) propertyValue);
 | |
|                     }
 | |
|                     else
 | |
|                     {
 | |
|                         convertedPropertyValue = (Serializable) DefaultTypeConverter.INSTANCE.convert(
 | |
|                                 propertyTypeDef,
 | |
|                                 propertyValue);
 | |
|                     }
 | |
|                 }
 | |
|                 convertedProperties.put(propertyQName, convertedPropertyValue);
 | |
|             }
 | |
|             catch (TypeConversionException e)
 | |
|             {
 | |
|                 logger.warn(
 | |
|                         "Type conversion failed during metadata extraction: \n" + 
 | |
|                         "   Failure:   " + e.getMessage() + "\n" +
 | |
|                         "   Type:      " + propertyTypeDef + "\n" +
 | |
|                         "   Value:     " + propertyValue);
 | |
|                 // Do we just absorb this or is it a problem?
 | |
|                 if (failOnTypeConversion)
 | |
|                 {
 | |
|                     throw AlfrescoRuntimeException.create(
 | |
|                             e,
 | |
|                             ERR_TYPE_CONVERSION,
 | |
|                             this,
 | |
|                             propertyQName,
 | |
|                             propertyTypeDef.getName(),
 | |
|                             propertyValue);
 | |
|                 }
 | |
|             }
 | |
|             catch (MalformedNodeRefException e)
 | |
|             {
 | |
|                 if (propertyQName.equals(ContentModel.PROP_TAGS))
 | |
|                 {
 | |
|                     if (enableStringTagging)
 | |
|                     {
 | |
|                         // We must want to map tag string values instead of nodeRefs
 | |
|                         // ContentMetadataExtracter will take care of tagging by string
 | |
|                         ArrayList<Object> list = new ArrayList<Object>(1);
 | |
|                         if (propertyValue instanceof Object[])
 | |
|                         {
 | |
|                             for (Object value : (Object[]) propertyValue)
 | |
|                             {
 | |
|                                 list.add(value);
 | |
|                             }
 | |
|                         }
 | |
|                         else
 | |
|                         {
 | |
|                             list.add(propertyValue);
 | |
|                         }
 | |
|                         convertedProperties.put(propertyQName, list);
 | |
|                     }
 | |
|                     else
 | |
|                     {
 | |
|                         if (logger.isInfoEnabled())
 | |
|                         {
 | |
|                             logger.info("enableStringTagging is false and could not convert " + 
 | |
|                                     propertyQName.toString() + ": " + e.getMessage());
 | |
|                         }
 | |
|                     }
 | |
|                 }
 | |
|                 else
 | |
|                 {
 | |
|                     if (failOnTypeConversion)
 | |
|                     {
 | |
|                         throw e;
 | |
|                     }
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|         // Done
 | |
|         return convertedProperties;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Convert a collection of date <tt>String</tt> to <tt>Date</tt> objects
 | |
|      */
 | |
|     private Collection<Date> makeDates(Collection<String> dateStrs)
 | |
|     {
 | |
|         List<Date> dates = new ArrayList<Date>(dateStrs.size());
 | |
|         for (String dateStr : dateStrs)
 | |
|         {
 | |
|             Date date = makeDate(dateStr);
 | |
|             dates.add(date);
 | |
|         }
 | |
|         return dates;
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * Convert a date <tt>String</tt> to a <tt>Date</tt> object
 | |
|      */
 | |
|     protected Date makeDate(String dateStr)
 | |
|     {
 | |
|         if (dateStr == null || dateStr.length() == 0)
 | |
|         {
 | |
|             return null;
 | |
|         }
 | |
|         
 | |
|         Date date = null;
 | |
|         try
 | |
|         {
 | |
|             date = DefaultTypeConverter.INSTANCE.convert(Date.class, dateStr);
 | |
|         }
 | |
|         catch (TypeConversionException e)
 | |
|         {
 | |
|             // Try one of the other formats
 | |
|             if (this.supportedDateFormatters != null)
 | |
|             {
 | |
|                 // Remove text such as " (PDT)" which cannot be parsed.
 | |
|                 String dateStr2 = (dateStr == null || dateStr.indexOf('(') == -1)
 | |
|                         ? dateStr : dateStr.replaceAll(" \\(.*\\)", "");
 | |
|                 for (DateTimeFormatter supportedDateFormatter: supportedDateFormatters)
 | |
|                 {
 | |
|                     // supported DateFormats were defined
 | |
|                     /**
 | |
|                      * Regional date format
 | |
|                      */
 | |
|                     try
 | |
|                     {
 | |
|                         DateTime dateTime = supportedDateFormatter.parseDateTime(dateStr2);
 | |
|                         if (dateTime.getCenturyOfEra() > 0)
 | |
|                         {
 | |
|                             return dateTime.toDate();
 | |
|                         }
 | |
|                     }
 | |
|                     catch (IllegalArgumentException e1)
 | |
|                     {
 | |
|                         // Didn't work
 | |
|                     }
 | |
| 
 | |
|                     /**
 | |
|                      * Date format can be locale specific - make sure English format always works
 | |
|                      */
 | |
|                     /* 
 | |
|                      * TODO MER 25 May 2010 - Added this as a quick fix for IMAP date parsing which is always 
 | |
|                      * English regardless of Locale.  Some more thought and/or code is required to configure 
 | |
|                      * the relationship between properties, format and locale.
 | |
|                      */
 | |
|                     try
 | |
|                     {
 | |
|                         DateTime dateTime = supportedDateFormatter.withLocale(Locale.US).parseDateTime(dateStr2);
 | |
|                         if (dateTime.getCenturyOfEra() > 0)
 | |
|                         {
 | |
|                             return dateTime.toDate();
 | |
|                         }
 | |
|                     }
 | |
|                     catch (IllegalArgumentException e1)
 | |
|                     {
 | |
|                         // Didn't work
 | |
|                     }
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|             if (date == null)
 | |
|             {
 | |
|                 // Still no luck
 | |
|                 throw new TypeConversionException("Unable to convert string to date: " + dateStr);
 | |
|             }
 | |
|         }
 | |
|         return date;
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * Adds a value to the map, conserving null values.  Values are converted to null if:
 | |
|      * <ul>
 | |
|      *   <li>it is an empty string value after trimming</li>
 | |
|      *   <li>it is an empty collection</li>
 | |
|      *   <li>it is an empty array</li>
 | |
|      * </ul>
 | |
|      * String values are trimmed before being put into the map.
 | |
|      * Otherwise, it is up to the extracter to ensure that the value is a <tt>Serializable</tt>.
 | |
|      * It is not appropriate to implicitly convert values in order to make them <tt>Serializable</tt>
 | |
|      * - the best conversion method will depend on the value's specific meaning.
 | |
|      * 
 | |
|      * @param key           the destination key
 | |
|      * @param value         the serializable value
 | |
|      * @param destination   the map to put values into
 | |
|      * @return              Returns <tt>true</tt> if set, otherwise <tt>false</tt>
 | |
|      */
 | |
|     protected boolean putRawValue(String key, Serializable value, Map<String, Serializable> destination)
 | |
|     {
 | |
|         if (value == null)
 | |
|         {
 | |
|             // Just keep this
 | |
|         }
 | |
|         else if (value instanceof String)
 | |
|         {
 | |
|             String valueStr = ((String) value).trim();
 | |
|             if (valueStr.length() == 0)
 | |
|             {
 | |
|                 value = null;
 | |
|             }
 | |
|             else
 | |
|             {
 | |
|                 if(valueStr.indexOf("\u0000") != -1)
 | |
|                 {
 | |
|                     valueStr = valueStr.replaceAll("\u0000", "");
 | |
|                 }
 | |
|                 // Keep the trimmed value
 | |
|                 value = valueStr;
 | |
|             }
 | |
|         }
 | |
|         else if (value instanceof Collection)
 | |
|         {
 | |
|             Collection<?> valueCollection = (Collection<?>) value;
 | |
|             if (valueCollection.isEmpty())
 | |
|             {
 | |
|                 value = null;
 | |
|             }
 | |
|         }
 | |
|         else if (value.getClass().isArray())
 | |
|         {
 | |
|             if (Array.getLength(value) == 0)
 | |
|             {
 | |
|                 value = null;
 | |
|             }
 | |
|         }
 | |
|         // It passed all the tests
 | |
|         destination.put(key, value);
 | |
|         return true;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Helper method to fetch a clean map into which raw values can be dumped.
 | |
|      * 
 | |
|      * @return          Returns an empty map
 | |
|      */
 | |
|     protected final Map<String, Serializable> newRawMap()
 | |
|     {
 | |
|         return new HashMap<String, Serializable>(17);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * This method provides a <i>best guess</i> of where to store the values extracted
 | |
|      * from the documents.  The list of properties mapped by default need <b>not</b>
 | |
|      * include all properties extracted from the document; just the obvious set of mappings
 | |
|      * need be supplied.
 | |
|      * Implementations must either provide the default mapping properties in the expected
 | |
|      * location or override the method to provide the default mapping.
 | |
|      * <p>
 | |
|      * The default implementation looks for the default mapping file in the location
 | |
|      * given by the class name and <i>.properties</i>.  If the extracter's class is
 | |
|      * <b>x.y.z.MyExtracter</b> then the default properties will be picked up at
 | |
|      * <b>classpath:/alfresco/metadata/MyExtracter.properties</b>.
 | |
|      * The previous location of <b>classpath:/x/y/z/MyExtracter.properties</b> is
 | |
|      * still supported but may be removed in a future release.
 | |
|      * Inner classes are supported, but the '$' in the class name is replaced with '-', so
 | |
|      * default properties for <b>x.y.z.MyStuff$MyExtracter</b> will be located using
 | |
|      * <b>classpath:/alfresco/metadata/MyStuff-MyExtracter.properties</b>.
 | |
|      * <p>
 | |
|      * The default mapping implementation should include thorough Javadocs so that the
 | |
|      * system administrators can accurately determine how to best enhance or override the
 | |
|      * default mapping.
 | |
|      * <p>
 | |
|      * If the default mapping is declared in a properties file other than the one named after
 | |
|      * the class, then the {@link #readMappingProperties(String)} method can be used to quickly
 | |
|      * generate the return value:
 | |
|      * <pre><code>
 | |
|      *      protected Map<<String, Set<QName>> getDefaultMapping()
 | |
|      *      {
 | |
|      *          return readMappingProperties(DEFAULT_MAPPING);
 | |
|      *      }
 | |
|      * </code></pre>
 | |
|      * The map can also be created in code either statically or during the call.
 | |
|      * 
 | |
|      * @return              Returns the default, static mapping.  It may not be null.
 | |
|      * 
 | |
|      * @see #setInheritDefaultMapping(boolean inherit)
 | |
|      */
 | |
|     protected Map<String, Set<QName>> getDefaultMapping()
 | |
|     {
 | |
|         AlfrescoRuntimeException metadataLocationReadException = null;
 | |
|         try
 | |
|         {
 | |
|             // Can't use getSimpleName here because we lose inner class $ processing
 | |
|             String className = this.getClass().getName();
 | |
|             String shortClassName = className.split("\\.")[className.split("\\.").length - 1];
 | |
|             // Replace $
 | |
|             shortClassName = shortClassName.replace('$', '-');
 | |
|             // Append .properties
 | |
|             String metadataPropertiesUrl = "alfresco/metadata/" + shortClassName + ".properties";
 | |
|             // Attempt to load the properties
 | |
|             return readMappingProperties(metadataPropertiesUrl);
 | |
|         }
 | |
|         catch (AlfrescoRuntimeException e)
 | |
|         {
 | |
|             // We'll save this to throw at someone later
 | |
|             metadataLocationReadException = e;
 | |
|         }
 | |
|         // Try package location
 | |
|         try
 | |
|         {
 | |
|             String canonicalClassName = this.getClass().getName();
 | |
|             // Replace $
 | |
|             canonicalClassName = canonicalClassName.replace('$', '-');
 | |
|             // Replace .
 | |
|             canonicalClassName = canonicalClassName.replace('.', '/');
 | |
|             // Append .properties
 | |
|             String packagePropertiesUrl = canonicalClassName + ".properties";
 | |
|             // Attempt to load the properties
 | |
|             return readMappingProperties(packagePropertiesUrl);
 | |
|         }
 | |
|         catch (AlfrescoRuntimeException e)
 | |
|         {
 | |
|             // Not found in either location, but we want to throw the error for the new metadata location
 | |
|             throw metadataLocationReadException;
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * This method provides a <i>best guess</i> of what model properties should be embedded
 | |
|      * in content.  The list of properties mapped by default need <b>not</b>
 | |
|      * include all properties to be embedded in the document; just the obvious set of mappings
 | |
|      * need be supplied.
 | |
|      * Implementations must either provide the default mapping properties in the expected
 | |
|      * location or override the method to provide the default mapping.
 | |
|      * <p>
 | |
|      * The default implementation looks for the default mapping file in the location
 | |
|      * given by the class name and <i>.embed.properties</i>.  If the extracter's class is
 | |
|      * <b>x.y.z.MyExtracter</b> then the default properties will be picked up at
 | |
|      * <b>classpath:/x/y/z/MyExtracter.embed.properties</b>.
 | |
|      * Inner classes are supported, but the '$' in the class name is replaced with '-', so
 | |
|      * default properties for <b>x.y.z.MyStuff$MyExtracter</b> will be located using
 | |
|      * <b>x.y.z.MyStuff-MyExtracter.embed.properties</b>.
 | |
|      * <p>
 | |
|      * The default mapping implementation should include thorough Javadocs so that the
 | |
|      * system administrators can accurately determine how to best enhance or override the
 | |
|      * default mapping.
 | |
|      * <p>
 | |
|      * If the default mapping is declared in a properties file other than the one named after
 | |
|      * the class, then the {@link #readEmbedMappingProperties(String)} method can be used to quickly
 | |
|      * generate the return value:
 | |
|      * <pre><code>
 | |
|      *      protected Map<<String, Set<QName>> getDefaultMapping()
 | |
|      *      {
 | |
|      *          return readEmbedMappingProperties(DEFAULT_MAPPING);
 | |
|      *      }
 | |
|      * </code></pre>
 | |
|      * The map can also be created in code either statically or during the call.
 | |
|      * <p>
 | |
|      * If no embed mapping properties file is found a reverse of the extract
 | |
|      * mapping in {@link #getDefaultMapping()} will be assumed with the first QName in each
 | |
|      * value used as the key for this mapping and a last win approach for duplicates.
 | |
|      *
 | |
|      * @return              Returns the default, static embed mapping.  It may not be null.
 | |
|      *
 | |
|      * @see #setInheritDefaultMapping(boolean inherit)
 | |
|      */
 | |
|     protected Map<QName, Set<String>> getDefaultEmbedMapping()
 | |
|     {
 | |
|         Map<QName, Set<String>> embedMapping = null;
 | |
|         String metadataPropertiesUrl = null;
 | |
|         try
 | |
|         {
 | |
|             // Can't use getSimpleName here because we lose inner class $ processing
 | |
|             String className = this.getClass().getName();
 | |
|             String shortClassName = className.split("\\.")[className.split("\\.").length - 1];
 | |
|             // Replace $
 | |
|             shortClassName = shortClassName.replace('$', '-');
 | |
|             // Append .properties
 | |
|             metadataPropertiesUrl = "alfresco/metadata/" + shortClassName + ".embed.properties";
 | |
|             // Attempt to load the properties
 | |
|             embedMapping = readEmbedMappingProperties(metadataPropertiesUrl);
 | |
|         }
 | |
|         catch (AlfrescoRuntimeException e)
 | |
|         {
 | |
|             // No embed mapping found at default location
 | |
|         }
 | |
|         // Try package location
 | |
|         try
 | |
|         {
 | |
|             String canonicalClassName = this.getClass().getName();
 | |
|             // Replace $
 | |
|             canonicalClassName = canonicalClassName.replace('$', '-');
 | |
|             // Replace .
 | |
|             canonicalClassName = canonicalClassName.replace('.', '/');
 | |
|             // Append .properties
 | |
|             String packagePropertiesUrl = canonicalClassName + ".embed.properties";
 | |
|             // Attempt to load the properties
 | |
|             embedMapping = readEmbedMappingProperties(packagePropertiesUrl);
 | |
|         }
 | |
|         catch (AlfrescoRuntimeException e)
 | |
|         {
 | |
|             // No embed mapping found at legacy location
 | |
|         }
 | |
|         if (embedMapping == null)
 | |
|         {
 | |
|             if (logger.isDebugEnabled())
 | |
|             {
 | |
|                 logger.debug("No explicit embed mapping properties found at: " + metadataPropertiesUrl + ", assuming reverse of extract mapping");
 | |
|             }
 | |
|             Map<String, Set<QName>> extractMapping = this.mapping;
 | |
|             if (extractMapping == null || extractMapping.size() == 0)
 | |
|             {
 | |
|                 extractMapping = getDefaultMapping();
 | |
|             }
 | |
|             embedMapping = new HashMap<QName, Set<String>>(extractMapping.size());
 | |
|             for (String metadataKey : extractMapping.keySet())
 | |
|             {
 | |
|                 if (extractMapping.get(metadataKey) != null && extractMapping.get(metadataKey).size() > 0)
 | |
|                 {
 | |
|                     QName modelProperty = extractMapping.get(metadataKey).iterator().next();
 | |
|                     Set<String> metadataKeys = embedMapping.get(modelProperty);
 | |
|                     if (metadataKeys == null)
 | |
|                     {
 | |
|                         metadataKeys = new HashSet<String>(1);
 | |
|                         embedMapping.put(modelProperty, metadataKeys);
 | |
|                     }
 | |
|                     metadataKeys.add(metadataKey);
 | |
|                     if (logger.isTraceEnabled())
 | |
|                     {
 | |
|                         logger.trace("Added mapping from " + modelProperty + " to " + metadataKeys.toString());
 | |
|                     }
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|         return embedMapping;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Gets the metadata extracter limits for the given mimetype.
 | |
|      * <p>
 | |
|      * A specific match for the given mimetype is tried first and
 | |
|      * if none is found a wildcard of "*" is tried.
 | |
|      * 
 | |
|      * @param mimetype String
 | |
|      * @return the found limits or null
 | |
|      */
 | |
|     protected MetadataExtracterLimits getLimits(String mimetype)
 | |
|     {
 | |
|         if (mimetypeLimits == null)
 | |
|         {
 | |
|             return null;
 | |
|         }
 | |
|         MetadataExtracterLimits limits = null;
 | |
|         limits = mimetypeLimits.get(mimetype);
 | |
|         if (limits == null)
 | |
|         {
 | |
|             limits = mimetypeLimits.get("*");
 | |
|         }
 | |
|         return limits;
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * <code>Callable</code> wrapper for the 
 | |
|      * {@link AbstractMappingMetadataExtracter#extractRaw(ContentReader)} method
 | |
|      * to handle timeouts.
 | |
|      */
 | |
|     private class ExtractRawCallable implements Callable<Map<String,Serializable>>
 | |
|     {
 | |
|         private ContentReader contentReader;
 | |
|         
 | |
|         public ExtractRawCallable(ContentReader reader)
 | |
|         {
 | |
|             this.contentReader = reader;
 | |
|         }
 | |
|         
 | |
|         @Override
 | |
|         public Map<String, Serializable> call() throws Exception
 | |
|         {
 | |
|             try
 | |
|             {
 | |
|                 return extractRaw(contentReader);
 | |
|             }
 | |
|             catch (Throwable e)
 | |
|             {
 | |
|                 throw new ExtractRawCallableException(e);
 | |
|             }
 | |
|         }
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * Exception wrapper to handle any {@link Throwable} from 
 | |
|      * {@link AbstractMappingMetadataExtracter#extractRaw(ContentReader)}
 | |
|      */
 | |
|     private class ExtractRawCallableException extends Exception
 | |
|     {
 | |
|         private static final long serialVersionUID = 1813857091767321624L;
 | |
|         public ExtractRawCallableException(Throwable cause)
 | |
|         {
 | |
|             super(cause);
 | |
|         }
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * Calls the {@link AbstractMappingMetadataExtracter#extractRaw(ContentReader)} method
 | |
|      * using the given limits.
 | |
|      * <p>
 | |
|      * Currently the only limit supported by {@link MetadataExtracterLimits} is a timeout
 | |
|      * so this method uses {@link AbstractMappingMetadataExtracter#getExecutorService()}
 | |
|      * to execute a {@link FutureTask} with any timeout defined.
 | |
|      * <p>
 | |
|      * If no timeout limit is defined or is unlimited (-1),
 | |
|      * the <code>extractRaw</code> method is called directly.
 | |
|      * 
 | |
|      * @param reader        the document to extract the values from.  This stream provided by
 | |
|      *                      the reader must be closed if accessed directly.
 | |
|      * @param limits        the limits to impose on the extraction
 | |
|      * @return              Returns a map of document property values keyed by property name.
 | |
|      * @throws Throwable    All exception conditions can be handled.
 | |
|      */
 | |
|     private Map<String, Serializable> extractRaw(
 | |
|             ContentReader reader, MetadataExtracterLimits limits) throws Throwable
 | |
|     {
 | |
|         if (limits == null || limits.getTimeoutMs() == -1)
 | |
|         {
 | |
|             return extractRaw(reader);
 | |
|         }
 | |
|         FutureTask<Map<String, Serializable>> task = null;
 | |
|         StreamAwareContentReaderProxy proxiedReader = null;
 | |
|         try
 | |
|         {
 | |
|             proxiedReader = new StreamAwareContentReaderProxy(reader);
 | |
|             task = new FutureTask<Map<String,Serializable>>(new ExtractRawCallable(proxiedReader));
 | |
|             getExecutorService().execute(task);
 | |
|             return task.get(limits.getTimeoutMs(), TimeUnit.MILLISECONDS);
 | |
|         }
 | |
|         catch (TimeoutException e)
 | |
|         {
 | |
|             task.cancel(true);
 | |
|             if (null != proxiedReader)
 | |
|             {
 | |
|                 proxiedReader.release();
 | |
|             }
 | |
|             throw e;
 | |
|         }
 | |
|         catch (InterruptedException e)
 | |
|         {
 | |
|             // We were asked to stop
 | |
|             task.cancel(true);
 | |
|             return null;
 | |
|         }
 | |
|         catch (ExecutionException e)
 | |
|         {
 | |
|             // Unwrap our cause and throw that
 | |
|             Throwable cause = e.getCause();
 | |
|             if (cause != null && cause instanceof ExtractRawCallableException)
 | |
|             {
 | |
|                 cause = ((ExtractRawCallableException) cause).getCause();
 | |
|             }
 | |
|             throw cause;
 | |
|         }
 | |
|     }
 | |
|     
 | |
|     /**
 | |
|      * Override to provide the raw extracted metadata values.  An extracter should extract
 | |
|      * as many of the available properties as is realistically possible.  Even if the
 | |
|      * {@link #getDefaultMapping() default mapping} doesn't handle all properties, it is
 | |
|      * possible for each instance of the extracter to be configured differently and more or
 | |
|      * less of the properties may be used in different installations.
 | |
|      * <p>
 | |
|      * Raw values must not be trimmed or removed for any reason.  Null values and empty
 | |
|      * strings are 
 | |
|      * <ul>
 | |
|      *    <li><b>Null:</b>              Removed</li>
 | |
|      *    <li><b>Empty String:</b>      Passed to the OverwritePolicy</li>
 | |
|      *    <li><b>Non Serializable:</b>  Converted to String or fails if that is not possible</li>
 | |
|      * </ul>
 | |
|      * <p>
 | |
|      * Properties extracted and their meanings and types should be thoroughly described in
 | |
|      * the class-level javadocs of the extracter implementation, for example:
 | |
|      * <pre>
 | |
|      * <b>editor:</b> - the document editor        -->  cm:author
 | |
|      * <b>title:</b>  - the document title         -->  cm:title
 | |
|      * <b>user1:</b>  - the document summary
 | |
|      * <b>user2:</b>  - the document description   -->  cm:description
 | |
|      * <b>user3:</b>  -
 | |
|      * <b>user4:</b>  -
 | |
|      * </pre>
 | |
|      * 
 | |
|      * @param reader        the document to extract the values from.  This stream provided by
 | |
|      *                      the reader must be closed if accessed directly.
 | |
|      * @return              Returns a map of document property values keyed by property name.
 | |
|      * @throws Throwable    All exception conditions can be handled.
 | |
|      * 
 | |
|      * @see #getDefaultMapping()
 | |
|      */
 | |
|     protected abstract Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable;
 | |
| 
 | |
|     /**
 | |
|      * Override to embed metadata values.  An extracter should embed
 | |
|      * as many of the available properties as is realistically possible.  Even if the
 | |
|      * {@link #getDefaultEmbedMapping() default mapping} doesn't handle all properties, it is
 | |
|      * possible for each instance of the extracter to be configured differently and more or
 | |
|      * less of the properties may be used in different installations.
 | |
|      *
 | |
|      * @param metadata		the metadata keys and values to embed in the content file
 | |
|      * @param reader		the reader for the original document.  This stream provided by
 | |
|      *                      the reader must be closed if accessed directly.
 | |
|      * @param writer        the writer for the document to embed the values in.  This stream provided by
 | |
|      *                      the writer must be closed if accessed directly.
 | |
|      * @throws Throwable    All exception conditions can be handled.
 | |
|      *
 | |
|      * @see #getDefaultEmbedMapping()
 | |
|      */
 | |
|     protected void embedInternal(Map<String, Serializable> metadata, ContentReader reader, ContentWriter writer) throws Throwable
 | |
|     {
 | |
|         // TODO make this an abstract method once more extracters support embedding
 | |
|     }
 | |
| }
 |