/* * Copyright (C) 2005-2012 Alfresco Software Limited. * * This file is part of Alfresco * * Alfresco is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Alfresco is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with Alfresco. If not, see . */ package org.alfresco.repo.content.metadata; import java.io.InputStream; import java.io.Serializable; import java.lang.reflect.Array; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.StringTokenizer; import org.alfresco.error.AlfrescoRuntimeException; import org.alfresco.model.ContentModel; import org.alfresco.service.cmr.dictionary.DataTypeDefinition; import org.alfresco.service.cmr.dictionary.DictionaryService; import org.alfresco.service.cmr.dictionary.PropertyDefinition; import org.alfresco.service.cmr.repository.ContentIOException; import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.MalformedNodeRefException; import org.alfresco.service.cmr.repository.MimetypeService; import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; import org.alfresco.service.cmr.repository.datatype.TypeConversionException; import 
org.alfresco.service.namespace.InvalidQNameException; import org.alfresco.service.namespace.QName; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.xmlbeans.impl.xb.xsdschema.All; import org.springframework.beans.factory.BeanNameAware; import org.springframework.extensions.surf.util.ISO8601DateFormat; /** * Support class for metadata extracters that support dynamic and config-driven * mapping between extracted values and model properties. Extraction is broken * up into two phases: * *

* Migrating an existing extracter to use this class is straightforward: *

* * @see #getDefaultMapping() * @see #extractRaw(ContentReader) * @see #setMapping(Map) * * @since 2.1 * * @author Jesper Steen Møller * @author Derek Hulley */ abstract public class AbstractMappingMetadataExtracter implements MetadataExtracter, MetadataEmbedder, BeanNameAware { public static final String NAMESPACE_PROPERTY_PREFIX = "namespace.prefix."; private static final String ERR_TYPE_CONVERSION = "metadata.extraction.err.type_conversion"; protected static Log logger = LogFactory.getLog(AbstractMappingMetadataExtracter.class); private MetadataExtracterRegistry registry; private MimetypeService mimetypeService; private DictionaryService dictionaryService; private boolean initialized; private Set supportedMimetypes; private Set supportedEmbedMimetypes; private OverwritePolicy overwritePolicy; private boolean failOnTypeConversion; protected Set supportedDateFormats = new HashSet(0); private Map> mapping; private Map> embedMapping; private boolean inheritDefaultMapping; private boolean inheritDefaultEmbedMapping; private boolean enableStringTagging; private String beanName; private Properties properties; /** * Default constructor. If this is called, then {@link #isSupported(String)} should * be implemented. This is useful when the list of supported mimetypes is not known * when the instance is constructed. Alternatively, once the set becomes known, call * {@link #setSupportedMimetypes(Collection)}. * * @see #isSupported(String) * @see #setSupportedMimetypes(Collection) */ protected AbstractMappingMetadataExtracter() { this(Collections.emptySet()); } /** * Constructor that can be used when the list of supported mimetypes is known up front. 
* * @param supportedMimetypes the set of mimetypes supported by default */ protected AbstractMappingMetadataExtracter(Set supportedMimetypes) { this.supportedMimetypes = supportedMimetypes; // Set defaults overwritePolicy = OverwritePolicy.PRAGMATIC; failOnTypeConversion = true; mapping = null; // The default will be fetched embedMapping = null; inheritDefaultMapping = false; // Any overrides are complete inheritDefaultEmbedMapping = false; initialized = false; } /** * Constructor that can be used when the list of supported extract and embed mimetypes is known up front. * * @param supportedMimetypes the set of mimetypes supported for extraction by default * @param supportedEmbedMimetypes the set of mimetypes supported for embedding by default */ protected AbstractMappingMetadataExtracter(Set supportedMimetypes, Set supportedEmbedMimetypes) { this(supportedMimetypes); this.supportedEmbedMimetypes = supportedEmbedMimetypes; } /** * Set the registry to register with. If this is not set, then the default * initialization will not auto-register the extracter for general use. It * can still be used directly. * * @param registry a metadata extracter registry */ public void setRegistry(MetadataExtracterRegistry registry) { this.registry = registry; } /** * @param mimetypeService the mimetype service. Set this if required. */ public void setMimetypeService(MimetypeService mimetypeService) { this.mimetypeService = mimetypeService; } /** * @return Returns the mimetype helper */ protected MimetypeService getMimetypeService() { return mimetypeService; } /** * @param dictionaryService the dictionary service to determine which data conversions are necessary */ public void setDictionaryService(DictionaryService dictionaryService) { this.dictionaryService = dictionaryService; } /** * Set the mimetypes that are supported by the extracter. 
* * @param supportedMimetypes */ public void setSupportedMimetypes(Collection supportedMimetypes) { this.supportedMimetypes.clear(); this.supportedMimetypes.addAll(supportedMimetypes); } /** * Set the mimetypes that are supported for embedding. * * @param supportedEmbedMimetypes */ public void setSupportedEmbedMimetypes(Collection supportedEmbedMimetypes) { this.supportedEmbedMimetypes.clear(); this.supportedEmbedMimetypes.addAll(supportedEmbedMimetypes); } /** * {@inheritDoc} * * @see #setSupportedMimetypes(Collection) */ public boolean isSupported(String sourceMimetype) { return supportedMimetypes.contains(sourceMimetype) && isEnabled(sourceMimetype); } /** * {@inheritDoc} * * @see #setSupportedEmbedMimetypes(Collection) */ public boolean isEmbeddingSupported(String sourceMimetype) { if (supportedEmbedMimetypes == null) { return false; } return supportedEmbedMimetypes.contains(sourceMimetype); } private boolean isEnabled(String mimetype) { return properties == null || mimetypeService == null || (getBooleanProperty(beanName+".enabled", true) && getBooleanProperty(beanName+'.'+mimetypeService.getExtension(mimetype)+".enabled", true)); } private boolean getBooleanProperty(String name, boolean defaultValue) { boolean value = defaultValue; if (properties != null) { String property = properties.getProperty(name); if (property != null) { value = property.trim().equalsIgnoreCase("true"); } } return value; } /** * TODO - This doesn't appear to be used, so should be removed / deprecated / replaced * @return Returns 1.0 if the mimetype is supported, otherwise 0.0 * * @see #isSupported(String) */ public double getReliability(String mimetype) { return isSupported(mimetype) ? 1.0D : 0.0D; } /** * Set the policy to use when existing values are encountered. Depending on how the extractor * is called, this may not be relevant, i.e an empty map of existing properties may be passed * in by the client code, which may follow its own overwrite strategy. 
* * @param overwritePolicy the policy to apply when there are existing system properties */ public void setOverwritePolicy(OverwritePolicy overwritePolicy) { this.overwritePolicy = overwritePolicy; } /** * Set the policy to use when existing values are encountered. Depending on how the extractor * is called, this may not be relevant, i.e an empty map of existing properties may be passed * in by the client code, which may follow its own overwrite strategy. * * @param overwritePolicyStr the policy to apply when there are existing system properties */ public void setOverwritePolicy(String overwritePolicyStr) { this.overwritePolicy = OverwritePolicy.valueOf(overwritePolicyStr); } /** * Set whether the extractor should discard metadata that fails to convert to the target type * defined in the data dictionary model. This is true by default i.e. if the data * extracted is not compatible with the target model then the extraction will fail. If this is * false then any extracted data that fails to convert will be discarded. * * @param failOnTypeConversion false to discard properties that can't get converted * to the dictionary-defined type, or true (default) * to fail the extraction if the type doesn't convert */ public void setFailOnTypeConversion(boolean failOnTypeConversion) { this.failOnTypeConversion = failOnTypeConversion; } /** * Set the date formats, over and above the {@link ISO8601DateFormat ISO8601 format}, that will * be supported for string to date conversions. The supported syntax is described by the * {@link http://java.sun.com/j2se/1.5.0/docs/api/java/text/SimpleDateFormat.html SimpleDateFormat Javadocs}. * * @param supportedDateFormats a list of supported date formats. 
*/ public void setSupportedDateFormats(List supportedDateFormats) { this.supportedDateFormats = new HashSet(5); for (String dateFormatStr : supportedDateFormats) { try { /** * Regional date format */ DateFormat df = new SimpleDateFormat(dateFormatStr); this.supportedDateFormats.add(df); /** * */ /** * Date format can be locale specific - make sure English format always works */ /* * TODO MER 25 May 2010 - Added this as a quick fix for IMAP date parsing which is always * English regardless of Locale. Some more thought and/or code is required to configure * the relationship between properties, format and locale. */ DateFormat englishFormat = new SimpleDateFormat(dateFormatStr, Locale.US); this.supportedDateFormats.add(englishFormat); } catch (Throwable e) { // No good throw new AlfrescoRuntimeException("Unable to set supported date format: " + dateFormatStr, e); } } } /** * Set if the property mappings augment or override the mapping generically provided by the * extracter implementation. The default is false, i.e. any mapping set completely * replaces the {@link #getDefaultMapping() default mappings}. * * @param inheritDefaultMapping true to add the configured mapping * to the list of default mappings. * * @see #getDefaultMapping() * @see #setMapping(Map) * @see #setMappingProperties(Properties) */ public void setInheritDefaultMapping(boolean inheritDefaultMapping) { this.inheritDefaultMapping = inheritDefaultMapping; } @Override public void setBeanName(String beanName) { this.beanName = beanName; } public String getBeanName() { return beanName; } /** * The Alfresco global properties. */ public void setProperties(Properties properties) { this.properties = properties; } /** * Whether or not to enable the pass through of simple strings to cm:taggable tags * * @param enableStringTagging true find or create tags for each string * mapped to cm:taggable. false (default) * ignore mapping strings to tags. 
*/ public void setEnableStringTagging(boolean enableStringTagging) { this.enableStringTagging = enableStringTagging; } /** * Set if the embed property mappings augment or override the mapping generically provided by the * extracter implementation. The default is false, i.e. any mapping set completely * replaces the {@link #getDefaultEmbedMapping() default mappings}. * * @param inheritDefaultEmbedMapping true to add the configured embed mapping * to the list of default embed mappings. * * @see #getDefaultEmbedMapping() * @see #setEmbedMapping(Map) * @see #setEmbedMappingProperties(Properties) */ public void setInheritDefaultEmbedMapping(boolean inheritDefaultEmbedMapping) { this.inheritDefaultEmbedMapping = inheritDefaultEmbedMapping; } /** * Set the mapping from document metadata to system metadata. It is possible to direct * an extracted document property to several system properties. The conversion between * the document property types and the system property types will be done by the * {@link org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter default converter}. * * @param mapping a mapping from document metadata to system metadata */ public void setMapping(Map> mapping) { this.mapping = mapping; } /** * Set the embed mapping from document metadata to system metadata. It is possible to direct * an model properties to several content file metadata keys. The conversion between * the model property types and the content file metadata keys types will be done by the * {@link org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter default converter}. * * @param embedMapping an embed mapping from model properties to content file metadata keys */ public void setEmbedMapping(Map> embedMapping) { this.embedMapping = embedMapping; } /** * Set the properties that contain the mapping from document metadata to system metadata. * This is an alternative to the {@link #setMapping(Map)} method. Any mappings already * present will be cleared out. 
* * The property mapping is of the form: *
     * # Namespaces prefixes
     * namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
     * namespace.prefix.my=http://www....com/alfresco/1.0
     * 
     * # Mapping
     * editor=cm:author, my:editor
     * title=cm:title
     * user1=cm:summary
     * user2=cm:description
     * 
* The mapping can therefore be from a single document property onto several system properties. * * @param mappingProperties the properties that map document properties to system properties */ public void setMappingProperties(Properties mappingProperties) { mapping = readMappingProperties(mappingProperties); } /** * Set the properties that contain the embed mapping from model properties to content file metadata. * This is an alternative to the {@link #setEmbedMapping(Map)} method. Any mappings already * present will be cleared out. * * The property mapping is of the form: *
     * # Namespaces prefixes
     * namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
     * namespace.prefix.my=http://www....com/alfresco/1.0
     *
     * # Mapping
     * cm\:author=editor
     * cm\:title=title
     * cm\:summary=user1
     * cm\:description=description,user2
     * 
* The embed mapping can therefore be from a model property onto several content file metadata properties. * * @param embedMappingProperties the properties that map model properties to content file metadata properties */ public void setEmbedMappingProperties(Properties embedMappingProperties) { embedMapping = readEmbedMappingProperties(embedMappingProperties); } /** * Helper method for derived classes to obtain the mappings that will be applied to raw * values. This should be called after initialization in order to guarantee the complete * map is given. *

* Normally, the list of properties that can be extracted from a document is fixed and * well-known - in that case, just extract everything. But Some implementations may have * an extra, indeterminate set of values available for extraction. If the extraction of * these runtime parameters is expensive, then the keys provided by the return value can * be used to extract values from the documents. The metadata extraction becomes fully * configuration-driven, i.e. declaring further mappings will result in more values being * extracted from the documents. *

* Most extractors will not be using this method. For an example of its use, see the * {@linkplain OpenDocumentMetadataExtracter OpenDocument extractor}, which uses the mapping * to select specific user properties from a document. */ protected final Map> getMapping() { if (!initialized) { throw new UnsupportedOperationException("The complete mapping is only available after initialization."); } return Collections.unmodifiableMap(mapping); } /** * Helper method for derived classes to obtain the embed mappings. * This should be called after initialization in order to guarantee the complete * map is given. *

* Normally, the list of properties that can be embedded in a document is fixed and * well-known.. But some implementations may have * an extra, indeterminate set of values available for embedding. If the embedding of * these runtime parameters is expensive, then the keys provided by the return value can * be used to embed values in the documents. The metadata embedding becomes fully * configuration-driven, i.e. declaring further mappings will result in more values being * embedded in the documents. */ protected final Map> getEmbedMapping() { if (!initialized) { throw new UnsupportedOperationException("The complete embed mapping is only available after initialization."); } return Collections.unmodifiableMap(embedMapping); } /** * A utility method to read mapping properties from a resource file and convert to the map form. * * @param propertiesUrl A standard Properties file URL location * * @see #setMappingProperties(Properties) */ protected Map> readMappingProperties(String propertiesUrl) { InputStream is = null; try { is = getClass().getClassLoader().getResourceAsStream(propertiesUrl); if(is == null) { throw new AlfrescoRuntimeException( "Metadata Extracter mapping properties not found: \n" + " Extracter: " + this + "\n" + " Bundle: " + propertiesUrl); } Properties props = new Properties(); props.load(is); // Process it Map> map = readMappingProperties(props); // Done if (logger.isDebugEnabled()) { logger.debug("Loaded mapping properties from resource: " + propertiesUrl); } return map; } catch (Throwable e) { throw new AlfrescoRuntimeException( "Unable to load properties file to read extracter mapping properties: \n" + " Extracter: " + this + "\n" + " Bundle: " + propertiesUrl, e); } finally { if (is != null) { try { is.close(); } catch (Throwable e) {} } } } /** * A utility method to convert mapping properties to the Map form. 
* * @see #setMappingProperties(Properties) */ protected Map> readMappingProperties(Properties mappingProperties) { Map namespacesByPrefix = new HashMap(5); // Get the namespaces for (Map.Entry entry : mappingProperties.entrySet()) { String propertyName = (String) entry.getKey(); if (propertyName.startsWith(NAMESPACE_PROPERTY_PREFIX)) { String prefix = propertyName.substring(17); String namespace = (String) entry.getValue(); namespacesByPrefix.put(prefix, namespace); } } // Create the mapping Map> convertedMapping = new HashMap>(17); for (Map.Entry entry : mappingProperties.entrySet()) { String documentProperty = (String) entry.getKey(); String qnamesStr = (String) entry.getValue(); if (documentProperty.startsWith(NAMESPACE_PROPERTY_PREFIX)) { // Ignore these now continue; } // Create the entry Set qnames = new HashSet(3); convertedMapping.put(documentProperty, qnames); // The to value can be a list of QNames StringTokenizer tokenizer = new StringTokenizer(qnamesStr, ","); while (tokenizer.hasMoreTokens()) { String qnameStr = tokenizer.nextToken().trim(); // Check if we need to resolve a namespace reference int index = qnameStr.indexOf(QName.NAMESPACE_PREFIX); if (index > -1 && qnameStr.charAt(0) != QName.NAMESPACE_BEGIN) { String prefix = qnameStr.substring(0, index); String suffix = qnameStr.substring(index + 1); // It is prefixed String uri = namespacesByPrefix.get(prefix); if (uri == null) { throw new AlfrescoRuntimeException( "No prefix mapping for extracter property mapping: \n" + " Extracter: " + this + "\n" + " Mapping: " + entry); } qnameStr = QName.NAMESPACE_BEGIN + uri + QName.NAMESPACE_END + suffix; } try { QName qname = QName.createQName(qnameStr); // Add it to the mapping qnames.add(qname); } catch (InvalidQNameException e) { throw new AlfrescoRuntimeException( "Can't create metadata extracter property mapping: \n" + " Extracter: " + this + "\n" + " Mapping: " + entry); } } if (logger.isTraceEnabled()) { logger.trace("Added mapping from " + 
documentProperty + " to " + qnames); } } // Done return convertedMapping; } /** * A utility method to read embed mapping properties from a resource file and convert to the map form. * * @param propertiesUrl A standard Properties file URL location * * @see #setEmbedMappingProperties(Properties) */ protected Map> readEmbedMappingProperties(String propertiesUrl) { InputStream is = null; try { is = getClass().getClassLoader().getResourceAsStream(propertiesUrl); if(is == null) { return null; } Properties props = new Properties(); props.load(is); // Process it Map> map = readEmbedMappingProperties(props); // Done if (logger.isDebugEnabled()) { logger.debug("Loaded embed mapping properties from resource: " + propertiesUrl); } return map; } catch (Throwable e) { throw new AlfrescoRuntimeException( "Unable to load properties file to read extracter embed mapping properties: \n" + " Extracter: " + this + "\n" + " Bundle: " + propertiesUrl, e); } finally { if (is != null) { try { is.close(); } catch (Throwable e) {} } } } /** * A utility method to convert mapping properties to the Map form. *

* Different from readMappingProperties in that keys are the Alfresco QNames * and values are file metadata properties. * * @see #setMappingProperties(Properties) */ protected Map> readEmbedMappingProperties(Properties mappingProperties) { Map namespacesByPrefix = new HashMap(5); // Get the namespaces for (Map.Entry entry : mappingProperties.entrySet()) { String propertyName = (String) entry.getKey(); if (propertyName.startsWith(NAMESPACE_PROPERTY_PREFIX)) { String prefix = propertyName.substring(17); String namespace = (String) entry.getValue(); namespacesByPrefix.put(prefix, namespace); } } // Create the mapping Map> convertedMapping = new HashMap>(17); for (Map.Entry entry : mappingProperties.entrySet()) { String modelProperty = (String) entry.getKey(); String metadataKeysString = (String) entry.getValue(); if (modelProperty.startsWith(NAMESPACE_PROPERTY_PREFIX)) { // Ignore these now continue; } int index = modelProperty.indexOf(QName.NAMESPACE_PREFIX); if (index > -1 && modelProperty.charAt(0) != QName.NAMESPACE_BEGIN) { String prefix = modelProperty.substring(0, index); String suffix = modelProperty.substring(index + 1); // It is prefixed String uri = namespacesByPrefix.get(prefix); if (uri == null) { throw new AlfrescoRuntimeException( "No prefix mapping for embed property mapping: \n" + " Extracter: " + this + "\n" + " Mapping: " + entry); } modelProperty = QName.NAMESPACE_BEGIN + uri + QName.NAMESPACE_END + suffix; } try { QName qname = QName.createQName(modelProperty); String[] metadataKeysArray = metadataKeysString.split(","); Set metadataKeys = new HashSet(metadataKeysArray.length); for (String metadataKey : metadataKeysArray) { metadataKeys.add(metadataKey.trim()); } // Create the entry convertedMapping.put(qname, metadataKeys); } catch (InvalidQNameException e) { throw new AlfrescoRuntimeException( "Can't create metadata embedding property mapping: \n" + " Extracter: " + this + "\n" + " Mapping: " + entry); } if (logger.isTraceEnabled()) { 
logger.trace("Added mapping from " + modelProperty + " to " + metadataKeysString); } } // Done return convertedMapping; } /** * Registers this instance of the extracter with the registry. This will call the * {@link #init()} method and then register if the registry is available. * * @see #setRegistry(MetadataExtracterRegistry) * @see #init() */ public final void register() { init(); // Register the extracter, if necessary if (registry != null) { registry.register(this); } } /** * Provides a hook point for implementations to perform initialization. The base * implementation must be invoked or the extracter will fail during extraction. * The {@link #getDefaultMapping() default mappings} will be requested during * initialization. */ protected void init() { Map> defaultMapping = getDefaultMapping(); if (defaultMapping == null) { throw new AlfrescoRuntimeException("The metadata extracter must provide a default mapping: " + this); } // Was a mapping explicitly provided if (mapping == null) { // No mapping, so use the default mapping = defaultMapping; } else if (inheritDefaultMapping) { // Merge the default mapping into the configured mapping for (String documentKey : defaultMapping.keySet()) { Set systemQNames = mapping.get(documentKey); if (systemQNames == null) { systemQNames = new HashSet(3); mapping.put(documentKey, systemQNames); } Set defaultQNames = defaultMapping.get(documentKey); systemQNames.addAll(defaultQNames); } } // The configured mappings are empty, but there were default mappings if (mapping.size() == 0 && defaultMapping.size() > 0) { logger.warn( "There are no property mappings for the metadata extracter.\n" + " Nothing will be extracted by: " + this); } Map> defaultEmbedMapping = getDefaultEmbedMapping(); // Was a mapping explicitly provided if (embedMapping == null) { // No mapping, so use the default embedMapping = defaultEmbedMapping; } else if (inheritDefaultEmbedMapping) { // Merge the default mapping into the configured mapping for (QName 
modelProperty : defaultEmbedMapping.keySet()) { Set metadataKeys = embedMapping.get(modelProperty); if (metadataKeys == null) { metadataKeys = new HashSet(3); embedMapping.put(modelProperty, metadataKeys); } Set defaultMetadataKeys = defaultEmbedMapping.get(modelProperty); metadataKeys.addAll(defaultMetadataKeys); } } // Done initialized = true; } /** {@inheritDoc} */ public long getExtractionTime() { return 1000L; } /** * Checks if the mimetype is supported. * * @param reader the reader to check * @throws AlfrescoRuntimeException if the mimetype is not supported */ protected void checkIsSupported(ContentReader reader) { String mimetype = reader.getMimetype(); if (!isSupported(mimetype)) { throw new AlfrescoRuntimeException( "Metadata extracter does not support mimetype: \n" + " reader: " + reader + "\n" + " supported: " + supportedMimetypes + "\n" + " extracter: " + this); } } /** * Checks if embedding for the mimetype is supported. * * @param writer the writer to check * @throws AlfrescoRuntimeException if embedding for the mimetype is not supported */ protected void checkIsEmbedSupported(ContentWriter writer) { String mimetype = writer.getMimetype(); if (!isSupported(mimetype)) { throw new AlfrescoRuntimeException( "Metadata extracter does not support embedding mimetype: \n" + " writer: " + writer + "\n" + " supported: " + supportedEmbedMimetypes + "\n" + " extracter: " + this); } } /** * {@inheritDoc} */ public final Map extract(ContentReader reader, Map destination) { return extract(reader, this.overwritePolicy, destination, this.mapping); } /** * {@inheritDoc} */ public final Map extract( ContentReader reader, OverwritePolicy overwritePolicy, Map destination) { return extract(reader, overwritePolicy, destination, this.mapping); } /** * {@inheritDoc} */ public Map extract( ContentReader reader, OverwritePolicy overwritePolicy, Map destination, Map> mapping) { // Done if (logger.isDebugEnabled()) { logger.debug("Starting metadata extraction: \n" + " reader: " + 
reader + "\n" + " extracter: " + this); } if (!initialized) { throw new AlfrescoRuntimeException( "Metadata extracter not initialized.\n" + " Call the 'register' method on: " + this + "\n" + " Implementations of the 'init' method must call the base implementation."); } // check the reliability checkIsSupported(reader); Map changedProperties = null; try { Map rawMetadata = null; // Check that the content has some meat if (reader.getSize() > 0 && reader.exists()) { rawMetadata = extractRaw(reader); } else { rawMetadata = new HashMap(1); } // Convert to system properties (standalone) Map systemProperties = mapRawToSystem(rawMetadata); // Convert the properties according to the dictionary types systemProperties = convertSystemPropertyValues(systemProperties); // Last chance to filter the system properties map before applying them filterSystemProperties(systemProperties, destination); // Now use the proper overwrite policy changedProperties = overwritePolicy.applyProperties(systemProperties, destination); if(logger.isDebugEnabled()) { logger.debug("Extracted Metadata from " + reader + "\n Found: " + rawMetadata + "\n Mapped and Accepted: " + changedProperties); } } catch (Throwable e) { // Ask Tika to detect the document, and report back on if // the current mime type is plausible String typeErrorMessage = null; String differentType = null; if(mimetypeService != null) { differentType = mimetypeService.getMimetypeIfNotMatches(reader.getReader()); } else { logger.info("Unable to verify mimetype of " + reader.getReader() + " as no MimetypeService available to " + getClass().getName()); } if(differentType != null) { typeErrorMessage = "\n" + " claimed mime type: " + reader.getMimetype() + "\n" + " detected mime type: " + differentType; } if (logger.isDebugEnabled()) { logger.debug( "Metadata extraction failed: \n" + " Extracter: " + this + "\n" + " Content: " + reader + typeErrorMessage, e); } else { logger.warn( "Metadata extraction failed (turn on DEBUG for full error): 
\n" + " Extracter: " + this + "\n" + " Content: " + reader + "\n" + " Failure: " + e.getMessage() + typeErrorMessage); } } finally { // check that the reader was closed (if used) if (reader.isChannelOpen()) { logger.error("Content reader not closed by metadata extracter: \n" + " reader: " + reader + "\n" + " extracter: " + this); } // Make sure that we have something to return if (changedProperties == null) { changedProperties = new HashMap(0); } } // Done if (logger.isDebugEnabled()) { logger.debug("Completed metadata extraction: \n" + " reader: " + reader + "\n" + " extracter: " + this + "\n" + " changed: " + changedProperties); } return changedProperties; } /** * {@inheritDoc} */ public final void embed( Map properties, ContentReader reader, ContentWriter writer) { // Done if (logger.isDebugEnabled()) { logger.debug("Starting metadata embedding: \n" + " reader: " + reader + "\n" + " writer: " + writer + "\n" + " extracter: " + this); } if (!initialized) { throw new AlfrescoRuntimeException( "Metadata extracter not initialized.\n" + " Call the 'register' method on: " + this + "\n" + " Implementations of the 'init' method must call the base implementation."); } // check the reliability checkIsEmbedSupported(writer); try { embedInternal(mapSystemToRaw(properties), reader, writer); if(logger.isDebugEnabled()) { logger.debug("Embedded Metadata into " + writer); } } catch (Throwable e) { // Ask Tika to detect the document, and report back on if // the current mime type is plausible String typeErrorMessage = ""; String differentType = null; if(mimetypeService != null) { try { differentType = mimetypeService.getMimetypeIfNotMatches(writer.getReader()); } catch (ContentIOException cioe) { // Embedding failed and writer is empty } } else { logger.info("Unable to verify mimetype of " + writer.getReader() + " as no MimetypeService available to " + getClass().getName()); } if(differentType != null) { typeErrorMessage = "\n" + " claimed mime type: " + writer.getMimetype() + 
"\n" + " detected mime type: " + differentType; } if (logger.isDebugEnabled()) { logger.debug( "Metadata embedding failed: \n" + " Extracter: " + this + "\n" + " Content: " + writer + typeErrorMessage, e); } else { logger.error( "Metadata embedding failed (turn on DEBUG for full error): \n" + " Extracter: " + this + "\n" + " Content: " + writer + "\n" + " Failure: " + e.getMessage() + typeErrorMessage); } } finally { // check that the writer was closed (if used) if (writer.isChannelOpen()) { logger.error("Content writer not closed by metadata extracter: \n" + " writer: " + writer + "\n" + " extracter: " + this); } } // Done if (logger.isDebugEnabled()) { logger.debug("Completed metadata embedding: \n" + " writer: " + writer + "\n" + " extracter: " + this); } } /** * * @param rawMetadata Metadata keyed by document properties * @return Returns the metadata keyed by the system properties */ private Map mapRawToSystem(Map rawMetadata) { Map systemProperties = new HashMap(rawMetadata.size() * 2 + 1); for (Map.Entry entry : rawMetadata.entrySet()) { String documentKey = entry.getKey(); // Check if there is a mapping for this if (!mapping.containsKey(documentKey)) { // No mapping - ignore continue; } Serializable documentValue = entry.getValue(); Set systemQNames = mapping.get(documentKey); for (QName systemQName : systemQNames) { systemProperties.put(systemQName, documentValue); } } // Done if (logger.isDebugEnabled()) { logger.debug( "Converted extracted raw values to system values: \n" + " Raw Properties: " + rawMetadata + "\n" + " System Properties: " + systemProperties); } return systemProperties; } /** * * @param systemMetadata Metadata keyed by system properties * @return Returns the metadata keyed by the content file metadata properties */ private Map mapSystemToRaw(Map systemMetadata) { Map metadataProperties = new HashMap(systemMetadata.size() * 2 + 1); for (Map.Entry entry : systemMetadata.entrySet()) { QName modelProperty = entry.getKey(); // Check if there is 
a mapping for this if (!embedMapping.containsKey(modelProperty)) { // No mapping - ignore continue; } Serializable documentValue = entry.getValue(); Set metadataKeys = embedMapping.get(modelProperty); for (String metadataKey : metadataKeys) { metadataProperties.put(metadataKey, documentValue); } } // Done if (logger.isDebugEnabled()) { logger.debug( "Converted system model values to metadata values: \n" + " System Properties: " + systemMetadata + "\n" + " Metadata Properties: " + metadataProperties); } return metadataProperties; } /** * Filters the system properties that are going to be applied. Gives the metadata extracter an * opportunity to remove properties that may not be appropriate in a given context. * * @param systemProperties map of system properties to be applied * @param targetProperties map of target properties, may be used to provide to the context requried */ protected void filterSystemProperties(Map systemProperties, Map targetProperties) { // Default implementation does nothing } /** * Converts all values according to their dictionary-defined type. This uses the * {@link #setFailOnTypeConversion(boolean) failOnTypeConversion flag} to determine how failures * are handled i.e. if values fail to convert, the process may discard the property. * * @param systemProperties the values keyed to system property names * @return Returns a modified map of properties that have been converted. */ @SuppressWarnings("unchecked") private Map convertSystemPropertyValues(Map systemProperties) { Map convertedProperties = new HashMap(systemProperties.size() + 7); for (Map.Entry entry : systemProperties.entrySet()) { QName propertyQName = entry.getKey(); Serializable propertyValue = entry.getValue(); // Get the property definition PropertyDefinition propertyDef = (dictionaryService == null) ? 
null : dictionaryService.getProperty(propertyQName); if (propertyDef == null) { // There is nothing in the DD about this so just transfer it convertedProperties.put(propertyQName, propertyValue); continue; } // It is in the DD, so attempt the conversion DataTypeDefinition propertyTypeDef = propertyDef.getDataType(); Serializable convertedPropertyValue = null; try { // Attempt to make any date conversions if (propertyTypeDef.getName().equals(DataTypeDefinition.DATE) || propertyTypeDef.getName().equals(DataTypeDefinition.DATETIME)) { if (propertyValue instanceof Date) { convertedPropertyValue = propertyValue; } else if (propertyValue instanceof Collection) { convertedPropertyValue = (Serializable) makeDates((Collection) propertyValue); } else if (propertyValue instanceof String) { convertedPropertyValue = makeDate((String) propertyValue); } else if (propertyValue == null) { convertedPropertyValue = null; } else { if (logger.isWarnEnabled()) { StringBuilder mesg = new StringBuilder(); mesg.append("Unable to convert Date property: ").append(propertyQName) .append(", value: ").append(propertyValue).append(", type: ").append(propertyTypeDef.getName()); logger.warn(mesg.toString()); } } } else { if (propertyValue instanceof Collection) { convertedPropertyValue = (Serializable) DefaultTypeConverter.INSTANCE.convert( propertyTypeDef, (Collection) propertyValue); } else if (propertyValue instanceof Object[]) { convertedPropertyValue = (Serializable) DefaultTypeConverter.INSTANCE.convert( propertyTypeDef, (Object[]) propertyValue); } else { convertedPropertyValue = (Serializable) DefaultTypeConverter.INSTANCE.convert( propertyTypeDef, propertyValue); } } convertedProperties.put(propertyQName, convertedPropertyValue); } catch (TypeConversionException e) { logger.warn( "Type conversion failed during metadata extraction: \n" + " Failure: " + e.getMessage() + "\n" + " Type: " + propertyTypeDef + "\n" + " Value: " + propertyValue); // Do we just absorb this or is it a problem? 
if (failOnTypeConversion) { throw AlfrescoRuntimeException.create( e, ERR_TYPE_CONVERSION, this, propertyQName, propertyTypeDef.getName(), propertyValue); } } catch (MalformedNodeRefException e) { if (propertyQName.equals(ContentModel.PROP_TAGS)) { if (enableStringTagging) { // We must want to map tag string values instead of nodeRefs // ContentMetadataExtracter will take care of tagging by string ArrayList list = new ArrayList(1); for (Object value : (Object[]) propertyValue) { list.add(value); } convertedProperties.put(propertyQName, list); } else { if (logger.isInfoEnabled()) { logger.info("enableStringTagging is false and could not convert " + propertyQName.toString() + ": " + e.getMessage()); } } } else { if (failOnTypeConversion) { throw e; } } } } // Done return convertedProperties; } /** * Convert a collection of date String to Date objects */ private Collection makeDates(Collection dateStrs) { List dates = new ArrayList(dateStrs.size()); for (String dateStr : dateStrs) { Date date = makeDate(dateStr); dates.add(date); } return dates; } /** * Convert a date String to a Date object */ protected Date makeDate(String dateStr) { if (dateStr == null || dateStr.length() == 0) { return null; } Date date = null; try { date = DefaultTypeConverter.INSTANCE.convert(Date.class, dateStr); } catch (TypeConversionException e) { // Try one of the other formats for (DateFormat df : this.supportedDateFormats) { try { date = df.parse(dateStr); } catch (ParseException ee) { // Didn't work } } if (date == null) { // Still no luck throw new TypeConversionException("Unable to convert string to date: " + dateStr); } } return date; } /** * Adds a value to the map, conserving null values. Values are converted to null if: *
     * <ul>
     *   <li>it is an empty string value after trimming</li>
     *   <li>it is an empty collection</li>
     *   <li>it is an empty array</li>
     * </ul>
* String values are trimmed before being put into the map. * Otherwise, it is up to the extracter to ensure that the value is a Serializable. * It is not appropriate to implicitly convert values in order to make them Serializable * - the best conversion method will depend on the value's specific meaning. * * @param key the destination key * @param value the serializable value * @param destination the map to put values into * @return Returns true if set, otherwise false */ protected boolean putRawValue(String key, Serializable value, Map destination) { if (value == null) { // Just keep this } else if (value instanceof String) { String valueStr = ((String) value).trim(); if (valueStr.length() == 0) { value = null; } else { if(valueStr.indexOf("\u0000") != -1) { valueStr = valueStr.replaceAll("\u0000", ""); } // Keep the trimmed value value = valueStr; } } else if (value instanceof Collection) { Collection valueCollection = (Collection) value; if (valueCollection.isEmpty()) { value = null; } } else if (value.getClass().isArray()) { if (Array.getLength(value) == 0) { value = null; } } // It passed all the tests destination.put(key, value); return true; } /** * Helper method to fetch a clean map into which raw values can be dumped. * * @return Returns an empty map */ protected final Map newRawMap() { return new HashMap(17); } /** * This method provides a best guess of where to store the values extracted * from the documents. The list of properties mapped by default need not * include all properties extracted from the document; just the obvious set of mappings * need be supplied. * Implementations must either provide the default mapping properties in the expected * location or override the method to provide the default mapping. *

     * <p>
     * The default implementation looks for the default mapping file in the location
     * given by the class name and <i>.properties</i>.  If the extracter's class is
     * <b>x.y.z.MyExtracter</b> then the default properties will be picked up at
     * <b>classpath:/alfresco/metadata/MyExtracter.properties</b>.
     * The previous location of <b>classpath:/x/y/z/MyExtracter.properties</b> is
     * still supported but may be removed in a future release.
     * Inner classes are supported, but the '$' in the class name is replaced with '-', so
     * default properties for <b>x.y.z.MyStuff$MyExtracter</b> will be located using
     * <b>classpath:/alfresco/metadata/MyStuff-MyExtracter.properties</b>.
     * <p>
     * The default mapping implementation should include thorough Javadocs so that the
     * system administrators can accurately determine how to best enhance or override the
     * default mapping.
     * <p>
     * If the default mapping is declared in a properties file other than the one named after
     * the class, then the {@link #readMappingProperties(String)} method can be used to quickly
     * generate the return value:
     * <pre><code>
     *      protected Map&lt;String, Set&lt;QName&gt;&gt; getDefaultMapping()
     *      {
     *          return readMappingProperties(DEFAULT_MAPPING);
     *      }
     * </code></pre>
* The map can also be created in code either statically or during the call. * * @return Returns the default, static mapping. It may not be null. * * @see #setInheritDefaultMapping(boolean inherit) */ protected Map> getDefaultMapping() { AlfrescoRuntimeException metadataLocationReadException = null; try { // Can't use getSimpleName here because we lose inner class $ processing String className = this.getClass().getName(); String shortClassName = className.split("\\.")[className.split("\\.").length - 1]; // Replace $ shortClassName = shortClassName.replace('$', '-'); // Append .properties String metadataPropertiesUrl = "alfresco/metadata/" + shortClassName + ".properties"; // Attempt to load the properties return readMappingProperties(metadataPropertiesUrl); } catch (AlfrescoRuntimeException e) { // We'll save this to throw at someone later metadataLocationReadException = e; } // Try package location try { String canonicalClassName = this.getClass().getName(); // Replace $ canonicalClassName = canonicalClassName.replace('$', '-'); // Replace . canonicalClassName = canonicalClassName.replace('.', '/'); // Append .properties String packagePropertiesUrl = canonicalClassName + ".properties"; // Attempt to load the properties return readMappingProperties(packagePropertiesUrl); } catch (AlfrescoRuntimeException e) { // Not found in either location, but we want to throw the error for the new metadata location throw metadataLocationReadException; } } /** * This method provides a best guess of what model properties should be embedded * in content. The list of properties mapped by default need not * include all properties to be embedded in the document; just the obvious set of mappings * need be supplied. * Implementations must either provide the default mapping properties in the expected * location or override the method to provide the default mapping. *

     * <p>
     * The default implementation looks for the default mapping file in the location
     * given by the class name and <i>.embed.properties</i>.  If the extracter's class is
     * <b>x.y.z.MyExtracter</b> then the default properties will be picked up at
     * <b>classpath:/x/y/z/MyExtracter.embed.properties</b>.
     * Inner classes are supported, but the '$' in the class name is replaced with '-', so
     * default properties for <b>x.y.z.MyStuff$MyExtracter</b> will be located using
     * <b>x.y.z.MyStuff-MyExtracter.embed.properties</b>.
     * <p>
     * The default mapping implementation should include thorough Javadocs so that the
     * system administrators can accurately determine how to best enhance or override the
     * default mapping.
     * <p>
     * If the default mapping is declared in a properties file other than the one named after
     * the class, then the {@link #readEmbedMappingProperties(String)} method can be used to quickly
     * generate the return value:
     * <pre><code>
     *      protected Map&lt;QName, Set&lt;String&gt;&gt; getDefaultEmbedMapping()
     *      {
     *          return readEmbedMappingProperties(DEFAULT_MAPPING);
     *      }
     * </code></pre>
     * The map can also be created in code either statically or during the call.
     *
* If no embed mapping properties file is found a reverse of the extract * mapping in {@link #getDefaultMapping()} will be assumed with the first QName in each * value used as the key for this mapping and a last win approach for duplicates. * * @return Returns the default, static embed mapping. It may not be null. * * @see #setInheritDefaultMapping(boolean inherit) */ protected Map> getDefaultEmbedMapping() { Map> embedMapping = null; String metadataPropertiesUrl = null; try { // Can't use getSimpleName here because we lose inner class $ processing String className = this.getClass().getName(); String shortClassName = className.split("\\.")[className.split("\\.").length - 1]; // Replace $ shortClassName = shortClassName.replace('$', '-'); // Append .properties metadataPropertiesUrl = "alfresco/metadata/" + shortClassName + ".embed.properties"; // Attempt to load the properties embedMapping = readEmbedMappingProperties(metadataPropertiesUrl); } catch (AlfrescoRuntimeException e) { // No embed mapping found at default location } // Try package location try { String canonicalClassName = this.getClass().getName(); // Replace $ canonicalClassName = canonicalClassName.replace('$', '-'); // Replace . 
canonicalClassName = canonicalClassName.replace('.', '/'); // Append .properties String packagePropertiesUrl = canonicalClassName + ".embed.properties"; // Attempt to load the properties embedMapping = readEmbedMappingProperties(packagePropertiesUrl); } catch (AlfrescoRuntimeException e) { // No embed mapping found at legacy location } if (embedMapping == null) { if (logger.isDebugEnabled()) { logger.debug("No explicit embed mapping properties found at: " + metadataPropertiesUrl + ", assuming reverse of extract mapping"); } Map> extractMapping = this.mapping; if (extractMapping == null || extractMapping.size() == 0) { extractMapping = getDefaultMapping(); } embedMapping = new HashMap>(extractMapping.size()); for (String metadataKey : extractMapping.keySet()) { if (extractMapping.get(metadataKey) != null && extractMapping.get(metadataKey).size() > 0) { QName modelProperty = extractMapping.get(metadataKey).iterator().next(); Set metadataKeys = embedMapping.get(modelProperty); if (metadataKeys == null) { metadataKeys = new HashSet(1); embedMapping.put(modelProperty, metadataKeys); } metadataKeys.add(metadataKey); if (logger.isTraceEnabled()) { logger.trace("Added mapping from " + modelProperty + " to " + metadataKeys.toString()); } } } } return embedMapping; } /** * Override to provide the raw extracted metadata values. An extracter should extract * as many of the available properties as is realistically possible. Even if the * {@link #getDefaultMapping() default mapping} doesn't handle all properties, it is * possible for each instance of the extracter to be configured differently and more or * less of the properties may be used in different installations. *

     * <p>
     * Raw values must not be trimmed or removed for any reason.  Null values and empty
     * strings are
     * <ul>
     *    <li>Null:             Removed</li>
     *    <li>Empty String:     Passed to the {@link OverwritePolicy}</li>
     *    <li>Non Serializable: Converted to String or fails if that is not possible</li>
     * </ul>
     * <p>
     * Properties extracted and their meanings and types should be thoroughly described in
     * the class-level javadocs of the extracter implementation, for example:
     * <pre>
     * editor: - the document editor        -->  cm:author
     * title:  - the document title         -->  cm:title
     * user1:  - the document summary
     * user2:  - the document description   -->  cm:description
     * user3:  -
     * user4:  -
     * </pre>
     * 
* * @param reader the document to extract the values from. This stream provided by * the reader must be closed if accessed directly. * @return Returns a map of document property values keyed by property name. * @throws All exception conditions can be handled. * * @see #getDefaultMapping() */ protected abstract Map extractRaw(ContentReader reader) throws Throwable; /** * Override to embed metadata values. An extracter should embed * as many of the available properties as is realistically possible. Even if the * {@link #getDefaultEmbedMapping() default mapping} doesn't handle all properties, it is * possible for each instance of the extracter to be configured differently and more or * less of the properties may be used in different installations. * * @param metadata the metadata keys and values to embed in the content file * @param reader the reader for the original document. This stream provided by * the reader must be closed if accessed directly. * @param writer the writer for the document to embed the values in. This stream provided by * the writer must be closed if accessed directly. * @throws All exception conditions can be handled. * * @see #getDefaultEmbedMapping() */ protected void embedInternal(Map metadata, ContentReader reader, ContentWriter writer) throws Throwable { // TODO make this an abstract method once more extracters support embedding } }