/* * Copyright (C) 2005-2007 Alfresco Software Limited. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * As a special exception to the terms and conditions of version 2.0 of * the GPL, you may redistribute this Program in connection with Free/Libre * and Open Source Software ("FLOSS") applications as described in Alfresco's * FLOSS exception. You should have recieved a copy of the text describing * the FLOSS exception, and it is also available here: * http://www.alfresco.com/legal/licensing" */ package org.alfresco.repo.content.metadata; import java.io.InputStream; import java.io.Serializable; import java.lang.reflect.Array; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.StringTokenizer; import org.alfresco.error.AlfrescoRuntimeException; import org.alfresco.service.cmr.dictionary.DataTypeDefinition; import org.alfresco.service.cmr.dictionary.DictionaryService; import org.alfresco.service.cmr.dictionary.PropertyDefinition; import org.alfresco.service.cmr.repository.ContentIOException; import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.MimetypeService; import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; import org.alfresco.service.cmr.repository.datatype.TypeConversionException; import org.alfresco.service.namespace.InvalidQNameException; import org.alfresco.service.namespace.QName; import org.alfresco.util.ISO8601DateFormat; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * Support class for metadata extracters that support dynamic and config-driven * mapping between extracted values and model properties. Extraction is broken * up into two phases: * *

* Migrating an existing extracter to use this class is straightforward: *

* * @see #getDefaultMapping() * @see #extractRaw(ContentReader) * @see #setMapping(Map) * * @since 2.1 * * @author Jesper Steen Møller * @author Derek Hulley */ abstract public class AbstractMappingMetadataExtracter implements MetadataExtracter { public static final String NAMESPACE_PROPERTY_PREFIX = "namespace.prefix."; private static final String ERR_TYPE_CONVERSION = "metadata.extraction.err.type_conversion"; protected static Log logger = LogFactory.getLog(AbstractMappingMetadataExtracter.class); private MetadataExtracterRegistry registry; private MimetypeService mimetypeService; private DictionaryService dictionaryService; private boolean initialized; private Set supportedMimetypes; private OverwritePolicy overwritePolicy; private boolean failOnTypeConversion; private Set supportedDateFormats; private Map> mapping; private boolean inheritDefaultMapping; /** * Default constructor. If this is called, then {@link #isSupported(String)} should * be implemented. This is useful when the list of supported mimetypes is not known * when the instance is constructed. Alternatively, once the set becomes known, call * {@link #setSupportedMimetypes(Collection)}. * * @see #isSupported(String) * @see #setSupportedMimetypes(Collection) */ protected AbstractMappingMetadataExtracter() { this(Collections.emptySet()); } /** * Constructor that can be used when the list of supported mimetypes is known up front. * * @param supportedMimetypes the set of mimetypes supported by default */ protected AbstractMappingMetadataExtracter(Set supportedMimetypes) { this.supportedMimetypes = supportedMimetypes; // Set defaults overwritePolicy = OverwritePolicy.PRAGMATIC; failOnTypeConversion = true; supportedDateFormats = new HashSet(0); mapping = null; // The default will be fetched inheritDefaultMapping = false; // Any overrides are complete initialized = false; } /** * Set the registry to register with. If this is not set, then the default * initialization will not auto-register the extracter for general use. It * can still be used directly. * * @param registry a metadata extracter registry */ public void setRegistry(MetadataExtracterRegistry registry) { this.registry = registry; } /** * @param mimetypeService the mimetype service. Set this if required. */ public void setMimetypeService(MimetypeService mimetypeService) { this.mimetypeService = mimetypeService; } /** * @return Returns the mimetype helper */ protected MimetypeService getMimetypeService() { return mimetypeService; } /** * @param dictionaryService the dictionary service to determine which data conversions are necessary */ public void setDictionaryService(DictionaryService dictionaryService) { this.dictionaryService = dictionaryService; } /** * Set the mimetypes that are supported by the extracter. * * @param supportedMimetypes */ public void setSupportedMimetypes(Collection supportedMimetypes) { this.supportedMimetypes.clear(); this.supportedMimetypes.addAll(supportedMimetypes); } /** * {@inheritDoc} * * @see #setSupportedMimetypes(Collection) */ public boolean isSupported(String sourceMimetype) { return supportedMimetypes.contains(sourceMimetype); } /** * @return Returns 1.0 if the mimetype is supported, otherwise 0.0 * * @see #isSupported(String) */ public double getReliability(String mimetype) { return isSupported(mimetype) ? 1.0D : 0.0D; } /** * Set the policy to use when existing values are encountered. Depending on how the extracer * is called, this may not be relevant, i.e an empty map of existing properties may be passed * in by the client code, which may follow its own overwrite strategy. * * @param overwritePolicy the policy to apply when there are existing system properties */ public void setOverwritePolicy(OverwritePolicy overwritePolicy) { this.overwritePolicy = overwritePolicy; } /** * Set the policy to use when existing values are encountered. Depending on how the extracer * is called, this may not be relevant, i.e an empty map of existing properties may be passed * in by the client code, which may follow its own overwrite strategy. * * @param overwritePolicyStr the policy to apply when there are existing system properties */ public void setOverwritePolicy(String overwritePolicyStr) { this.overwritePolicy = OverwritePolicy.valueOf(overwritePolicyStr); } /** * Set whether the extractor should discard metadata that fails to convert to the target type * defined in the data dictionary model. This is true by default i.e. if the data * extracted is not compatible with the target model then the extraction will fail. If this is * false then any extracted data that fails to convert will be discarded. * * @param failOnTypeConversion false to discard properties that can't get converted * to the dictionary-defined type, or true (default) * to fail the extraction if the type doesn't convert */ public void setFailOnTypeConversion(boolean failOnTypeConversion) { this.failOnTypeConversion = failOnTypeConversion; } /** * Set the date formats, over and above the {@link ISO8601DateFormat ISO8601 format}, that will * be supported for string to date conversions. The supported syntax is described by the * {@link http://java.sun.com/j2se/1.5.0/docs/api/java/text/SimpleDateFormat.html SimpleDateFormat Javadocs}. * * @param supportedDateFormats a list of supported date formats. */ public void setSupportedDateFormats(List supportedDateFormats) { this.supportedDateFormats = new HashSet(5); for (String dateFormatStr : supportedDateFormats) { try { DateFormat df = new SimpleDateFormat(dateFormatStr); this.supportedDateFormats.add(df); } catch (Throwable e) { // No good throw new AlfrescoRuntimeException("Unable to set supported date format: " + dateFormatStr, e); } } } /** * Set if the property mappings augment or override the mapping generically provided by the * extracter implementation. The default is false, i.e. any mapping set completely * replaces the {@link #getDefaultMapping() default mappings}. * * @param inheritDefaultMapping true to add the configured mapping * to the list of default mappings. * * @see #getDefaultMapping() * @see #setMapping(Map) * @see #setMappingProperties(Properties) */ public void setInheritDefaultMapping(boolean inheritDefaultMapping) { this.inheritDefaultMapping = inheritDefaultMapping; } /** * Set the mapping from document metadata to system metadata. It is possible to direct * an extracted document property to several system properties. The conversion between * the document property types and the system property types will be done by the * {@link org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter default converter}. * * @param mapping a mapping from document metadata to system metadata */ public void setMapping(Map> mapping) { this.mapping = mapping; } /** * Set the properties that contain the mapping from document metadata to system metadata. * This is an alternative to the {@link #setMapping(Map)} method. Any mappings already * present will be cleared out. * * The property mapping is of the form: *
     * # Namespaces prefixes
     * namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
     * namespace.prefix.my=http://www....com/alfresco/1.0
     * 
     * # Mapping
     * editor=cm:author, my:editor
     * title=cm:title
     * user1=cm:summary
     * user2=cm:description
     * 
* The mapping can therefore be from a single document property onto several system properties. * * @param mappingProperties the properties that map document properties to system properties */ public void setMappingProperties(Properties mappingProperties) { mapping = readMappingProperties(mappingProperties); } /** * Helper method for derived classes to obtain the mappings that will be applied to raw * values. This should be called after initialization in order to guarantee the complete * map is given. *

* Normally, the list of properties that can be extracted from a document is fixed and * well-known - in that case, just extract everything. But Some implementations may have * an extra, indeterminate set of values available for extraction. If the extraction of * these runtime parameters is expensive, then the keys provided by the return value can * be used to extract values from the documents. The metadata extraction becomes fully * configuration-driven, i.e. declaring further mappings will result in more values being * extracted from the documents. *

* Most extractors will not be using this method. For an example of its use, see the * {@linkplain OpenDocumentMetadataExtracter OpenDocument extractor}, which uses the mapping * to select specific user properties from a document. */ protected final Map> getMapping() { if (!initialized) { throw new UnsupportedOperationException("The complete mapping is only available after initialization."); } return Collections.unmodifiableMap(mapping); } /** * A utility method to read mapping properties from a resource file and convert to the map form. * * @param propertiesUrl A standard Properties file URL location * * @see #setMappingProperties(Properties) */ protected Map> readMappingProperties(String propertiesUrl) { InputStream is = null; try { is = getClass().getClassLoader().getResourceAsStream(propertiesUrl); if(is == null) { throw new AlfrescoRuntimeException( "Metadata Extracter mapping properties not found: \n" + " Extracter: " + this + "\n" + " Bundle: " + propertiesUrl); } Properties props = new Properties(); props.load(is); // Process it Map> map = readMappingProperties(props); // Done if (logger.isDebugEnabled()) { logger.debug("Loaded mapping properties from resource: " + propertiesUrl); } return map; } catch (Throwable e) { throw new AlfrescoRuntimeException( "Unable to load properties file to read extracter mapping properties: \n" + " Extracter: " + this + "\n" + " Bundle: " + propertiesUrl, e); } finally { if (is != null) { try { is.close(); } catch (Throwable e) {} } } } /** * A utility method to convert mapping properties to the Map form. * * @see #setMappingProperties(Properties) */ @SuppressWarnings("unchecked") protected Map> readMappingProperties(Properties mappingProperties) { Map namespacesByPrefix = new HashMap(5); // Get the namespaces for (Map.Entry entry : mappingProperties.entrySet()) { String propertyName = (String) entry.getKey(); if (propertyName.startsWith(NAMESPACE_PROPERTY_PREFIX)) { String prefix = propertyName.substring(17); String namespace = (String) entry.getValue(); namespacesByPrefix.put(prefix, namespace); } } // Create the mapping Map> convertedMapping = new HashMap>(17); for (Map.Entry entry : mappingProperties.entrySet()) { String documentProperty = (String) entry.getKey(); String qnamesStr = (String) entry.getValue(); if (documentProperty.startsWith(NAMESPACE_PROPERTY_PREFIX)) { // Ignore these now continue; } // Create the entry Set qnames = new HashSet(3); convertedMapping.put(documentProperty, qnames); // The to value can be a list of QNames StringTokenizer tokenizer = new StringTokenizer(qnamesStr, ","); while (tokenizer.hasMoreTokens()) { String qnameStr = tokenizer.nextToken().trim(); // Check if we need to resolve a namespace reference int index = qnameStr.indexOf(QName.NAMESPACE_PREFIX); if (index > -1 && qnameStr.charAt(0) != QName.NAMESPACE_BEGIN) { String prefix = qnameStr.substring(0, index); String suffix = qnameStr.substring(index + 1); // It is prefixed String uri = namespacesByPrefix.get(prefix); if (uri == null) { throw new AlfrescoRuntimeException( "No prefix mapping for extracter property mapping: \n" + " Extracter: " + this + "\n" + " Mapping: " + entry); } qnameStr = QName.NAMESPACE_BEGIN + uri + QName.NAMESPACE_END + suffix; } try { QName qname = QName.createQName(qnameStr); // Add it to the mapping qnames.add(qname); } catch (InvalidQNameException e) { throw new AlfrescoRuntimeException( "Can't create metadata extracter property mapping: \n" + " Extracter: " + this + "\n" + " Mapping: " + entry); } } if (logger.isDebugEnabled()) { logger.debug("Added mapping from " + documentProperty + " to " + qnames); } } // Done return convertedMapping; } /** * Registers this instance of the extracter with the registry. This will call the * {@link #init()} method and then register if the registry is available. * * @see #setRegistry(MetadataExtracterRegistry) * @see #init() */ public final void register() { init(); // Register the extracter, if necessary if (registry != null) { registry.register(this); } } /** * Provides a hook point for implementations to perform initialization. The base * implementation must be invoked or the extracter will fail during extraction. * The {@link #getDefaultMapping() default mappings} will be requested during * initialization. */ protected void init() { Map> defaultMapping = getDefaultMapping(); if (defaultMapping == null) { throw new AlfrescoRuntimeException("The metadata extracter must provide a default mapping: " + this); } // Was a mapping explicitly provided if (mapping == null) { // No mapping, so use the default mapping = defaultMapping; } else if (inheritDefaultMapping) { // Merge the default mapping into the configured mapping for (String documentKey : defaultMapping.keySet()) { Set systemQNames = mapping.get(documentKey); if (systemQNames == null) { systemQNames = new HashSet(3); mapping.put(documentKey, systemQNames); } Set defaultQNames = defaultMapping.get(documentKey); systemQNames.addAll(defaultQNames); } } // The configured mappings are empty, but there were default mappings if (mapping.size() == 0 && defaultMapping.size() > 0) { logger.warn( "There are no property mappings for the metadata extracter.\n" + " Nothing will be extracted by: " + this); } // Done initialized = true; } /** {@inheritDoc} */ public long getExtractionTime() { return 1000L; } /** * Checks if the mimetype is supported. * * @param reader the reader to check * @throws AlfrescoRuntimeException if the mimetype is not supported */ protected void checkIsSupported(ContentReader reader) { String mimetype = reader.getMimetype(); if (!isSupported(mimetype)) { throw new AlfrescoRuntimeException( "Metadata extracter does not support mimetype: \n" + " reader: " + reader + "\n" + " supported: " + supportedMimetypes + "\n" + " extracter: " + this); } } /** * {@inheritDoc} */ public final Map extract(ContentReader reader, Map destination) { return extract(reader, this.overwritePolicy, destination, this.mapping); } /** * {@inheritDoc} */ public final Map extract( ContentReader reader, OverwritePolicy overwritePolicy, Map destination) { return extract(reader, overwritePolicy, destination, this.mapping); } /** * {@inheritDoc} */ public Map extract( ContentReader reader, OverwritePolicy overwritePolicy, Map destination, Map> mapping) { // Done if (logger.isDebugEnabled()) { logger.debug("Starting metadata extraction: \n" + " reader: " + reader + "\n" + " extracter: " + this); } if (!initialized) { throw new AlfrescoRuntimeException( "Metadata extracter not initialized.\n" + " Call the 'register' method on: " + this + "\n" + " Implementations of the 'init' method must call the base implementation."); } // check the reliability checkIsSupported(reader); Map changedProperties = null; try { Map rawMetadata = null; // Check that the content has some meat if (reader.getSize() > 0 && reader.exists()) { rawMetadata = extractRaw(reader); } else { rawMetadata = new HashMap(1); } // Convert to system properties (standalone) Map systemProperties = mapRawToSystem(rawMetadata); // Convert the properties according to the dictionary types systemProperties = convertSystemPropertyValues(systemProperties); // Now use the proper overwrite policy changedProperties = overwritePolicy.applyProperties(systemProperties, destination); } catch (Throwable e) { throw new ContentIOException("Metadata extraction failed: \n" + " reader: " + reader, e); } finally { // check that the reader was closed (if used) if (reader.isChannelOpen()) { logger.error("Content reader not closed by metadata extracter: \n" + " reader: " + reader + "\n" + " extracter: " + this); } } // Done if (logger.isDebugEnabled()) { logger.debug("Completed metadata extraction: \n" + " reader: " + reader + "\n" + " extracter: " + this + "\n" + " changed: " + changedProperties); } return changedProperties; } /** * * @param rawMetadata Metadata keyed by document properties * @return Returns the metadata keyed by the system properties */ private Map mapRawToSystem(Map rawMetadata) { Map systemProperties = new HashMap(rawMetadata.size() * 2 + 1); for (Map.Entry entry : rawMetadata.entrySet()) { String documentKey = entry.getKey(); // Check if there is a mapping for this if (!mapping.containsKey(documentKey)) { // No mapping - ignore continue; } Serializable documentValue = entry.getValue(); Set systemQNames = mapping.get(documentKey); for (QName systemQName : systemQNames) { systemProperties.put(systemQName, documentValue); } } // Done if (logger.isDebugEnabled()) { logger.debug( "Converted extracted raw values to system values: \n" + " Raw Properties: " + rawMetadata + "\n" + " System Properties: " + systemProperties); } return systemProperties; } /** * Converts all values according to their dictionary-defined type. This uses the * {@link #setFailOnTypeConversion(boolean) failOnTypeConversion flag} to determine how failures * are handled i.e. if values fail to convert, the process may discard the property. * * @param systemProperties the values keyed to system property names * @return Returns a modified map of properties that have been converted. */ @SuppressWarnings("unchecked") private Map convertSystemPropertyValues(Map systemProperties) { Map convertedProperties = new HashMap(systemProperties.size() + 7); for (Map.Entry entry : systemProperties.entrySet()) { QName propertyQName = entry.getKey(); Serializable propertyValue = entry.getValue(); // Get the property definition PropertyDefinition propertyDef = (dictionaryService == null) ? null : dictionaryService.getProperty(propertyQName); if (propertyDef == null) { // There is nothing in the DD about this so just transfer it convertedProperties.put(propertyQName, propertyValue); continue; } // It is in the DD, so attempt the conversion DataTypeDefinition propertyTypeDef = propertyDef.getDataType(); Serializable convertedPropertyValue = null; try { // Attempt to make any date conversions if (propertyTypeDef.getName().equals(DataTypeDefinition.DATE) || propertyTypeDef.getName().equals(DataTypeDefinition.DATETIME)) { if (propertyValue instanceof Collection) { convertedPropertyValue = (Serializable) makeDates((Collection) propertyValue); } else if (propertyValue instanceof String) { convertedPropertyValue = makeDate((String) propertyValue); } } else { if (propertyValue instanceof Collection) { convertedPropertyValue = (Serializable) DefaultTypeConverter.INSTANCE.convert( propertyTypeDef, (Collection) propertyValue); } else { convertedPropertyValue = (Serializable) DefaultTypeConverter.INSTANCE.convert( propertyTypeDef, propertyValue); } } convertedProperties.put(propertyQName, convertedPropertyValue); } catch (TypeConversionException e) { // Do we just absorb this or is it a problem? if (failOnTypeConversion) { throw AlfrescoRuntimeException.create( e, ERR_TYPE_CONVERSION, this, propertyQName, propertyTypeDef.getName(), propertyValue); } } } // Done return convertedProperties; } /** * Convert a collection of date String to Date objects */ private Collection makeDates(Collection dateStrs) { List dates = new ArrayList(dateStrs.size()); for (String dateStr : dateStrs) { Date date = makeDate(dateStr); dates.add(date); } return dates; } /** * Convert a date String to a Date object */ private Date makeDate(String dateStr) { Date date = null; try { date = DefaultTypeConverter.INSTANCE.convert(Date.class, dateStr); } catch (TypeConversionException e) { // Try one of the other formats for (DateFormat df : this.supportedDateFormats) { try { date = df.parse(dateStr); } catch (ParseException ee) { // Didn't work } } if (date == null) { // Still no luck throw new TypeConversionException("Unable to convert string to date: " + dateStr); } } return date; } /** * Adds a value to the map if it is non-trivial. A value is trivial if *

    *
  • it is null
  • *
  • it is an empty string value after trimming
  • *
  • it is an empty collection
  • *
  • it is an empty array
  • *
* String values are trimmed before being put into the map. * Otherwise, it is up to the extracter to ensure that the value is a Serializable. * It is not appropriate to implicitly convert values in order to make them Serializable * - the best conversion method will depend on the value's specific meaning. * * @param key the destination key * @param value the serializable value * @param destination the map to put values into * @return Returns true if set, otherwise false */ @SuppressWarnings("unchecked") protected boolean putRawValue(String key, Serializable value, Map destination) { if (value == null) { return false; } if (value instanceof String) { String valueStr = ((String) value).trim(); if (valueStr.length() == 0) { return false; } else { // Keep the trimmed value value = valueStr; } } else if (value instanceof Collection) { Collection valueCollection = (Collection) value; if (valueCollection.isEmpty()) { return false; } } else if (value.getClass().isArray()) { if (Array.getLength(value) == 0) { return false; } } // It passed all the tests destination.put(key, value); return true; } /** * Helper method to fetch a clean map into which raw values can be dumped. * * @return Returns an empty map */ protected final Map newRawMap() { return new HashMap(17); } /** * This method provides a best guess of where to store the values extracted * from the documents. The list of properties mapped by default need not * include all properties extracted from the document; just the obvious set of mappings * need be supplied. * Implementations must either provide the default mapping properties in the expected * location or override the method to provide the default mapping. *

* The default implementation looks for the default mapping file in the location * given by the class name and .properties. If the extracter's class is * x.y.z.MyExtracter then the default properties will be picked up at * classpath:/x/y/z/MyExtracter.properties. * Inner classes are supported, but the '$' in the class name is replaced with '-', so * default properties for x.y.z.MyStuff$MyExtracter will be located using * x.y.z.MyStuff-MyExtracter.properties. *

* The default mapping implementation should include thorough Javadocs so that the * system administrators can accurately determine how to best enhance or override the * default mapping. *

* If the default mapping is declared in a properties file other than the one named after * the class, then the {@link #readMappingProperties(String)} method can be used to quickly * generate the return value: *


     *      protected Map<> getDefaultMapping()
     *      {
     *          return readMappingProperties(DEFAULT_MAPPING);
     *      }
     * 
* The map can also be created in code either statically or during the call. * * @return Returns the default, static mapping. It may not be null. * * @see #setInheritDefaultMapping(boolean inherit) */ protected Map> getDefaultMapping() { String className = this.getClass().getName(); // Replace $ className = className.replace('$', '-'); // Replace . className = className.replace('.', '/'); // Append .properties String propertiesUrl = className + ".properties"; // Attempt to load the properties return readMappingProperties(propertiesUrl); } /** * Override to provide the raw extracted metadata values. An extracter should extract * as many of the available properties as is realistically possible. Even if the * {@link #getDefaultMapping() default mapping} doesn't handle all properties, it is * possible for each instance of the extracter to be configured differently and more or * less of the properties may be used in different installations. *

* Raw values must not be trimmed or removed for any reason. Null values and empty * strings are *

    *
  • Null: Removed
  • *
  • Empty String: Passed to the {@link OverwritePolicy}
  • *
  • Non Serializable: Converted to String or fails if that is not possible
  • *
*

* Properties extracted and their meanings and types should be thoroughly described in * the class-level javadocs of the extracter implementation, for example: *

     * editor: - the document editor        -->  cm:author
     * title:  - the document title         -->  cm:title
     * user1:  - the document summary
     * user2:  - the document description   -->  cm:description
     * user3:  -
     * user4:  -
     * 
* * @param reader the document to extract the values from. This stream provided by * the reader must be closed if accessed directly. * @return Returns a map of document property values keyed by property name. * @throws All exception conditions can be handled. * * @see #getDefaultMapping() */ protected abstract Map extractRaw(ContentReader reader) throws Throwable; }