/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
 * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.metadata;
import java.io.InputStream;
import java.io.Serializable;
import java.lang.reflect.Array;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.StringTokenizer;
import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
import org.alfresco.service.cmr.dictionary.DictionaryService;
import org.alfresco.service.cmr.dictionary.PropertyDefinition;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.MimetypeService;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.cmr.repository.datatype.TypeConversionException;
import org.alfresco.service.namespace.InvalidQNameException;
import org.alfresco.service.namespace.QName;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.xmlbeans.impl.xb.xsdschema.All;
import org.springframework.extensions.surf.util.ISO8601DateFormat;
/**
 * Support class for metadata extracters that support dynamic and config-driven
 * mapping between extracted values and model properties.  Extraction is broken
 * up into two phases:
 * <ul>
 *   <li>Extract ALL available metadata from the document.</li>
 *   <li>Translate the metadata into system properties.</li>
 * </ul>
 * <p>
 * Migrating an existing extracter to use this class is straightforward:
 * <ul>
 *   <li>
 *   Construct the extracter providing a default set of supported mimetypes to this
 *   implementation.  This can be overwritten with configurations.
 *   </li>
 *   <li>
 *   Implement the {@link #extractInternal} method.  This now returns a raw map of extracted
 *   values keyed by document-specific property names.  The <code>trimPut</code> method has
 *   been replaced with an equivalent {@link #putRawValue(String, Serializable, Map)}.
 *   </li>
 *   <li>
 *   Provide the default mapping of the document-specific properties to system-specific
 *   properties as described by the {@link #getDefaultMapping()} method.  The simplest
 *   is to provide the default mapping in a correlated <i>.properties</i> file.
 *   </li>
 *   <li>
 *   Document, in the class-level javadoc, all the available properties that are extracted
 *   along with their approximate meanings.  Add to this, the default mappings.
 *   </li>
 * </ul>
 *
 * @see #getDefaultMapping()
 * @see #extractRaw(ContentReader)
 * @see #setMapping(Map)
 *
 * @since 2.1
 *
 * @author Jesper Steen Møller
 * @author Derek Hulley
 */
abstract public class AbstractMappingMetadataExtracter implements MetadataExtracter
{
public static final String NAMESPACE_PROPERTY_PREFIX = "namespace.prefix.";
private static final String ERR_TYPE_CONVERSION = "metadata.extraction.err.type_conversion";
protected static Log logger = LogFactory.getLog(AbstractMappingMetadataExtracter.class);
private MetadataExtracterRegistry registry;
private MimetypeService mimetypeService;
private DictionaryService dictionaryService;
private boolean initialized;
private Set supportedMimetypes;
private OverwritePolicy overwritePolicy;
private boolean failOnTypeConversion;
protected Set supportedDateFormats = new HashSet(0);
private Map> mapping;
private boolean inheritDefaultMapping;
/**
 * Default constructor.  If this is called, then {@link #isSupported(String)} should
 * be implemented.  This is useful when the list of supported mimetypes is not known
 * when the instance is constructed.  Alternatively, once the set becomes known, call
 * {@link #setSupportedMimetypes(Collection)}.
 *
 * @see #isSupported(String)
 * @see #setSupportedMimetypes(Collection)
 */
protected AbstractMappingMetadataExtracter()
{
    // Use a mutable set: Collections.emptySet() is immutable, so a later call to
    // setSupportedMimetypes(...) (which mutates the set) would throw
    // UnsupportedOperationException.
    this(new HashSet<String>());
}
/**
 * Constructor that can be used when the list of supported mimetypes is known up front.
 *
 * @param supportedMimetypes    the set of mimetypes supported by default; must be
 *                              mutable if {@link #setSupportedMimetypes(Collection)}
 *                              will be called later
 */
protected AbstractMappingMetadataExtracter(Set<String> supportedMimetypes)
{
    this.supportedMimetypes = supportedMimetypes;
    // Set defaults
    overwritePolicy = OverwritePolicy.PRAGMATIC;
    failOnTypeConversion = true;
    mapping = null;                 // The default will be fetched lazily
    inheritDefaultMapping = false;  // Any overrides are complete
    initialized = false;
}
/**
 * Supply the registry that this extracter should register itself with.  Without a
 * registry, default initialization will not auto-register the extracter for general
 * use, although it can still be invoked directly.
 *
 * @param registry a metadata extracter registry
 */
public void setRegistry(MetadataExtracterRegistry registry)
{
    this.registry = registry;
}
/**
 * Inject the mimetype service.  Only required by some subclasses.
 *
 * @param mimetypeService the mimetype service
 */
public void setMimetypeService(MimetypeService mimetypeService)
{
    this.mimetypeService = mimetypeService;
}
/**
 * @return the mimetype helper injected via {@link #setMimetypeService(MimetypeService)},
 *         or <tt>null</tt> if none was set
 */
protected MimetypeService getMimetypeService()
{
    return this.mimetypeService;
}
/**
 * Inject the dictionary service used to determine which data conversions are necessary.
 *
 * @param dictionaryService the dictionary service
 */
public void setDictionaryService(DictionaryService dictionaryService)
{
    this.dictionaryService = dictionaryService;
}
/**
 * Set the mimetypes that are supported by the extracter, replacing any previously
 * supported set.
 *
 * @param supportedMimetypes    the mimetypes to support; the collection is copied,
 *                              so later changes to it do not affect this extracter
 */
public void setSupportedMimetypes(Collection<String> supportedMimetypes)
{
    // Replace with a defensive copy rather than clear()/addAll(): the current field
    // value may be an immutable set (e.g. from the default constructor), on which
    // addAll() would throw UnsupportedOperationException.
    this.supportedMimetypes = new HashSet<String>(supportedMimetypes);
}
/**
 * {@inheritDoc}
 *
 * Default check: membership of the configured mimetype set.
 *
 * @see #setSupportedMimetypes(Collection)
 */
public boolean isSupported(String sourceMimetype)
{
    return this.supportedMimetypes.contains(sourceMimetype);
}
/**
 * TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
 *
 * @return 1.0 if the mimetype is supported, otherwise 0.0
 *
 * @see #isSupported(String)
 */
public double getReliability(String mimetype)
{
    if (isSupported(mimetype))
    {
        return 1.0D;
    }
    return 0.0D;
}
/**
 * Set the policy to use when existing values are encountered.  Depending on how the
 * extractor is called, this may not be relevant, i.e. an empty map of existing
 * properties may be passed in by the client code, which may follow its own overwrite
 * strategy.
 *
 * @param overwritePolicy the policy to apply when there are existing system properties
 */
public void setOverwritePolicy(OverwritePolicy overwritePolicy)
{
    this.overwritePolicy = overwritePolicy;
}
/**
 * String-based variant of {@link #setOverwritePolicy(OverwritePolicy)} for use in
 * configuration.  The string must exactly match an {@link OverwritePolicy} constant
 * name; otherwise {@link Enum#valueOf(Class, String)} throws
 * {@link IllegalArgumentException}.
 *
 * @param overwritePolicyStr the name of the policy to apply when there are existing
 *                           system properties
 */
public void setOverwritePolicy(String overwritePolicyStr)
{
    this.overwritePolicy = OverwritePolicy.valueOf(overwritePolicyStr);
}
/**
 * Control what happens when extracted metadata fails to convert to the target type
 * defined in the data dictionary model.  By default (<tt>true</tt>) incompatible data
 * fails the whole extraction; when <tt>false</tt>, any value that fails to convert is
 * silently discarded.
 *
 * @param failOnTypeConversion  <tt>false</tt> to discard properties that can't be
 *                              converted to the dictionary-defined type, or
 *                              <tt>true</tt> (default) to fail the extraction
 */
public void setFailOnTypeConversion(boolean failOnTypeConversion)
{
    this.failOnTypeConversion = failOnTypeConversion;
}
/**
 * Set the date formats, over and above the {@link ISO8601DateFormat ISO8601 format}, that will
 * be supported for string to date conversions.  The supported pattern syntax is described
 * by {@link java.text.SimpleDateFormat}.
 *
 * @param supportedDateFormats  a list of supported date format patterns
 *
 * @throws AlfrescoRuntimeException if any entry is not a valid
 *         <code>SimpleDateFormat</code> pattern
 */
public void setSupportedDateFormats(List<String> supportedDateFormats)
{
    this.supportedDateFormats = new HashSet<DateFormat>(5);
    for (String dateFormatStr : supportedDateFormats)
    {
        try
        {
            // Regional (default-locale) date format
            DateFormat df = new SimpleDateFormat(dateFormatStr);
            this.supportedDateFormats.add(df);
            /*
             * Date format can be locale specific - make sure English format always works.
             *
             * TODO MER 25 May 2010 - Added this as a quick fix for IMAP date parsing which is always
             * English regardless of Locale. Some more thought and/or code is required to configure
             * the relationship between properties, format and locale.
             */
            DateFormat englishFormat = new SimpleDateFormat(dateFormatStr, Locale.US);
            this.supportedDateFormats.add(englishFormat);
        }
        catch (RuntimeException e)
        {
            // Invalid pattern (IllegalArgumentException) or null entry (NullPointerException).
            // Deliberately not catching Throwable: Errors should propagate unwrapped.
            throw new AlfrescoRuntimeException("Unable to set supported date format: " + dateFormatStr, e);
        }
    }
}
/**
 * Choose whether configured property mappings augment or override the mapping
 * generically provided by the extracter implementation.  The default is
 * <tt>false</tt>, i.e. any mapping set completely replaces the
 * {@link #getDefaultMapping() default mappings}.
 *
 * @param inheritDefaultMapping <tt>true</tt> to add the configured mapping
 *                              to the list of default mappings
 *
 * @see #getDefaultMapping()
 * @see #setMapping(Map)
 * @see #setMappingProperties(Properties)
 */
public void setInheritDefaultMapping(boolean inheritDefaultMapping)
{
    this.inheritDefaultMapping = inheritDefaultMapping;
}
/**
* Set the mapping from document metadata to system metadata. It is possible to direct
* an extracted document property to several system properties. The conversion between
* the document property types and the system property types will be done by the
* {@link org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter default converter}.
*
* @param mapping a mapping from document metadata to system metadata
*/
public void setMapping(Map> mapping)
{
this.mapping = mapping;
}
/**
 * Set the properties that contain the mapping from document metadata to system metadata.
 * This is an alternative to the {@link #setMapping(Map)} method.  Any mappings already
 * present will be cleared out.
 * <p>
 * Keys of the form <code>namespace.prefix.&lt;prefix&gt;=&lt;uri&gt;</code> declare
 * namespace prefixes; the remaining keys map a document property to one or more
 * prefixed system property names.  A single document property can therefore be mapped
 * onto several system properties.
 *
 * @param mappingProperties the properties that map document properties to system properties
 */
public void setMappingProperties(Properties mappingProperties)
{
    this.mapping = readMappingProperties(mappingProperties);
}
/**
* Helper method for derived classes to obtain the mappings that will be applied to raw
* values. This should be called after initialization in order to guarantee the complete
* map is given.
*
* Normally, the list of properties that can be extracted from a document is fixed and
* well-known - in that case, just extract everything. But Some implementations may have
* an extra, indeterminate set of values available for extraction. If the extraction of
* these runtime parameters is expensive, then the keys provided by the return value can
* be used to extract values from the documents. The metadata extraction becomes fully
* configuration-driven, i.e. declaring further mappings will result in more values being
* extracted from the documents.
*
* Most extractors will not be using this method. For an example of its use, see the
* {@linkplain OpenDocumentMetadataExtracter OpenDocument extractor}, which uses the mapping
* to select specific user properties from a document.
*/
protected final Map> getMapping()
{
if (!initialized)
{
throw new UnsupportedOperationException("The complete mapping is only available after initialization.");
}
return Collections.unmodifiableMap(mapping);
}
/**
* A utility method to read mapping properties from a resource file and convert to the map form.
*
* @param propertiesUrl A standard Properties file URL location
*
* @see #setMappingProperties(Properties)
*/
protected Map> readMappingProperties(String propertiesUrl)
{
InputStream is = null;
try
{
is = getClass().getClassLoader().getResourceAsStream(propertiesUrl);
if(is == null)
{
throw new AlfrescoRuntimeException(
"Metadata Extracter mapping properties not found: \n" +
" Extracter: " + this + "\n" +
" Bundle: " + propertiesUrl);
}
Properties props = new Properties();
props.load(is);
// Process it
Map> map = readMappingProperties(props);
// Done
if (logger.isDebugEnabled())
{
logger.debug("Loaded mapping properties from resource: " + propertiesUrl);
}
return map;
}
catch (Throwable e)
{
throw new AlfrescoRuntimeException(
"Unable to load properties file to read extracter mapping properties: \n" +
" Extracter: " + this + "\n" +
" Bundle: " + propertiesUrl,
e);
}
finally
{
if (is != null)
{
try { is.close(); } catch (Throwable e) {}
}
}
}
/**
* A utility method to convert mapping properties to the Map form.
*
* @see #setMappingProperties(Properties)
*/
protected Map> readMappingProperties(Properties mappingProperties)
{
Map namespacesByPrefix = new HashMap(5);
// Get the namespaces
for (Map.Entry