/*
 * Copyright (C) 2005-2007 Alfresco Software Limited.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * As a special exception to the terms and conditions of version 2.0 of
 * the GPL, you may redistribute this Program in connection with Free/Libre
 * and Open Source Software ("FLOSS") applications as described in Alfresco's
 * FLOSS exception.  You should have received a copy of the text describing
 * the FLOSS exception, and it is also available here:
 * http://www.alfresco.com/legal/licensing
 */
package org.alfresco.repo.content.metadata;

import java.io.InputStream;
import java.io.Serializable;
import java.lang.reflect.Array;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.StringTokenizer;

import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
import org.alfresco.service.cmr.dictionary.DictionaryService;
import org.alfresco.service.cmr.dictionary.PropertyDefinition;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.MimetypeService;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.cmr.repository.datatype.TypeConversionException;
import org.alfresco.service.namespace.InvalidQNameException;
import org.alfresco.service.namespace.QName;
import org.alfresco.util.ISO8601DateFormat;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Support class for metadata extracters that support dynamic and config-driven
 * mapping between extracted values and model properties.  Extraction is broken
 * up into two phases:
 * <ul>
 *   <li>extract all available values from the document;</li>
 *   <li>translate the extracted values into system model properties.</li>
 * </ul>
 * <p>
 * Migrating an existing extracter to use this class is straightforward:
 */
    /**
     * @return          Returns 1.0 if the mimetype is supported, otherwise 0.0
     * 
     * @see #isSupported(String)
     */
    public double getReliability(String mimetype)
    {
        return isSupported(mimetype) ? 1.0D : 0.0D;
    }
    /**
     * Set the policy to use when existing values are encountered.  Depending on how the extracter
     * is called, this may not be relevant, i.e. an empty map of existing properties may be passed
     * in by the client code, which may follow its own overwrite strategy.
     * 
     * @param overwritePolicy       the policy to apply when there are existing system properties
     */
    public void setOverwritePolicy(OverwritePolicy overwritePolicy)
    {
        this.overwritePolicy = overwritePolicy;
    }
    /**
     * Set the policy to use when existing values are encountered.  Depending on how the extracter
     * is called, this may not be relevant, i.e. an empty map of existing properties may be passed
     * in by the client code, which may follow its own overwrite strategy.
     * 
     * @param overwritePolicyStr    the policy to apply when there are existing system properties
     */
    public void setOverwritePolicy(String overwritePolicyStr)
    {
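        // Note: Enum.valueOf throws an IllegalArgumentException if the given name
        //       does not match one of the declared overwrite policies.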
        this.overwritePolicy = OverwritePolicy.valueOf(overwritePolicyStr);
    }
    /**
     * Set whether the extracter should fail the extraction, or simply discard the value, when
     * extracted metadata cannot be converted to the target type defined in the data dictionary
     * model.  This is true by default, i.e. if the extracted data is not compatible with the
     * target model then the extraction will fail.  If this is false then any extracted data
     * that fails to convert will be discarded.
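     * <p>
     * Usage sketch (the extracter variable is illustrative only):
     * <pre>
     *    // discard unconvertible values instead of failing the whole extraction
     *    extracter.setFailOnTypeConversion(false);
     * </pre>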
     * 
     * @param failOnTypeConversion      false to discard properties that can't get converted
     *                                  to the dictionary-defined type, or true (default)
     *                                  to fail the extraction if the type doesn't convert
     */
    public void setFailOnTypeConversion(boolean failOnTypeConversion)
    {
        this.failOnTypeConversion = failOnTypeConversion;
    }
    /**
     * Set the date formats, over and above the {@link ISO8601DateFormat ISO8601 format}, that will
     * be supported for string to date conversions.  The supported syntax is described by the
     * <a href="http://java.sun.com/j2se/1.5.0/docs/api/java/text/SimpleDateFormat.html">SimpleDateFormat Javadocs</a>.
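     * <p>
     * For example (illustrative patterns only; any pattern accepted by SimpleDateFormat may be used):
     * <pre>
     *    yyyy-MM-dd
     *    dd/MM/yyyy HH:mm:ss
     *    EEE, d MMM yyyy HH:mm:ss Z
     * </pre>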
     * 
     * @param supportedDateFormats      a list of supported date formats.
     */
    public void setSupportedDateFormats(List<String> supportedDateFormats)
    {
        this.supportedDateFormats = supportedDateFormats;
    }
    
    /**
     * Set the mapping from document properties to system properties.  The mapping is
     * expressed as a {@link Properties} instance of the form:
     * <pre>
     * # Namespace prefixes
     * namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
     * namespace.prefix.my=http://www....com/alfresco/1.0
     * 
     * # Mapping
     * editor=cm:author, my:editor
     * title=cm:title
     * user1=cm:summary
     * user2=cm:description
     * </pre>
     * The mapping can therefore be from a single document property onto several system properties.
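     * <p>
     * The same mapping can also be supplied programmatically (usage sketch; the extracter
     * variable and the values shown are illustrative):
     * <pre>
     *    Properties props = new Properties();
     *    props.setProperty("namespace.prefix.cm", "http://www.alfresco.org/model/content/1.0");
     *    props.setProperty("editor", "cm:author");
     *    extracter.setMappingProperties(props);
     * </pre>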
     * 
     * @param mappingProperties     the properties that map document properties to system properties
     */
    public void setMappingProperties(Properties mappingProperties)
    {
        mapping = readMappingProperties(mappingProperties);
    }
    
    /**
     * Helper method for derived classes to obtain the mappings that will be applied to raw
     * values.  This should be called after initialization in order to guarantee the complete
     * map is given.
     * <p>
     * Normally, the list of properties that can be extracted from a document is fixed and
     * well-known - in that case, just extract everything.  But some implementations may have
     * an extra, indeterminate set of values available for extraction.  If the extraction of
     * these runtime parameters is expensive, then the keys provided by the return value can
     * be used to extract values from the documents.  The metadata extraction becomes fully
     * configuration-driven, i.e. declaring further mappings will result in more values being
     * extracted from the documents.
     * <p>
     * Most extracters will not use this method.  For an example of its use, see the
     * {@linkplain OpenDocumentMetadataExtracter OpenDocument extractor}, which uses the mapping
     * to select specific user properties from a document.
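     * <p>
     * A configuration-driven extraction might look like this (sketch only; the
     * readDocumentProperty helper is hypothetical):
     * <pre>
     *    Map<String, Serializable> raw = new HashMap<String, Serializable>();
     *    for (String documentKey : getMapping().keySet())
     *    {
     *        // readDocumentProperty is a hypothetical, extracter-specific helper
     *        Serializable value = readDocumentProperty(reader, documentKey);
     *        putRawValue(documentKey, value, raw);
     *    }
     * </pre>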
     */
    protected final Map<String, Set<QName>> getMapping()
    {
        return Collections.unmodifiableMap(mapping);
    }
    
    /**
     * The default implementation looks for the default mapping file in the location
     * given by the class name and .properties.  If the extracter's class is
     * x.y.z.MyExtracter then the default properties will be picked up at
     * classpath:/x/y/z/MyExtracter.properties.
     * Inner classes are supported, but the '$' in the class name is replaced with '-', so
     * default properties for x.y.z.MyStuff$MyExtracter will be located using
     * x.y.z.MyStuff-MyExtracter.properties.
     *  
     * The default mapping implementation should include thorough Javadocs so that the
     * system administrators can accurately determine how to best enhance or override the
     * default mapping.
     *  
     * If the default mapping is declared in a properties file other than the one named after
     * the class, then the {@link #readMappingProperties(String)} method can be used to quickly
     * generate the return value:
     *  
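     * For example (an illustrative sketch; the resource location shown is hypothetical):
     * <pre>
     *      protected Map<String, Set<QName>> getDefaultMapping()
     *      {
     *          return readMappingProperties("x/y/z/MyExtracter.properties");
     *      }
     * </pre>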
     * The map can also be created in code either statically or during the call.
     * 
     * @return              Returns the default, static mapping.  It may not be null.
     * 
     * @see #setInheritDefaultMapping(boolean inherit)
     */
    protected Map<String, Set<QName>> getDefaultMapping()
    
    /**
     * String values are trimmed before being put into the map.
     * Otherwise, it is up to the extracter to ensure that the value is a Serializable.
     * It is not appropriate to implicitly convert values in order to make them Serializable
     * - the best conversion method will depend on the value's specific meaning.
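     * <p>
     * For example (illustrative; the rawProperties map is whatever map the extracter is building):
     * <pre>
     *    putRawValue("title", "  My Document  ", rawProperties);   // stored as "My Document"
     * </pre>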
     * 
     * @param key           the destination key
     * @param value         the serializable value
     * @param destination   the map to put values into
     * @return              Returns true if set, otherwise false
     */
    @SuppressWarnings("unchecked")
    protected boolean putRawValue(String key, Serializable value, Map<String, Serializable> destination)
    
    /**
     * Raw values must not be trimmed or removed for any reason.  Null values and empty
     * strings are dealt with by {@link #putRawValue(String, Serializable, Map)}.
     * 
     * Properties extracted and their meanings and types should be thoroughly described in
     * the class-level javadocs of the extracter implementation, for example:
     * <pre>
     * editor: - the document editor        -->  cm:author
     * title:  - the document title         -->  cm:title
     * user1:  - the document summary       -->  cm:summary
     * user2:  - the document description   -->  cm:description
     * user3:  -
     * user4:  -
     * </pre>
     * 
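     * A minimal implementation might look like this (sketch only; MyDocumentParser and its
     * accessors are hypothetical):
     * <pre>
     *    protected Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
     *    {
     *        Map<String, Serializable> rawProperties = new HashMap<String, Serializable>();
     *        InputStream is = reader.getContentInputStream();
     *        try
     *        {
     *            MyDocumentParser parser = new MyDocumentParser(is);
     *            putRawValue("editor", parser.getEditor(), rawProperties);
     *            putRawValue("title", parser.getTitle(), rawProperties);
     *        }
     *        finally
     *        {
     *            try { is.close(); } catch (Throwable e) {}
     *        }
     *        return rawProperties;
     *    }
     * </pre>
     * 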
     * @param reader        the document to extract the values from.  The stream provided by
     *                      the reader must be closed if accessed directly.
     * @return              Returns a map of document property values keyed by property name.
     * @throws Throwable    All exception conditions can be handled.
     * 
     * @see #getDefaultMapping()
     */
    protected abstract Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable;