mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-24 17:32:48 +00:00
Fix AR-487: Extraction of raw metadata is now separate from the mapping to system properties.
Part fix AR-357: The OfficeMetadataExtracter has been ported, but needs a few more properties added to the raw set git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@5677 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -0,0 +1,688 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2007 Alfresco Software Limited.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
* As a special exception to the terms and conditions of version 2.0 of
|
||||
* the GPL, you may redistribute this Program in connection with Free/Libre
|
||||
* and Open Source Software ("FLOSS") applications as described in Alfresco's
|
||||
* FLOSS exception. You should have recieved a copy of the text describing
|
||||
* the FLOSS exception, and it is also available here:
|
||||
* http://www.alfresco.com/legal/licensing"
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.Serializable;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.Set;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
import org.alfresco.error.AlfrescoRuntimeException;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.MimetypeService;
|
||||
import org.alfresco.service.namespace.InvalidQNameException;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
* Support class for metadata extracters that support dynamic and config-driven
|
||||
* mapping between extracted values and model properties. Extraction is broken
|
||||
* up into two phases:
|
||||
* <ul>
|
||||
* <li>Extract ALL available metadata from the document.</li>
|
||||
* <li>Translate the metadata into system properties.</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* Migrating an existing extracter to use this class is straightforward:
|
||||
* <ul>
|
||||
* <li>
|
||||
* Construct the extracter providing a default set of supported mimetypes to this
|
||||
* implementation. This can be overwritten with configurations.
|
||||
* </li>
|
||||
* <li>
|
||||
* Implement the {@link #extractRaw(ContentReader)} method. This now returns a raw map of extracted
|
||||
* values keyed by document-specific property names. The <b>trimPut</b> method has
|
||||
* been replaced with an equivalent {@link #putSafeRawValue(String, Object, Map)}.
|
||||
* </li>
|
||||
* <li>
|
||||
* Provide the default mapping of the document-specific properties to system-specific
|
||||
* properties as described by the {@link #getDefaultMapping()} method. The simplest
|
||||
* is to provide the default mapping in a correlated <i>.properties</i> file.
|
||||
* </li>
|
||||
* <li>
|
||||
* Document, in the class-level javadoc, all the available properties that are extracted
|
||||
* along with their approximate meanings. Add to this, the default mappings.
|
||||
* </li>
|
||||
* </ul>
|
||||
*
|
||||
* @see #getDefaultMapping()
|
||||
* @see #extractRaw(ContentReader)
|
||||
* @see #setMapping(Map)
|
||||
*
|
||||
* @since 2.1
|
||||
*
|
||||
* @author Jesper Steen Møller
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
abstract public class AbstractMappingMetadataExtracter implements MetadataExtracter
|
||||
{
|
||||
public static final String NAMESPACE_PROPERTY_PREFIX = "namespace.prefix.";
|
||||
|
||||
protected static Log logger = LogFactory.getLog(AbstractMappingMetadataExtracter.class);
|
||||
|
||||
private MetadataExtracterRegistry registry;
|
||||
private MimetypeService mimetypeService;
|
||||
private long extractionTime;
|
||||
private boolean initialized;
|
||||
|
||||
private Set<String> supportedMimetypes;
|
||||
private OverwritePolicy overwritePolicy;
|
||||
private Map<String, Set<QName>> mapping;
|
||||
private boolean inheritDefaultMapping;
|
||||
|
||||
/**
 * Default constructor. The extracter starts with no supported mimetypes; these can be
 * supplied later using {@link #setSupportedMimetypes(Collection)}.
 */
protected AbstractMappingMetadataExtracter()
{
    this(Collections.<String>emptySet());
}

/**
 * @param supportedMimetypes    the set of mimetypes supported by default.  The set is copied,
 *                              so later changes to it do not affect this extracter.
 *                              A <tt>null</tt> value is treated as an empty set.
 */
protected AbstractMappingMetadataExtracter(Set<String> supportedMimetypes)
{
    // Copy defensively so that this extracter owns a mutable set: the default constructor
    // hands in an immutable empty set, and a caller-supplied set must not be altered when
    // the supported mimetypes are reconfigured later.
    if (supportedMimetypes == null)
    {
        this.supportedMimetypes = new HashSet<String>(0);
    }
    else
    {
        this.supportedMimetypes = new HashSet<String>(supportedMimetypes);
    }
    // Set defaults
    overwritePolicy = OverwritePolicy.PRAGMATIC;
    mapping = null;                     // The default will be fetched
    inheritDefaultMapping = false;      // Any overrides are complete
    initialized = false;
}
|
||||
|
||||
/**
|
||||
* Set the registry to register with. If this is not set, then the default
|
||||
* initialization will not auto-register the extracter for general use. It
|
||||
* can still be used directly.
|
||||
*
|
||||
* @param registry a metadata extracter registry
|
||||
*/
|
||||
public void setRegistry(MetadataExtracterRegistry registry)
|
||||
{
|
||||
this.registry = registry;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param mimetypeService the mimetype service. Set this if required.
|
||||
*/
|
||||
public void setMimetypeService(MimetypeService mimetypeService)
|
||||
{
|
||||
this.mimetypeService = mimetypeService;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the mimetype helper
|
||||
*/
|
||||
protected MimetypeService getMimetypeService()
|
||||
{
|
||||
return mimetypeService;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the mimetypes that are supported by the extracter.
|
||||
*
|
||||
* @param supportedMimetypes
|
||||
*/
|
||||
public void setSupportedMimetypes(Collection<String> supportedMimetypes)
|
||||
{
|
||||
this.supportedMimetypes.clear();
|
||||
this.supportedMimetypes.addAll(supportedMimetypes);
|
||||
}
|
||||
|
||||
/**
|
||||
* @inheritDoc
|
||||
*
|
||||
* @see #setSupportedMimetypes(Collection)
|
||||
*/
|
||||
public boolean isSupported(String sourceMimetype)
|
||||
{
|
||||
return supportedMimetypes.contains(sourceMimetype);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns <code>1.0</code> if the mimetype is supported, otherwise <tt>0.0</tt>
|
||||
*
|
||||
* @see #isSupported(String)
|
||||
*/
|
||||
public double getReliability(String mimetype)
|
||||
{
|
||||
return isSupported(mimetype) ? 1.0D : 0.0D;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param overwritePolicy the policy to apply when there are existing system properties
|
||||
*/
|
||||
public void setOverwritePolicy(OverwritePolicy overwritePolicy)
|
||||
{
|
||||
this.overwritePolicy = overwritePolicy;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set if the property mappings augment or override the mapping generically provided by the
|
||||
* extracter implementation. The default is <tt>false</tt>, i.e. any mapping set completely
|
||||
* replaces the {@link #getDefaultMapping() default mappings}.
|
||||
*
|
||||
* @param inheritDefaultMapping <tt>true</tt> to add the configured mapping
|
||||
* to the list of default mappings.
|
||||
*
|
||||
* @see #getDefaultMapping()
|
||||
* @see #setMapping(Map)
|
||||
* @see #setMappingProperties(Properties)
|
||||
*/
|
||||
public void setInheritDefaultMapping(boolean inheritDefaultMapping)
|
||||
{
|
||||
this.inheritDefaultMapping = inheritDefaultMapping;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the mapping from document metadata to system metadata. It is possible to direct
|
||||
* an extracted document property to several system properties. The conversion between
|
||||
* the document property types and the system property types will be done by the
|
||||
* {@link org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter default converter}.
|
||||
*
|
||||
* @param mapping a mapping from document metadata to system metadata
|
||||
*/
|
||||
public void setMapping(Map<String, Set<QName>> mapping)
|
||||
{
|
||||
this.mapping = mapping;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the properties that contain the mapping from document metadata to system metadata.
|
||||
* This is an alternative to the {@link #setMapping(Map)} method. Any mappings already
|
||||
* present will be cleared out.
|
||||
*
|
||||
* The property mapping is of the form:
|
||||
* <pre>
|
||||
* # Namespaces prefixes
|
||||
* namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
* namespace.prefix.my=http://www....com/alfresco/1.0
|
||||
*
|
||||
* # Mapping
|
||||
* editor=cm:author, my:editor
|
||||
* title=cm:title
|
||||
* user1=cm:summary
|
||||
* user2=cm:description
|
||||
* </pre>
|
||||
* The mapping can therefore be from a single document property onto several system properties.
|
||||
*
|
||||
* @param mappingProperties the properties that map document properties to system properties
|
||||
*/
|
||||
public void setMappingProperties(Properties mappingProperties)
|
||||
{
|
||||
mapping = readMappingProperties(mappingProperties);
|
||||
}
|
||||
|
||||
/**
|
||||
* A utility method to read mapping properties from a resource file and convert to the map form.
|
||||
*
|
||||
* @param propertiesUrl A standard Properties file URL location
|
||||
*
|
||||
* @see #setMappingProperties(Properties)
|
||||
*/
|
||||
protected Map<String, Set<QName>> readMappingProperties(String propertiesUrl)
|
||||
{
|
||||
InputStream is = null;
|
||||
try
|
||||
{
|
||||
is = getClass().getClassLoader().getResourceAsStream(propertiesUrl);
|
||||
if(is == null)
|
||||
{
|
||||
throw new AlfrescoRuntimeException(
|
||||
"Metadata Extracter mapping properties not found: \n" +
|
||||
" Extracter: " + this + "\n" +
|
||||
" Bundle: " + propertiesUrl);
|
||||
}
|
||||
Properties props = new Properties();
|
||||
props.load(is);
|
||||
// Process it
|
||||
Map<String, Set<QName>> map = readMappingProperties(props);
|
||||
// Done
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug("Loaded mapping properties from resource: " + propertiesUrl);
|
||||
}
|
||||
return map;
|
||||
}
|
||||
catch (Throwable e)
|
||||
{
|
||||
throw new AlfrescoRuntimeException(
|
||||
"Unable to load properties file to read extracter mapping properties: \n" +
|
||||
" Extracter: " + this + "\n" +
|
||||
" Bundle: " + propertiesUrl,
|
||||
e);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (is != null)
|
||||
{
|
||||
try { is.close(); } catch (Throwable e) {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A utility method to convert mapping properties to the Map form.
|
||||
*
|
||||
* @see #setMappingProperties(Properties)
|
||||
*/
|
||||
protected Map<String, Set<QName>> readMappingProperties(Properties mappingProperties)
|
||||
{
|
||||
Map<String, String> namespacesByPrefix = new HashMap<String, String>(5);
|
||||
// Get the namespaces
|
||||
for (Map.Entry entry : mappingProperties.entrySet())
|
||||
{
|
||||
String propertyName = (String) entry.getKey();
|
||||
if (propertyName.startsWith("namespace.prefix."))
|
||||
{
|
||||
String prefix = propertyName.substring(17);
|
||||
String namespace = (String) entry.getValue();
|
||||
namespacesByPrefix.put(prefix, namespace);
|
||||
}
|
||||
}
|
||||
// Create the mapping
|
||||
Map<String, Set<QName>> convertedMapping = new HashMap<String, Set<QName>>(17);
|
||||
for (Map.Entry entry : mappingProperties.entrySet())
|
||||
{
|
||||
String documentProperty = (String) entry.getKey();
|
||||
String qnamesStr = (String) entry.getValue();
|
||||
if (documentProperty.startsWith(NAMESPACE_PROPERTY_PREFIX))
|
||||
{
|
||||
// Ignore these now
|
||||
continue;
|
||||
}
|
||||
// Create the entry
|
||||
Set<QName> qnames = new HashSet<QName>(3);
|
||||
convertedMapping.put(documentProperty, qnames);
|
||||
// The to value can be a list of QNames
|
||||
StringTokenizer tokenizer = new StringTokenizer(qnamesStr, ",");
|
||||
while (tokenizer.hasMoreTokens())
|
||||
{
|
||||
String qnameStr = tokenizer.nextToken().trim();
|
||||
int index = qnameStr.indexOf(QName.NAMESPACE_PREFIX);
|
||||
if (index > -1)
|
||||
{
|
||||
String prefix = qnameStr.substring(0, index);
|
||||
String suffix = qnameStr.substring(index + 1);
|
||||
// It is prefixed
|
||||
String uri = namespacesByPrefix.get(prefix);
|
||||
if (uri == null)
|
||||
{
|
||||
throw new AlfrescoRuntimeException(
|
||||
"No prefix mapping for extracter property mapping: \n" +
|
||||
" Extracter: " + this + "\n" +
|
||||
" Mapping: " + entry);
|
||||
}
|
||||
qnameStr = QName.NAMESPACE_BEGIN + uri + QName.NAMESPACE_END + suffix;
|
||||
}
|
||||
try
|
||||
{
|
||||
QName qname = QName.createQName(qnameStr);
|
||||
// Add it to the mapping
|
||||
qnames.add(qname);
|
||||
}
|
||||
catch (InvalidQNameException e)
|
||||
{
|
||||
throw new AlfrescoRuntimeException(
|
||||
"Can't create metadata extracter property mapping: \n" +
|
||||
" Extracter: " + this + "\n" +
|
||||
" Mapping: " + entry);
|
||||
}
|
||||
}
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug("Added mapping from " + documentProperty + " to " + qnames);
|
||||
}
|
||||
}
|
||||
// Done
|
||||
return convertedMapping;
|
||||
}
|
||||
|
||||
/**
|
||||
* Registers this instance of the extracter with the registry. This will call the
|
||||
* {@link #init()} method and then register if the registry is available.
|
||||
*
|
||||
* @see #setRegistry(MetadataExtracterRegistry)
|
||||
* @see #init()
|
||||
*/
|
||||
public final void register()
|
||||
{
|
||||
init();
|
||||
|
||||
// Register the extracter, if necessary
|
||||
if (registry != null)
|
||||
{
|
||||
registry.register(this);
|
||||
}
|
||||
else
|
||||
{
|
||||
logger.warn("No registry provided. Not registering: " + this);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides a hook point for implementations to perform initialization. The base
|
||||
* implementation must be invoked or the extracter will fail during extraction.
|
||||
* The {@link #getDefaultMapping() default mappings} will be requested during
|
||||
* initialization.
|
||||
*/
|
||||
protected void init()
|
||||
{
|
||||
Map<String, Set<QName>> defaultMapping = getDefaultMapping();
|
||||
if (defaultMapping == null)
|
||||
{
|
||||
throw new AlfrescoRuntimeException("The metadata extracter must provide a default mapping: " + this);
|
||||
}
|
||||
|
||||
// Was a mapping explicitly provided
|
||||
if (mapping == null)
|
||||
{
|
||||
// No mapping, so use the default
|
||||
mapping = defaultMapping;
|
||||
}
|
||||
else if (inheritDefaultMapping)
|
||||
{
|
||||
// Merge the default mapping into the configured mapping
|
||||
for (String documentKey : defaultMapping.keySet())
|
||||
{
|
||||
Set<QName> systemQNames = mapping.get(documentKey);
|
||||
if (systemQNames == null)
|
||||
{
|
||||
systemQNames = new HashSet<QName>(3);
|
||||
mapping.put(documentKey, systemQNames);
|
||||
}
|
||||
Set<QName> defaultQNames = defaultMapping.get(documentKey);
|
||||
systemQNames.addAll(defaultQNames);
|
||||
}
|
||||
}
|
||||
|
||||
// Were there any mappings
|
||||
if (mapping.size() == 0)
|
||||
{
|
||||
logger.warn(
|
||||
"There are no property mappings for the metadata extracter.\n" +
|
||||
" Nothing will be extracted by: " + this);
|
||||
}
|
||||
// Done
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
/** @inheritDoc */
|
||||
public long getExtractionTime()
|
||||
{
|
||||
return extractionTime;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the mimetype is supported.
|
||||
*
|
||||
* @param reader the reader to check
|
||||
* @throws AlfrescoRuntimeException if the mimetype is not supported
|
||||
*/
|
||||
protected void checkIsSupported(ContentReader reader)
|
||||
{
|
||||
String mimetype = reader.getMimetype();
|
||||
if (!isSupported(mimetype))
|
||||
{
|
||||
throw new AlfrescoRuntimeException(
|
||||
"Metadata extracter does not support mimetype: \n" +
|
||||
" reader: " + reader + "\n" +
|
||||
" supported: " + supportedMimetypes + "\n" +
|
||||
" extracter: " + this);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @inheritDoc
|
||||
*/
|
||||
public final boolean extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
|
||||
{
|
||||
return extract(reader, this.overwritePolicy, destination, this.mapping);
|
||||
}
|
||||
|
||||
/**
|
||||
* @inheritDoc
|
||||
*/
|
||||
public final boolean extract(
|
||||
ContentReader reader,
|
||||
OverwritePolicy overwritePolicy,
|
||||
Map<QName, Serializable> destination,
|
||||
Map<String, Set<QName>> mapping) throws ContentIOException
|
||||
{
|
||||
// Done
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug("Starting metadata extraction: \n" +
|
||||
" reader: " + reader + "\n" +
|
||||
" extracter: " + this);
|
||||
}
|
||||
|
||||
if (!initialized)
|
||||
{
|
||||
throw new AlfrescoRuntimeException(
|
||||
"Metadata extracter not initialized.\n" +
|
||||
" Call the 'register' method on: " + this + "\n" +
|
||||
" Implementations of the 'init' method must call the base implementation.");
|
||||
}
|
||||
// check the reliability
|
||||
checkIsSupported(reader);
|
||||
|
||||
boolean changed = false;
|
||||
try
|
||||
{
|
||||
Map<String, Serializable> rawMetadata = extractRaw(reader);
|
||||
// Convert to system properties (standalone)
|
||||
Map<QName, Serializable> systemProperties = mapRawToSystem(rawMetadata);
|
||||
// Now use the proper overwrite policy
|
||||
changed = overwritePolicy.applyProperties(systemProperties, destination);
|
||||
}
|
||||
catch (Throwable e)
|
||||
{
|
||||
throw new ContentIOException("Metadata extraction failed: \n" +
|
||||
" reader: " + reader,
|
||||
e);
|
||||
}
|
||||
finally
|
||||
{
|
||||
// check that the reader was closed
|
||||
if (!reader.isClosed())
|
||||
{
|
||||
logger.error("Content reader not closed by metadata extracter: \n" +
|
||||
" reader: " + reader + "\n" +
|
||||
" extracter: " + this);
|
||||
}
|
||||
}
|
||||
|
||||
// Done
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug("Completed metadata extraction: \n" +
|
||||
" reader: " + reader + "\n" +
|
||||
" extracter: " + this + "\n" +
|
||||
" changed: " + changed);
|
||||
}
|
||||
return changed;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param rawMetadata Metadata keyed by document properties
|
||||
* @return Returns the metadata keyed by the system properties
|
||||
*/
|
||||
private Map<QName, Serializable> mapRawToSystem(Map<String, Serializable> rawMetadata)
|
||||
{
|
||||
Map<QName, Serializable> systemProperties = new HashMap<QName, Serializable>(rawMetadata.size() * 2 + 1);
|
||||
for (Map.Entry<String, Serializable> entry : rawMetadata.entrySet())
|
||||
{
|
||||
String documentKey = entry.getKey();
|
||||
// Check if there is a mapping for this
|
||||
if (!mapping.containsKey(documentKey))
|
||||
{
|
||||
// No mapping - ignore
|
||||
continue;
|
||||
}
|
||||
Serializable documentValue = entry.getValue();
|
||||
Set<QName> systemQNames = mapping.get(documentKey);
|
||||
for (QName systemQName : systemQNames)
|
||||
{
|
||||
systemProperties.put(systemQName, documentValue);
|
||||
}
|
||||
}
|
||||
// Done
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug(
|
||||
"Converted extracted raw values to system values: \n" +
|
||||
" Raw Properties: " + rawMetadata + "\n" +
|
||||
" System Properties: " + systemProperties);
|
||||
}
|
||||
return systemProperties;
|
||||
}
|
||||
|
||||
/**
|
||||
* Examines a value or string for nulls and adds it to the map (if non-empty). If the value
|
||||
* is non-Serializable, then the <code>toString</code> representation used directly.
|
||||
*
|
||||
* @param key the destination map key
|
||||
* @param value the value to check and put.
|
||||
* @param destination map to put values into
|
||||
* @return Returns <tt>true</tt> if set, <tt>false</tt> otherwise
|
||||
*/
|
||||
protected boolean putSafeRawValue(String key, Object value, Map<String, Serializable> destination)
|
||||
{
|
||||
if (value == null)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
if (value instanceof String)
|
||||
{
|
||||
String svalue = ((String) value).trim();
|
||||
if (svalue.length() > 0)
|
||||
{
|
||||
destination.put(key, svalue);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
else if (value instanceof Serializable)
|
||||
{
|
||||
destination.put(key, (Serializable) value);
|
||||
}
|
||||
else
|
||||
{
|
||||
destination.put(key, value.toString());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* This method provides a <i>best guess</i> of where to store the values extracted
|
||||
* from the documents. The list of properties mapped by default need <b>not</b>
|
||||
* include all properties extracted from the document; just the obvious set of mappings
|
||||
* need be supplied.
|
||||
* Implementations must either provide the default mapping properties in the expected
|
||||
* location or override the method to provide the default mapping.
|
||||
* <p>
|
||||
* The default implementation looks for the default mapping file in the location
|
||||
* given by the class name and <i>.properties</i>. If the extracter's class is
|
||||
* <b>x.y.z.MyExtracter</b> then the default properties will be picked up at
|
||||
* <b>classpath:/x/y/z/MyExtracter.properties</b>.
|
||||
* Inner classes are supported, but the '$' in the class name is replaced with '-', so
|
||||
* default properties for <b>x.y.z.MyStuff$MyExtracter</b> will be located using
|
||||
* <b>x.y.z.MyStuff-MyExtracter.properties</b>.
|
||||
* <p>
|
||||
* The default mapping implementation should include thorough Javadocs so that the
|
||||
* system administrators can accurately determine how to best enhance or override the
|
||||
* default mapping.
|
||||
* <p>
|
||||
* If the default mapping is declared in a properties file, then the
|
||||
* {@link #readMappingProperties(String)} method can be used to quickly generate the
|
||||
* return value:
|
||||
* <pre>
|
||||
* protected Map<String, Set<QName>> getDefaultMapping()
|
||||
* {
|
||||
* return readMappingProperties(DEFAULT_MAPPING);
|
||||
* }
|
||||
* </pre>
|
||||
* The map can also be created in code either statically or during the call.
|
||||
*
|
||||
* @return Returns the default, static mapping. It may not be null.
|
||||
*
|
||||
* @see #setInheritDefaultMapping(boolean inherit)
|
||||
*/
|
||||
protected Map<String, Set<QName>> getDefaultMapping()
|
||||
{
|
||||
String className = this.getClass().getName();
|
||||
// Replace $
|
||||
className = className.replace('$', '-');
|
||||
// Replace .
|
||||
className = className.replace('.', '/');
|
||||
// Append .properties
|
||||
String propertiesUrl = className + ".properties";
|
||||
// Attempt to load the properties
|
||||
return readMappingProperties(propertiesUrl);
|
||||
}
|
||||
|
||||
/**
 * Override to provide the raw extracted metadata values.  An extracter should extract
 * as many of the available properties as is realistically possible.  Even if the
 * {@link #getDefaultMapping() default mapping} doesn't handle all properties, it is
 * possible for each instance of the extracter to be configured differently and more or
 * less of the properties may be used in different installations.
 * <p>
 * Raw values must not be trimmed or removed for any reason.  Null values, empty
 * strings and non-serializable values are handled as follows:
 * <ul>
 *   <li><b>Null:</b> Removed</li>
 *   <li><b>Empty String:</b> Passed to the {@link OverwritePolicy}</li>
 *   <li><b>Non Serializable:</b> Converted to String or fails if that is not possible</li>
 * </ul>
 * NOTE(review): the handling listed above is not performed by the mapping code in this
 * class, which copies raw values to the system properties unchanged.  The helper
 * {@link #putSafeRawValue(String, Object, Map)} skips nulls, trims strings (skipping empty
 * results) and converts non-serializable values to strings - confirm which behaviour is
 * intended.
 * <p>
 * Properties extracted and their meanings and types should be thoroughly described in
 * the class-level javadocs of the extracter implementation, for example:
 * <pre>
 * <b>editor:</b> - the document editor        -->  cm:author
 * <b>title:</b>  - the document title         -->  cm:title
 * <b>user1:</b>  - the document summary
 * <b>user2:</b>  - the document description   -->  cm:description
 * <b>user3:</b>  -
 * <b>user4:</b>  -
 * </pre>
 *
 * @param reader        the document to extract the values from.  The stream provided by
 *                      the reader must be closed if accessed directly.
 * @return              Returns a map of document property values keyed by property name.
 * @throws Throwable    All exception conditions can be handled: anything thrown is caught
 *                      by the calling <code>extract</code> method and wrapped in a
 *                      <code>ContentIOException</code>.
 *
 * @see #getDefaultMapping()
 */
protected abstract Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable;
|
||||
}
|
@@ -0,0 +1,250 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2007 Alfresco Software Limited.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
* As a special exception to the terms and conditions of version 2.0 of
|
||||
* the GPL, you may redistribute this Program in connection with Free/Libre
|
||||
* and Open Source Software ("FLOSS") applications as described in Alfresco's
|
||||
* FLOSS exception. You should have recieved a copy of the text describing
|
||||
* the FLOSS exception, and it is also available here:
|
||||
* http://www.alfresco.com/legal/licensing"
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.Set;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.repo.content.filestore.FileContentReader;
|
||||
import org.alfresco.repo.content.metadata.MetadataExtracter.OverwritePolicy;
|
||||
import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
|
||||
/**
|
||||
* @see org.alfresco.repo.content.metadata.AbstractMappingMetadataExtracter
|
||||
*
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
public class AbstractMappingMetadataExtracterTest extends TestCase
|
||||
{
|
||||
private DummyMappingMetadataExtracter extracter;
|
||||
private ContentReader reader;
|
||||
private Map<QName, Serializable> destination;
|
||||
|
||||
@Override
|
||||
protected void setUp() throws Exception
|
||||
{
|
||||
extracter = new DummyMappingMetadataExtracter();
|
||||
extracter.register();
|
||||
reader = new FileContentReader(AbstractContentTransformerTest.loadQuickTestFile("txt"));
|
||||
reader.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
|
||||
|
||||
destination = new HashMap<QName, Serializable>(7);
|
||||
destination.put(DummyMappingMetadataExtracter.QNAME_A1, JunkValue.INSTANCE);
|
||||
destination.put(DummyMappingMetadataExtracter.QNAME_A2, "");
|
||||
destination.put(DummyMappingMetadataExtracter.QNAME_B, null);
|
||||
}
|
||||
|
||||
public void testSetUp()
|
||||
{
|
||||
assertNotNull(reader);
|
||||
assertNotNull(extracter);
|
||||
assertTrue("Extracter not initialized.", extracter.initCheck);
|
||||
}
|
||||
|
||||
public void testDefaultExtract() throws Exception
|
||||
{
|
||||
destination.clear();
|
||||
extracter.extract(reader, destination);
|
||||
assertEquals(3, destination.size());
|
||||
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_A1));
|
||||
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_A2));
|
||||
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_B));
|
||||
}
|
||||
|
||||
public void testPropertyMappingOverride() throws Exception
|
||||
{
|
||||
Properties props = new Properties();
|
||||
props.put("namespace.prefix.my", DummyMappingMetadataExtracter.NAMESPACE_MY);
|
||||
props.put(DummyMappingMetadataExtracter.PROP_A, " my:a1, my:a2 ");
|
||||
extracter.setMappingProperties(props);
|
||||
extracter.register();
|
||||
// Only mapped 'a'
|
||||
destination.clear();
|
||||
extracter.extract(reader, destination);
|
||||
assertEquals(2, destination.size());
|
||||
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_A1));
|
||||
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_A2));
|
||||
}
|
||||
|
||||
public void testPropertyMappingMerge() throws Exception
|
||||
{
|
||||
Properties props = new Properties();
|
||||
props.put("namespace.prefix.my", DummyMappingMetadataExtracter.NAMESPACE_MY);
|
||||
props.put(DummyMappingMetadataExtracter.PROP_A, " my:a3 ");
|
||||
extracter.setMappingProperties(props);
|
||||
extracter.setInheritDefaultMapping(true);
|
||||
extracter.register();
|
||||
// Added a3
|
||||
destination.clear();
|
||||
extracter.extract(reader, destination);
|
||||
assertEquals(4, destination.size());
|
||||
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_A1));
|
||||
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_A2));
|
||||
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_A3));
|
||||
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_B));
|
||||
}
|
||||
|
||||
public void testPropertyMappingOverrideExtra() throws Exception
|
||||
{
|
||||
Properties props = new Properties();
|
||||
props.put("namespace.prefix.my", DummyMappingMetadataExtracter.NAMESPACE_MY);
|
||||
props.put(DummyMappingMetadataExtracter.PROP_C, " my:c ");
|
||||
props.put(DummyMappingMetadataExtracter.PROP_D, " my:d ");
|
||||
props.put(DummyMappingMetadataExtracter.PROP_E, " my:e ");
|
||||
extracter.setMappingProperties(props);
|
||||
extracter.register();
|
||||
// Added a3
|
||||
destination.clear();
|
||||
extracter.extract(reader, destination);
|
||||
assertEquals(2, destination.size());
|
||||
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_C));
|
||||
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_D));
|
||||
}
|
||||
|
||||
public void testOverwritePolicyEager()
|
||||
{
|
||||
extracter.setOverwritePolicy(OverwritePolicy.EAGER);
|
||||
extracter.extract(reader, destination);
|
||||
assertEquals(3, destination.size());
|
||||
assertEquals(DummyMappingMetadataExtracter.VALUE_A, destination.get(DummyMappingMetadataExtracter.QNAME_A1));
|
||||
assertEquals(DummyMappingMetadataExtracter.VALUE_A, destination.get(DummyMappingMetadataExtracter.QNAME_A2));
|
||||
assertEquals(DummyMappingMetadataExtracter.VALUE_B, destination.get(DummyMappingMetadataExtracter.QNAME_B));
|
||||
}
|
||||
|
||||
public void testOverwritePolicyPragmatic()
|
||||
{
|
||||
extracter.setOverwritePolicy(OverwritePolicy.PRAGMATIC);
|
||||
extracter.extract(reader, destination);
|
||||
assertEquals(3, destination.size());
|
||||
assertEquals(JunkValue.INSTANCE, destination.get(DummyMappingMetadataExtracter.QNAME_A1));
|
||||
assertEquals(DummyMappingMetadataExtracter.VALUE_A, destination.get(DummyMappingMetadataExtracter.QNAME_A2));
|
||||
assertEquals(DummyMappingMetadataExtracter.VALUE_B, destination.get(DummyMappingMetadataExtracter.QNAME_B));
|
||||
}
|
||||
|
||||
public void testOverwritePolicyCautious()
|
||||
{
|
||||
extracter.setOverwritePolicy(OverwritePolicy.CAUTIOUS);
|
||||
extracter.extract(reader, destination);
|
||||
assertEquals(3, destination.size());
|
||||
assertEquals(JunkValue.INSTANCE, destination.get(DummyMappingMetadataExtracter.QNAME_A1));
|
||||
assertEquals("", destination.get(DummyMappingMetadataExtracter.QNAME_A2));
|
||||
assertEquals(null, destination.get(DummyMappingMetadataExtracter.QNAME_B));
|
||||
}
|
||||
|
||||
/**
|
||||
* A spoofed-up extracter that extracts the following:
|
||||
* <pre>
|
||||
* <b>a:</b> - A --> my:a1, my:a2
|
||||
* <b>b:</b> - B --> my:b
|
||||
* <b>c:</b> - C
|
||||
* <b>d:</b> - D
|
||||
* </pre>
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
public static class DummyMappingMetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
{
|
||||
public static final String PROP_A = "a";
|
||||
public static final String PROP_B = "b";
|
||||
public static final String PROP_C = "c";
|
||||
public static final String PROP_D = "d";
|
||||
public static final String PROP_E = "e";
|
||||
public static final String VALUE_A = "AAA";
|
||||
public static final String VALUE_B = "BBB";
|
||||
public static final String VALUE_C = "CCC";
|
||||
public static final String VALUE_D = "DDD";
|
||||
|
||||
public static final String NAMESPACE_MY = "http://DummyMappingMetadataExtracter";
|
||||
public static final QName QNAME_A1 = QName.createQName(NAMESPACE_MY, "a1");
|
||||
public static final QName QNAME_A2 = QName.createQName(NAMESPACE_MY, "a2");
|
||||
public static final QName QNAME_A3 = QName.createQName(NAMESPACE_MY, "a3");
|
||||
public static final QName QNAME_B = QName.createQName(NAMESPACE_MY, "b");
|
||||
public static final QName QNAME_C = QName.createQName(NAMESPACE_MY, "c");
|
||||
public static final QName QNAME_D = QName.createQName(NAMESPACE_MY, "d");
|
||||
public static final QName QNAME_E = QName.createQName(NAMESPACE_MY, "e"); // not extracted
|
||||
private static final Set<String> MIMETYPES;
|
||||
static
|
||||
{
|
||||
MIMETYPES = new HashSet<String>(5);
|
||||
MIMETYPES.add(MimetypeMap.MIMETYPE_TEXT_PLAIN);
|
||||
MIMETYPES.add(MimetypeMap.MIMETYPE_XML);
|
||||
}
|
||||
|
||||
Map<String, Set<QName>> defaultMapping;
|
||||
private boolean initCheck;
|
||||
|
||||
public DummyMappingMetadataExtracter()
|
||||
{
|
||||
super(MIMETYPES);
|
||||
initCheck = false;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void init()
|
||||
{
|
||||
defaultMapping = new HashMap<String, Set<QName>>(7);
|
||||
defaultMapping.put(PROP_A, new HashSet<QName>(Arrays.asList(QNAME_A1, QNAME_A2)));
|
||||
defaultMapping.put(PROP_B, new HashSet<QName>(Arrays.asList(QNAME_B)));
|
||||
|
||||
initCheck = true;
|
||||
|
||||
super.init();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, Set<QName>> getDefaultMapping()
|
||||
{
|
||||
return defaultMapping;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, Serializable> extractRaw(ContentReader reader)
|
||||
{
|
||||
reader.getContentString();
|
||||
|
||||
Map<String, Serializable> ret = new HashMap<String, Serializable>(7);
|
||||
ret.put(PROP_A, VALUE_A);
|
||||
ret.put(PROP_B, VALUE_B);
|
||||
ret.put(PROP_C, VALUE_C);
|
||||
ret.put(PROP_D, VALUE_D);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
private static class JunkValue implements Serializable
|
||||
{
|
||||
private static final JunkValue INSTANCE = new JunkValue();
|
||||
private static final long serialVersionUID = 1L;
|
||||
}
|
||||
}
|
@@ -38,6 +38,9 @@ import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
* Support class for metadata extracters.
|
||||
*
|
||||
* @deprecated Use the {@link org.alfresco.repo.content.metadata.AbstractMappingMetadataExtracter}
|
||||
*
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
@@ -123,6 +126,18 @@ abstract public class AbstractMetadataExtracter implements MetadataExtracter
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @inheritDoc
|
||||
*
|
||||
* @return Returns <tt>true</tt> if the {@link #getReliability(String) reliability}
|
||||
* is greater than 0
|
||||
*/
|
||||
public boolean isSupported(String mimetype)
|
||||
{
|
||||
double reliability = getReliability(mimetype);
|
||||
return reliability > 0.0;
|
||||
}
|
||||
|
||||
public long getExtractionTime()
|
||||
{
|
||||
return extractionTime;
|
||||
@@ -147,7 +162,10 @@ abstract public class AbstractMetadataExtracter implements MetadataExtracter
|
||||
}
|
||||
}
|
||||
|
||||
public final void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
public boolean extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
|
||||
{
|
||||
// check the reliability
|
||||
checkReliability(reader);
|
||||
@@ -180,14 +198,24 @@ abstract public class AbstractMetadataExtracter implements MetadataExtracter
|
||||
" reader: " + reader + "\n" +
|
||||
" extracter: " + this);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public final void extract(
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @param overwritePolicy ignored
|
||||
* @param propertyMapping ignored
|
||||
*
|
||||
* @see #extract(ContentReader, Map)
|
||||
*/
|
||||
public final boolean extract(
|
||||
ContentReader reader,
|
||||
OverwritePolicy overwritePolicy,
|
||||
Map<QName, Serializable> destination,
|
||||
Map<String, QName> propertyMapping) throws ContentIOException
|
||||
Map<String, Set<QName>> propertyMapping) throws ContentIOException
|
||||
{
|
||||
throw new UnsupportedOperationException();
|
||||
return extract(reader, destination);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -197,12 +225,13 @@ abstract public class AbstractMetadataExtracter implements MetadataExtracter
|
||||
* @param reader the source of the content
|
||||
* @param destination the property map to fill
|
||||
* @throws Throwable an exception
|
||||
*
|
||||
* @deprecated Consider deriving from the more configurable {@link AbstractMappingMetadataExtracter}
|
||||
*/
|
||||
protected abstract void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable;
|
||||
|
||||
/**
|
||||
* Examines a value or string for nulls and adds it to the map (if
|
||||
* non-empty)
|
||||
* Examines a value or string for nulls and adds it to the map (if non-empty)
|
||||
*
|
||||
* @param prop Alfresco's <code>ContentModel.PROP_</code> to set.
|
||||
* @param value Value to set it to
|
||||
|
@@ -26,6 +26,7 @@ package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
@@ -40,16 +41,171 @@ import org.alfresco.service.namespace.QName;
|
||||
public interface MetadataExtracter
|
||||
{
|
||||
/**
|
||||
* Provides the approximate accuracy with which this extracter can extract
|
||||
* metadata for the mimetype.
|
||||
* <p>
|
||||
* An enumeration of functional property overwrite policies.  These determine whether extracted properties are
|
||||
* written into the property map or not.
|
||||
*
|
||||
* @param sourceMimetype the source mimetype
|
||||
* @return Returns a score 0.0 to 1.0. 0.0 indicates that the extraction
|
||||
* cannot be performed at all. 1.0 indicates that the extraction can
|
||||
* be performed perfectly.
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
public double getReliability(String sourceMimetype);
|
||||
public enum OverwritePolicy
|
||||
{
|
||||
/**
|
||||
* This policy puts the new value if:
|
||||
* <ul>
|
||||
* <li>the extracted property is not null</li>
|
||||
* </ul>
|
||||
*/
|
||||
EAGER
|
||||
{
|
||||
@Override
|
||||
public boolean applyProperties(Map<QName, Serializable> extractedProperties, Map<QName, Serializable> targetProperties)
|
||||
{
|
||||
boolean modified = false;
|
||||
for (Map.Entry<QName, Serializable> entry : extractedProperties.entrySet())
|
||||
{
|
||||
QName propertyQName = entry.getKey();
|
||||
Serializable extractedValue = entry.getValue();
|
||||
// Ignore null extracted value
|
||||
if (extractedValue == null)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
targetProperties.put(propertyQName, extractedValue);
|
||||
modified = true;
|
||||
}
|
||||
return modified;
|
||||
}
|
||||
},
|
||||
/**
|
||||
* This policy puts the new value if:
|
||||
* <ul>
|
||||
* <li>the extracted property is not null</li>
|
||||
* <li>there is no target key for the property</li>
|
||||
* <li>the target value is null</li>
|
||||
* <li>the string representation of the target value is an empty string</li>
|
||||
* </ul>
|
||||
*/
|
||||
PRAGMATIC
|
||||
{
|
||||
@Override
|
||||
public boolean applyProperties(Map<QName, Serializable> extractedProperties, Map<QName, Serializable> targetProperties)
|
||||
{
|
||||
/*
|
||||
* Negative and positive checks are mixed in the loop.
|
||||
*/
|
||||
boolean modified = false;
|
||||
for (Map.Entry<QName, Serializable> entry : extractedProperties.entrySet())
|
||||
{
|
||||
QName propertyQName = entry.getKey();
|
||||
Serializable extractedValue = entry.getValue();
|
||||
// Ignore null extracted value
|
||||
if (extractedValue == null)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
// Handle the shortcut cases where the target value is missing or null
|
||||
if (!targetProperties.containsKey(propertyQName))
|
||||
{
|
||||
// There is nothing currently
|
||||
targetProperties.put(propertyQName, extractedValue);
|
||||
modified = true;
|
||||
continue;
|
||||
}
|
||||
Serializable originalValue = targetProperties.get(propertyQName);
|
||||
if (originalValue == null)
|
||||
{
|
||||
// The current value is null
|
||||
targetProperties.put(propertyQName, extractedValue);
|
||||
modified = true;
|
||||
continue;
|
||||
}
|
||||
// Check the string representation
|
||||
if (originalValue instanceof String)
|
||||
{
|
||||
String originalValueStr = (String) originalValue;
|
||||
if (originalValueStr != null && originalValueStr.length() > 0)
|
||||
{
|
||||
// The original value is non-trivial
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
// The original string is trivial
|
||||
targetProperties.put(propertyQName, extractedValue);
|
||||
modified = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// We have some other object as the original value, so keep it
|
||||
}
|
||||
return modified;
|
||||
}
|
||||
},
|
||||
/**
|
||||
* This policy only puts the extracted value if there is no value (null or otherwise) in the properties map.
|
||||
* It is assumed that the mere presence of a property key is enough to inidicate that the target property
|
||||
* is as intented.
|
||||
* This policy puts the new value if:
|
||||
* <ul>
|
||||
* <li>the extracted property is not null</li>
|
||||
* <li>there is no target key for the property</li>
|
||||
* </ul>
|
||||
*/
|
||||
CAUTIOUS
|
||||
{
|
||||
@Override
|
||||
public boolean applyProperties(Map<QName, Serializable> extractedProperties, Map<QName, Serializable> targetProperties)
|
||||
{
|
||||
boolean modified = false;
|
||||
for (Map.Entry<QName, Serializable> entry : extractedProperties.entrySet())
|
||||
{
|
||||
QName propertyQName = entry.getKey();
|
||||
Serializable extractedValue = entry.getValue();
|
||||
// Ignore null extracted value
|
||||
if (extractedValue == null)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
// Is the key present in the target values
|
||||
if (targetProperties.containsKey(propertyQName))
|
||||
{
|
||||
// Cautiously bypass the value as there is one already
|
||||
continue;
|
||||
}
|
||||
targetProperties.put(propertyQName, extractedValue);
|
||||
modified = true;
|
||||
}
|
||||
return modified;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Apply the overwrite policy for the extracted properties.
|
||||
*
|
||||
* @return Returns true if <i>any</i> properties were set on the target properties
|
||||
*/
|
||||
public boolean applyProperties(Map<QName, Serializable> extractedProperties, Map<QName, Serializable> targetProperties)
|
||||
{
|
||||
throw new UnsupportedOperationException("Override this method");
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Get an estimate of the extracter's reliability on a scale from 0.0 to 1.0.
|
||||
*
|
||||
* @param mimetype the mimetype to check
|
||||
* @return Returns a reliability indicator from 0.0 to 1.0
|
||||
*
|
||||
* @deprecated This method is replaced by {@link #isSupported(String)}
|
||||
*/
|
||||
public double getReliability(String mimetype);
|
||||
|
||||
/**
|
||||
* Determines if the extracter works against the given mimetype.
|
||||
*
|
||||
* @param mimetype the document mimetype
|
||||
* @return Returns <tt>true</tt> if the mimetype is supported, otherwise <tt>false</tt>.
|
||||
*/
|
||||
public boolean isSupported(String mimetype);
|
||||
|
||||
/**
|
||||
* Provides an estimate, usually a worst case guess, of how long an
|
||||
@@ -63,41 +219,51 @@ public interface MetadataExtracter
|
||||
public long getExtractionTime();
|
||||
|
||||
/**
|
||||
* Extracts the metadata from the content provided by the reader and source
|
||||
* mimetype to the supplied map.
|
||||
* Extracts the metadata values from the content provided by the reader and source
|
||||
* mimetype to the supplied map. The internal mapping and {@link OverwritePolicy overwrite policy}
|
||||
* between document metadata and system metadata will be used.
|
||||
* <p>
|
||||
* The extraction viability can be determined by an up front call to
|
||||
* {@link #getReliability(String)}.
|
||||
* The extraction viability can be determined by an up front call to {@link #isSupported(String)}.
|
||||
* <p>
|
||||
* The source mimetype <b>must</b> be available on the
|
||||
* {@link org.alfresco.service.cmr.repository.ContentAccessor#getMimetype()} method
|
||||
* of the reader.
|
||||
* <p>
|
||||
* <b>Note:</b> Internally, the extracter may need to perform a mapping of document-specific
|
||||
* properties to <code>QName</code>. This is an implementation detail that is
|
||||
* supported in the default abstract implementations.
|
||||
*
|
||||
* @param reader the source of the content
|
||||
* @param destination the map of properties to populate (essentially a return value)
|
||||
* @return Returns <tt>true</tt> if the destination map was modified
|
||||
* @throws ContentIOException if a detectable error occurs
|
||||
*
|
||||
* @see #extract(ContentReader, Map, Map)
|
||||
* @see #extract(ContentReader, OverwritePolicy, Map, Map)
|
||||
*/
|
||||
public void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException;
|
||||
public boolean extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException;
|
||||
|
||||
/**
|
||||
*
|
||||
* Extracts the metadata from the content provided by the reader and source
|
||||
* mimetype to the supplied map. The mapping from document metadata to system metadata
|
||||
* is explicitly provided.  The {@link OverwritePolicy overwrite policy} is also explicitly
|
||||
* set.
|
||||
* <p>
|
||||
* The extraction viability can be determined by an up front call to
|
||||
* {@link #isSupported(String)}.
|
||||
* <p>
|
||||
* The source mimetype <b>must</b> be available on the
|
||||
* {@link org.alfresco.service.cmr.repository.ContentAccessor#getMimetype()} method
|
||||
* of the reader.
|
||||
*
|
||||
* @param reader the source of the content
|
||||
* @param overwritePolicy the policy stipulating how the system properties must be
|
||||
* overwritten if present
|
||||
* @param destination the map of properties to populate (essentially a return value)
|
||||
* @param propertyMapping a mapping of internal (document-specific properties) to system
|
||||
* properties.
|
||||
* @param mapping a mapping of document-specific properties to system properties.
|
||||
* @return Returns <tt>true</tt> if the destination map was modified
|
||||
* @throws ContentIOException if a detectable error occurs
|
||||
*
|
||||
* @see #extract(ContentReader, Map)
|
||||
*/
|
||||
public void extract(
|
||||
public boolean extract(
|
||||
ContentReader reader,
|
||||
OverwritePolicy overwritePolicy,
|
||||
Map<QName, Serializable> destination,
|
||||
Map<String, QName> propertyMapping) throws ContentIOException;
|
||||
Map<String, Set<QName>> mapping) throws ContentIOException;
|
||||
}
|
||||
|
@@ -28,14 +28,13 @@ import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.apache.poi.hpsf.PropertySet;
|
||||
import org.apache.poi.hpsf.PropertySetFactory;
|
||||
import org.apache.poi.hpsf.SummaryInformation;
|
||||
@@ -47,9 +46,16 @@ import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
|
||||
* Office file format Metadata Extracter
|
||||
*
|
||||
* @author Jesper Steen Møller
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
public class OfficeMetadataExtracter extends AbstractMetadataExtracter
|
||||
public class OfficeMetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
{
|
||||
public static final String PROP_AUTHOR = "author";
|
||||
public static final String PROP_TITLE = "title";
|
||||
public static final String PROP_SUBJECT = "subject";
|
||||
public static final String PROP_CREATE_DATETIME = "createDateTime";
|
||||
public static final String PROP_LAST_SAVE_DATETIME = "lastSaveDateTime";
|
||||
|
||||
public static String[] SUPPORTED_MIMETYPES = new String[] {
|
||||
MimetypeMap.MIMETYPE_WORD,
|
||||
MimetypeMap.MIMETYPE_EXCEL,
|
||||
@@ -57,11 +63,14 @@ public class OfficeMetadataExtracter extends AbstractMetadataExtracter
|
||||
|
||||
public OfficeMetadataExtracter()
|
||||
{
|
||||
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)), 1.0, 1000);
|
||||
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
|
||||
}
|
||||
|
||||
public void extractInternal(ContentReader reader, final Map<QName, Serializable> destination) throws Throwable
|
||||
@Override
|
||||
protected Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
|
||||
{
|
||||
final Map<String, Serializable> rawProperties = new HashMap<String, Serializable>(17);
|
||||
|
||||
POIFSReaderListener readerListener = new POIFSReaderListener()
|
||||
{
|
||||
public void processPOIFSReaderEvent(final POIFSReaderEvent event)
|
||||
@@ -73,14 +82,11 @@ public class OfficeMetadataExtracter extends AbstractMetadataExtracter
|
||||
{
|
||||
SummaryInformation si = (SummaryInformation) ps;
|
||||
|
||||
// Titled aspect
|
||||
trimPut(ContentModel.PROP_TITLE, si.getTitle(), destination);
|
||||
trimPut(ContentModel.PROP_DESCRIPTION, si.getSubject(), destination);
|
||||
|
||||
// Auditable aspect
|
||||
trimPut(ContentModel.PROP_CREATED, si.getCreateDateTime(), destination);
|
||||
trimPut(ContentModel.PROP_MODIFIED, si.getLastSaveDateTime(), destination);
|
||||
trimPut(ContentModel.PROP_AUTHOR, si.getAuthor(), destination);
|
||||
putSafeRawValue(PROP_AUTHOR, si.getAuthor(), rawProperties);
|
||||
putSafeRawValue(PROP_TITLE, si.getTitle(), rawProperties);
|
||||
putSafeRawValue(PROP_SUBJECT, si.getSubject(), rawProperties);
|
||||
putSafeRawValue(PROP_CREATE_DATETIME, si.getCreateDateTime(), rawProperties);
|
||||
putSafeRawValue(PROP_LAST_SAVE_DATETIME, si.getLastSaveDateTime(), rawProperties);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
@@ -105,5 +111,6 @@ public class OfficeMetadataExtracter extends AbstractMetadataExtracter
|
||||
try { is.close(); } catch (IOException e) {}
|
||||
}
|
||||
}
|
||||
return rawProperties;
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,14 @@
|
||||
#
|
||||
# OfficeMetadataExtracter - default mapping
|
||||
#
|
||||
# author: Derek Hulley
|
||||
|
||||
# Namespaces
|
||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
|
||||
# Mappings
|
||||
author=cm:author
|
||||
title=cm:title
|
||||
subject=cm:description
|
||||
createDateTime=cm:created
|
||||
lastSaveDateTime=cm:modified
|
@@ -8,13 +8,14 @@ package org.alfresco.repo.content.metadata;
|
||||
*/
|
||||
public class OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
private MetadataExtracter extracter;
|
||||
private OfficeMetadataExtracter extracter;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception
|
||||
{
|
||||
super.setUp();
|
||||
extracter = new OfficeMetadataExtracter();
|
||||
extracter.register();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@@ -99,7 +99,7 @@ public abstract class AbstractContentTransformerTest extends BaseSpringTest
|
||||
* Helper method to load one of the "The quick brown fox" files from the
|
||||
* classpath.
|
||||
*
|
||||
* @param extension the extension of the file required
|
||||
* @param extension the extension of the file required, e.g. <b>txt</b>
|
||||
* @return Returns a test resource loaded from the classpath or <tt>null</tt> if
|
||||
* no resource could be found.
|
||||
* @throws IOException
|
||||
|
Reference in New Issue
Block a user