Fix AR-487: Extraction of raw metadata is now separate from the mapping to system properties.

Part fix AR-357: The OfficeMetadataExtracter has been ported, but needs a few more properties added to the raw set


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@5677 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Derek Hulley
2007-05-15 08:48:07 +00:00
parent e637299ed8
commit 0e51d23b29
8 changed files with 1199 additions and 44 deletions

View File

@@ -0,0 +1,688 @@
/*
* Copyright (C) 2005-2007 Alfresco Software Limited.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* As a special exception to the terms and conditions of version 2.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* and Open Source Software ("FLOSS") applications as described in Alfresco's
* FLOSS exception. You should have recieved a copy of the text describing
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing"
*/
package org.alfresco.repo.content.metadata;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.StringTokenizer;
import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.MimetypeService;
import org.alfresco.service.namespace.InvalidQNameException;
import org.alfresco.service.namespace.QName;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* Support class for metadata extracters that support dynamic and config-driven
* mapping between extracted values and model properties. Extraction is broken
* up into two phases:
* <ul>
* <li>Extract ALL available metadata from the document.</li>
* <li>Translate the metadata into system properties.</li>
* </ul>
* <p>
* Migrating an existing extracter to use this class is straightforward:
* <ul>
* <li>
* Construct the extracter providing a default set of supported mimetypes to this
* implementation. This can be overwritten with configurations.
* </li>
* <li>
* Implement the {@link #extractRaw(ContentReader)} method. This now returns a raw map of extracted
* values keyed by document-specific property names. The <b>trimPut</b> method has
* been replaced with an equivalent {@link #putSafeRawValue(String, Object, Map)}.
* </li>
* <li>
* Provide the default mapping of the document-specific properties to system-specific
* properties as described by the {@link #getDefaultMapping()} method. The simplest
* is to provide the default mapping in a correlated <i>.properties</i> file.
* </li>
* <li>
* Document, in the class-level javadoc, all the available properties that are extracted
* along with their approximate meanings. Add to this, the default mappings.
* </li>
* </ul>
*
* @see #getDefaultMapping()
* @see #extractRaw(ContentReader)
* @see #setMapping(Map)
*
* @since 2.1
*
* @author Jesper Steen Møller
* @author Derek Hulley
*/
abstract public class AbstractMappingMetadataExtracter implements MetadataExtracter
{
public static final String NAMESPACE_PROPERTY_PREFIX = "namespace.prefix.";
protected static Log logger = LogFactory.getLog(AbstractMappingMetadataExtracter.class);
private MetadataExtracterRegistry registry;
private MimetypeService mimetypeService;
private long extractionTime;
private boolean initialized;
private Set<String> supportedMimetypes;
private OverwritePolicy overwritePolicy;
private Map<String, Set<QName>> mapping;
private boolean inheritDefaultMapping;
/**
* Default constructor. Delegates to {@link #AbstractMappingMetadataExtracter(Set)} with an
* empty set, i.e. no mimetypes are supported by default.
*/
protected AbstractMappingMetadataExtracter()
{
this(Collections.<String>emptySet());
}
/**
 * Constructs the extracter with its default set of supported mimetypes and
 * the default configuration: {@link OverwritePolicy#PRAGMATIC}, no explicit
 * mapping (the default mapping is fetched during {@link #init()}) and no
 * inheritance of the default mapping.
 * <p>
 * The supplied set is copied. Subclasses commonly pass a shared static set
 * (or an immutable one), and {@link #setSupportedMimetypes(Collection)} must be
 * able to change this instance's mimetypes without touching the caller's set.
 *
 * @param supportedMimetypes    the set of mimetypes supported by default;
 *                              <tt>null</tt> is treated as an empty set
 */
protected AbstractMappingMetadataExtracter(Set<String> supportedMimetypes)
{
    // Private, mutable copy: never alias a caller-owned or immutable set
    this.supportedMimetypes = new HashSet<String>();
    if (supportedMimetypes != null)
    {
        this.supportedMimetypes.addAll(supportedMimetypes);
    }
    // Set defaults
    overwritePolicy = OverwritePolicy.PRAGMATIC;
    mapping = null;                     // The default will be fetched
    inheritDefaultMapping = false;      // Any overrides are complete
    initialized = false;
}
/**
* Set the registry to register with. If this is not set, then {@link #register()}
* still initializes the extracter but logs a warning and does not register it for
* general use. It can still be used directly.
*
* @param registry a metadata extracter registry
*/
public void setRegistry(MetadataExtracterRegistry registry)
{
this.registry = registry;
}
/**
* Set the mimetype service made available to implementations via
* {@link #getMimetypeService()}. This class does not use it itself.
*
* @param mimetypeService the mimetype service. Set this if required.
*/
public void setMimetypeService(MimetypeService mimetypeService)
{
this.mimetypeService = mimetypeService;
}
/**
* @return Returns the mimetype helper, or <tt>null</tt> if none was
* {@link #setMimetypeService(MimetypeService) set}
*/
protected MimetypeService getMimetypeService()
{
return mimetypeService;
}
/**
 * Set the mimetypes that are supported by the extracter. Any previously
 * supported mimetypes, including the defaults given at construction, are
 * replaced.
 * <p>
 * The values are copied into a new set rather than written into the existing
 * one: the existing set may be immutable (the no-argument constructor passes
 * {@link Collections#emptySet()}) or shared with other instances (a static set
 * passed by a subclass), in which case modifying it in place would either fail
 * or leak this configuration into other extracters.
 *
 * @param supportedMimetypes    the complete collection of mimetypes to support
 */
public void setSupportedMimetypes(Collection<String> supportedMimetypes)
{
    this.supportedMimetypes = new HashSet<String>(supportedMimetypes);
}
/**
* {@inheritDoc}
* <p>
* A mimetype is supported if it is contained in the set of supported mimetypes.
*
* @see #setSupportedMimetypes(Collection)
*/
public boolean isSupported(String sourceMimetype)
{
return supportedMimetypes.contains(sourceMimetype);
}
/**
 * Reliability is all-or-nothing for this class of extracter: it is derived
 * purely from whether the mimetype is supported.
 *
 * @return Returns <tt>1.0</tt> if the mimetype is supported, otherwise <tt>0.0</tt>
 *
 * @see #isSupported(String)
 */
public double getReliability(String mimetype)
{
    if (isSupported(mimetype))
    {
        return 1.0D;
    }
    return 0.0D;
}
/**
* Set the policy used by {@link #extract(ContentReader, Map)}. The constructor
* defaults this to {@link OverwritePolicy#PRAGMATIC}.
*
* @param overwritePolicy the policy to apply when there are existing system properties
*/
public void setOverwritePolicy(OverwritePolicy overwritePolicy)
{
this.overwritePolicy = overwritePolicy;
}
/**
* Set if the property mappings augment or override the mapping generically provided by the
* extracter implementation. The default is <tt>false</tt>, i.e. any mapping set completely
* replaces the {@link #getDefaultMapping() default mappings}.
* <p>
* The flag is read during {@link #init()}, so it must be set before the extracter is
* {@link #register() registered}.
*
* @param inheritDefaultMapping <tt>true</tt> to add the configured mapping
* to the list of default mappings.
*
* @see #getDefaultMapping()
* @see #setMapping(Map)
* @see #setMappingProperties(Properties)
*/
public void setInheritDefaultMapping(boolean inheritDefaultMapping)
{
this.inheritDefaultMapping = inheritDefaultMapping;
}
/**
* Set the mapping from document metadata to system metadata. It is possible to direct
* an extracted document property to several system properties. The conversion between
* the document property types and the system property types will be done by the
* {@link org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter default converter}.
* <p>
* If no mapping is set (or <tt>null</tt> is given), {@link #init()} falls back to the
* {@link #getDefaultMapping() default mapping}.
*
* @param mapping a mapping from document metadata to system metadata
*/
public void setMapping(Map<String, Set<QName>> mapping)
{
this.mapping = mapping;
}
/**
* Set the properties that contain the mapping from document metadata to system metadata.
* This is an alternative to the {@link #setMapping(Map)} method. Any mappings already
* present will be cleared out.
* <p>
* The property mapping is of the form:
* <pre>
* # Namespaces prefixes
* namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
* namespace.prefix.my=http://www....com/alfresco/1.0
*
* # Mapping
* editor=cm:author, my:editor
* title=cm:title
* user1=cm:summary
* user2=cm:description
* </pre>
* The mapping can therefore be from a single document property onto several system properties.
*
* @param mappingProperties the properties that map document properties to system properties
*
* @see #readMappingProperties(Properties)
*/
public void setMappingProperties(Properties mappingProperties)
{
// Convert eagerly so that malformed mappings fail here rather than during extraction
mapping = readMappingProperties(mappingProperties);
}
/**
 * A utility method to read mapping properties from a resource file and convert to the map form.
 * The resource is resolved against the class loader of the concrete extracter class.
 *
 * @param propertiesUrl     A standard Properties file URL location
 * @return                  Returns the mapping read from the resource
 * @throws AlfrescoRuntimeException if the resource cannot be found, cannot be loaded
 *                          or contains invalid mappings
 *
 * @see #setMappingProperties(Properties)
 */
protected Map<String, Set<QName>> readMappingProperties(String propertiesUrl)
{
    InputStream is = null;
    try
    {
        is = getClass().getClassLoader().getResourceAsStream(propertiesUrl);
        if (is != null)
        {
            Properties props = new Properties();
            props.load(is);
            // Process it
            Map<String, Set<QName>> map = readMappingProperties(props);
            // Done
            if (logger.isDebugEnabled())
            {
                logger.debug("Loaded mapping properties from resource: " + propertiesUrl);
            }
            return map;
        }
    }
    catch (Throwable e)
    {
        throw new AlfrescoRuntimeException(
                "Unable to load properties file to read extracter mapping properties: \n" +
                " Extracter: " + this + "\n" +
                " Bundle: " + propertiesUrl,
                e);
    }
    finally
    {
        if (is != null)
        {
            // Best effort: a failure to close must not mask the real outcome
            try { is.close(); } catch (Throwable e) {}
        }
    }
    // Only reached when the resource does not exist. This is reported outside the
    // try block so that it is not re-wrapped as a misleading "Unable to load" failure.
    throw new AlfrescoRuntimeException(
            "Metadata Extracter mapping properties not found: \n" +
            " Extracter: " + this + "\n" +
            " Bundle: " + propertiesUrl);
}
/**
 * A utility method to convert mapping properties to the Map form.
 * <p>
 * Entries whose key starts with {@link #NAMESPACE_PROPERTY_PREFIX} declare namespace
 * prefixes. Every other entry maps a document property name to a comma-separated list
 * of system property names, each given either in prefixed form (<tt>prefix:local</tt>,
 * using a declared prefix) or in fully-qualified form (<tt>{uri}local</tt>).
 *
 * @param mappingProperties     the properties to convert
 * @return                      Returns the document property to system properties mapping
 * @throws AlfrescoRuntimeException if a prefix is not declared or a name is not a valid QName
 *
 * @see #setMappingProperties(Properties)
 */
protected Map<String, Set<QName>> readMappingProperties(Properties mappingProperties)
{
    Map<String, String> namespacesByPrefix = new HashMap<String, String>(5);
    // Get the namespaces
    for (Map.Entry<Object, Object> entry : mappingProperties.entrySet())
    {
        String propertyName = (String) entry.getKey();
        if (propertyName.startsWith(NAMESPACE_PROPERTY_PREFIX))
        {
            String prefix = propertyName.substring(NAMESPACE_PROPERTY_PREFIX.length());
            String namespace = (String) entry.getValue();
            namespacesByPrefix.put(prefix, namespace);
        }
    }
    // Create the mapping
    Map<String, Set<QName>> convertedMapping = new HashMap<String, Set<QName>>(17);
    for (Map.Entry<Object, Object> entry : mappingProperties.entrySet())
    {
        String documentProperty = (String) entry.getKey();
        String qnamesStr = (String) entry.getValue();
        if (documentProperty.startsWith(NAMESPACE_PROPERTY_PREFIX))
        {
            // Ignore these now
            continue;
        }
        // Create the entry
        Set<QName> qnames = new HashSet<QName>(3);
        convertedMapping.put(documentProperty, qnames);
        // The to value can be a list of QNames
        StringTokenizer tokenizer = new StringTokenizer(qnamesStr, ",");
        while (tokenizer.hasMoreTokens())
        {
            String qnameStr = tokenizer.nextToken().trim();
            // A fully-qualified name ({uri}local) almost always has the prefix separator
            // inside its URI (e.g. "http:"), so it must not be examined for a prefix.
            boolean fullyQualified = qnameStr.indexOf(QName.NAMESPACE_BEGIN) == 0;
            int index = fullyQualified ? -1 : qnameStr.indexOf(QName.NAMESPACE_PREFIX);
            if (index > -1)
            {
                String prefix = qnameStr.substring(0, index);
                String suffix = qnameStr.substring(index + 1);
                // It is prefixed
                String uri = namespacesByPrefix.get(prefix);
                if (uri == null)
                {
                    throw new AlfrescoRuntimeException(
                            "No prefix mapping for extracter property mapping: \n" +
                            " Extracter: " + this + "\n" +
                            " Mapping: " + entry);
                }
                qnameStr = QName.NAMESPACE_BEGIN + uri + QName.NAMESPACE_END + suffix;
            }
            try
            {
                QName qname = QName.createQName(qnameStr);
                // Add it to the mapping
                qnames.add(qname);
            }
            catch (InvalidQNameException e)
            {
                // Keep the cause so that the reason for the rejection is not lost
                throw new AlfrescoRuntimeException(
                        "Can't create metadata extracter property mapping: \n" +
                        " Extracter: " + this + "\n" +
                        " Mapping: " + entry,
                        e);
            }
        }
        if (logger.isDebugEnabled())
        {
            logger.debug("Added mapping from " + documentProperty + " to " + qnames);
        }
    }
    // Done
    return convertedMapping;
}
/**
 * Initializes this extracter and then, if a registry has been provided, registers it.
 * Without a registry the extracter is still initialized, but a warning is logged and
 * it can only be used directly.
 *
 * @see #setRegistry(MetadataExtracterRegistry)
 * @see #init()
 */
public final void register()
{
    init();
    // Nothing to register with
    if (registry == null)
    {
        logger.warn("No registry provided. Not registering: " + this);
        return;
    }
    // Register the extracter
    registry.register(this);
}
/**
 * Provides a hook point for implementations to perform initialization. The base
 * implementation must be invoked or the extracter will fail during extraction.
 * The {@link #getDefaultMapping() default mappings} will be requested during
 * initialization.
 * <p>
 * The effective mapping is resolved as follows:
 * <ul>
 *   <li>no mapping configured: the default mapping is used;</li>
 *   <li>mapping configured and defaults inherited: the defaults are merged with the
 *       configured mapping. The merge is done on a private copy so that the map (and
 *       sets) supplied to {@link #setMapping(Map)} are never modified and may be
 *       unmodifiable or shared;</li>
 *   <li>otherwise: the configured mapping is used as is.</li>
 * </ul>
 *
 * @throws AlfrescoRuntimeException if the implementation provides no default mapping
 */
protected void init()
{
    Map<String, Set<QName>> defaultMapping = getDefaultMapping();
    if (defaultMapping == null)
    {
        throw new AlfrescoRuntimeException("The metadata extracter must provide a default mapping: " + this);
    }
    // Was a mapping explicitly provided
    if (mapping == null)
    {
        // No mapping, so use the default
        mapping = defaultMapping;
    }
    else if (inheritDefaultMapping)
    {
        // Copy the configured mapping so that the caller's collections stay untouched
        Map<String, Set<QName>> merged =
                new HashMap<String, Set<QName>>((mapping.size() + defaultMapping.size()) * 2 + 1);
        for (Map.Entry<String, Set<QName>> entry : mapping.entrySet())
        {
            Set<QName> configuredQNames = entry.getValue();
            Set<QName> copy = new HashSet<QName>(3);
            if (configuredQNames != null)
            {
                copy.addAll(configuredQNames);
            }
            merged.put(entry.getKey(), copy);
        }
        // Merge the default mapping into the copy
        for (Map.Entry<String, Set<QName>> entry : defaultMapping.entrySet())
        {
            Set<QName> defaultQNames = entry.getValue();
            if (defaultQNames == null)
            {
                continue;
            }
            Set<QName> systemQNames = merged.get(entry.getKey());
            if (systemQNames == null)
            {
                systemQNames = new HashSet<QName>(3);
                merged.put(entry.getKey(), systemQNames);
            }
            systemQNames.addAll(defaultQNames);
        }
        mapping = merged;
    }
    // Were there any mappings
    if (mapping.size() == 0)
    {
        logger.warn(
                "There are no property mappings for the metadata extracter.\n" +
                " Nothing will be extracted by: " + this);
    }
    // Done
    initialized = true;
}
/**
* {@inheritDoc}
* <p>
* NOTE(review): no code in this class assigns the private <tt>extractionTime</tt> field,
* so this currently returns <tt>0</tt> - confirm whether extraction should be timed.
*/
public long getExtractionTime()
{
return extractionTime;
}
/**
 * Guards extraction against content of an unsupported mimetype.
 *
 * @param reader    the reader whose mimetype must be supported
 * @throws AlfrescoRuntimeException if the reader's mimetype is not supported
 *
 * @see #isSupported(String)
 */
protected void checkIsSupported(ContentReader reader)
{
    String readerMimetype = reader.getMimetype();
    if (isSupported(readerMimetype))
    {
        // All good
        return;
    }
    throw new AlfrescoRuntimeException(
            "Metadata extracter does not support mimetype: \n" +
            " reader: " + reader + "\n" +
            " supported: " + supportedMimetypes + "\n" +
            " extracter: " + this);
}
/**
* {@inheritDoc}
* <p>
* Uses the {@link #setOverwritePolicy(OverwritePolicy) overwrite policy} and the property
* mapping held by this instance.
*/
public final boolean extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
{
return extract(reader, this.overwritePolicy, destination, this.mapping);
}
/**
 * {@inheritDoc}
 * <p>
 * The raw metadata is extracted, translated to system properties using the supplied
 * <tt>mapping</tt> and then applied to the destination using the supplied policy.
 *
 * @param reader            the source of the content; it must be closed by the raw extraction
 * @param overwritePolicy   the policy that decides which extracted values are written
 * @param destination       the system properties to modify
 * @param mapping           the document-to-system property mapping for this call; if
 *                          <tt>null</tt>, the mapping configured on this instance is used
 * @return                  Returns <tt>true</tt> if the policy reported a modification
 * @throws AlfrescoRuntimeException if the extracter is not initialized or the mimetype is unsupported
 * @throws ContentIOException if extraction, mapping or application of the values fails
 */
public final boolean extract(
        ContentReader reader,
        OverwritePolicy overwritePolicy,
        Map<QName, Serializable> destination,
        Map<String, Set<QName>> mapping) throws ContentIOException
{
    // Start
    if (logger.isDebugEnabled())
    {
        logger.debug("Starting metadata extraction: \n" +
                " reader: " + reader + "\n" +
                " extracter: " + this);
    }
    if (!initialized)
    {
        throw new AlfrescoRuntimeException(
                "Metadata extracter not initialized.\n" +
                " Call the 'register' method on: " + this + "\n" +
                " Implementations of the 'init' method must call the base implementation.");
    }
    // check the reliability
    checkIsSupported(reader);
    // Honour the mapping given for this call rather than silently using the instance's
    Map<String, Set<QName>> effectiveMapping = (mapping != null) ? mapping : this.mapping;
    boolean changed = false;
    try
    {
        Map<String, Serializable> rawMetadata = extractRaw(reader);
        // Convert to system properties (standalone)
        Map<QName, Serializable> systemProperties = mapRawToSystem(rawMetadata, effectiveMapping);
        // Now use the proper overwrite policy
        changed = overwritePolicy.applyProperties(systemProperties, destination);
    }
    catch (Throwable e)
    {
        throw new ContentIOException("Metadata extraction failed: \n" +
                " reader: " + reader,
                e);
    }
    finally
    {
        // check that the reader was closed
        if (!reader.isClosed())
        {
            logger.error("Content reader not closed by metadata extracter: \n" +
                    " reader: " + reader + "\n" +
                    " extracter: " + this);
        }
    }
    // Done
    if (logger.isDebugEnabled())
    {
        logger.debug("Completed metadata extraction: \n" +
                " reader: " + reader + "\n" +
                " extracter: " + this + "\n" +
                " changed: " + changed);
    }
    return changed;
}
/**
 * Translates raw document metadata into system properties. Raw values without a
 * mapping are dropped; a raw value mapped to several system properties is copied
 * to each of them.
 *
 * @param rawMetadata   Metadata keyed by document properties
 * @param mapping       the document-to-system property mapping to apply
 * @return              Returns the metadata keyed by the system properties
 */
private Map<QName, Serializable> mapRawToSystem(
        Map<String, Serializable> rawMetadata,
        Map<String, Set<QName>> mapping)
{
    Map<QName, Serializable> systemProperties = new HashMap<QName, Serializable>(rawMetadata.size() * 2 + 1);
    for (Map.Entry<String, Serializable> entry : rawMetadata.entrySet())
    {
        // Check if there is a mapping for this
        Set<QName> systemQNames = mapping.get(entry.getKey());
        if (systemQNames == null)
        {
            // No mapping - ignore
            continue;
        }
        Serializable documentValue = entry.getValue();
        for (QName systemQName : systemQNames)
        {
            systemProperties.put(systemQName, documentValue);
        }
    }
    // Done
    if (logger.isDebugEnabled())
    {
        logger.debug(
                "Converted extracted raw values to system values: \n" +
                " Raw Properties: " + rawMetadata + "\n" +
                " System Properties: " + systemProperties);
    }
    return systemProperties;
}
/**
 * Adds a raw value to the map if it carries any information. Nulls are skipped,
 * strings are trimmed and skipped if nothing remains, serializable values are stored
 * as they are and anything else is stored as its <code>toString</code> representation.
 *
 * @param key           the destination map key
 * @param value         the value to check and put.
 * @param destination   map to put values into
 * @return              Returns <tt>true</tt> if set, <tt>false</tt> otherwise
 */
protected boolean putSafeRawValue(String key, Object value, Map<String, Serializable> destination)
{
    if (value == null)
    {
        return false;
    }
    final Serializable safeValue;
    if (value instanceof String)
    {
        String trimmed = ((String) value).trim();
        if (trimmed.length() == 0)
        {
            // Nothing but whitespace
            return false;
        }
        safeValue = trimmed;
    }
    else if (value instanceof Serializable)
    {
        safeValue = (Serializable) value;
    }
    else
    {
        safeValue = value.toString();
    }
    destination.put(key, safeValue);
    return true;
}
/**
* This method provides a <i>best guess</i> of where to store the values extracted
* from the documents. The list of properties mapped by default need <b>not</b>
* include all properties extracted from the document; just the obvious set of mappings
* need be supplied.
* Implementations must either provide the default mapping properties in the expected
* location or override the method to provide the default mapping.
* <p>
* The default implementation looks for the default mapping file in the location
* given by the class name and <i>.properties</i>. If the extracter's class is
* <b>x.y.z.MyExtracter</b> then the default properties will be picked up at
* <b>classpath:/x/y/z/MyExtracter.properties</b>.
* Inner classes are supported, but the '$' in the class name is replaced with '-', so
* default properties for <b>x.y.z.MyStuff$MyExtracter</b> will be located using
* <b>classpath:/x/y/z/MyStuff-MyExtracter.properties</b>.
* <p>
* The default mapping implementation should include thorough Javadocs so that the
* system administrators can accurately determine how to best enhance or override the
* default mapping.
* <p>
* If the default mapping is declared in a properties file, then the
* {@link #readMappingProperties(String)} method can be used to quickly generate the
* return value:
* <pre>
* protected Map<String, Set<QName>> getDefaultMapping()
* {
* return readMappingProperties(DEFAULT_MAPPING);
* }
* </pre>
* The map can also be created in code either statically or during the call.
*
* @return Returns the default, static mapping. It may not be null.
*
* @see #setInheritDefaultMapping(boolean inherit)
*/
protected Map<String, Set<QName>> getDefaultMapping()
{
    // Derive the resource from the concrete class name:
    //    '$' (inner class separator) becomes '-'
    //    '.' (package separator) becomes '/'
    String resourceBase = this.getClass().getName().replace('$', '-').replace('.', '/');
    // The mapping lives alongside the class in a .properties file
    return readMappingProperties(resourceBase + ".properties");
}
/**
* Override to provide the raw extracted metadata values. An extracter should extract
* as many of the available properties as is realistically possible. Even if the
* {@link #getDefaultMapping() default mapping} doesn't handle all properties, it is
* possible for each instance of the extracter to be configured differently and more or
* less of the properties may be used in different installations.
* <p>
* Raw values must not be trimmed or removed for any reason. Null values, empty
* strings and non-serializable values are treated as follows:
* <ul>
* <li><b>Null:</b> Removed</li>
* <li><b>Empty String:</b> Passed to the {@link OverwritePolicy}</li>
* <li><b>Non Serializable:</b> Converted to String or fails if that is not possible</li>
* </ul>
* <p>
* Properties extracted and their meanings and types should be thoroughly described in
* the class-level javadocs of the extracter implementation, for example:
* <pre>
* <b>editor:</b> - the document editor --> cm:author
* <b>title:</b> - the document title --> cm:title
* <b>user1:</b> - the document summary
* <b>user2:</b> - the document description --> cm:description
* <b>user3:</b> -
* <b>user4:</b> -
* </pre>
*
* @param reader the document to extract the values from. This stream provided by
* the reader must be closed if accessed directly.
* @return Returns a map of document property values keyed by property name.
* @throws Throwable All exception conditions can be handled.
*
* @see #getDefaultMapping()
*/
protected abstract Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable;
}

View File

@@ -0,0 +1,250 @@
/*
* Copyright (C) 2005-2007 Alfresco Software Limited.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* As a special exception to the terms and conditions of version 2.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* and Open Source Software ("FLOSS") applications as described in Alfresco's
* FLOSS exception. You should have recieved a copy of the text describing
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing"
*/
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import junit.framework.TestCase;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.filestore.FileContentReader;
import org.alfresco.repo.content.metadata.MetadataExtracter.OverwritePolicy;
import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
/**
* @see org.alfresco.repo.content.metadata.AbstractMappingMetadataExtracter
*
* @author Derek Hulley
*/
public class AbstractMappingMetadataExtracterTest extends TestCase
{
private DummyMappingMetadataExtracter extracter;
private ContentReader reader;
private Map<QName, Serializable> destination;
/**
* Creates a registered (and therefore initialized) dummy extracter, a plain-text reader
* and a destination map pre-populated with one junk value, one empty string and one
* <tt>null</tt>, as used by the overwrite policy tests.
*/
@Override
protected void setUp() throws Exception
{
extracter = new DummyMappingMetadataExtracter();
extracter.register();
reader = new FileContentReader(AbstractContentTransformerTest.loadQuickTestFile("txt"));
reader.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
destination = new HashMap<QName, Serializable>(7);
// An existing, non-empty value
destination.put(DummyMappingMetadataExtracter.QNAME_A1, JunkValue.INSTANCE);
// An existing, empty value
destination.put(DummyMappingMetadataExtracter.QNAME_A2, "");
// An existing, null value
destination.put(DummyMappingMetadataExtracter.QNAME_B, null);
}
/** Checks the fixture, including that registration invoked the extracter's <tt>init</tt>. */
public void testSetUp()
{
assertNotNull(reader);
assertNotNull(extracter);
assertTrue("Extracter not initialized.", extracter.initCheck);
}
/** With the default mapping, 'a' maps to my:a1 and my:a2 and 'b' maps to my:b. */
public void testDefaultExtract() throws Exception
{
destination.clear();
extracter.extract(reader, destination);
assertEquals(3, destination.size());
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_A1));
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_A2));
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_B));
}
/** A configured mapping replaces the default mapping when defaults are not inherited. */
public void testPropertyMappingOverride() throws Exception
{
Properties props = new Properties();
props.put("namespace.prefix.my", DummyMappingMetadataExtracter.NAMESPACE_MY);
props.put(DummyMappingMetadataExtracter.PROP_A, " my:a1, my:a2 ");
extracter.setMappingProperties(props);
// Re-initialize with the new mapping
extracter.register();
// Only mapped 'a'
destination.clear();
extracter.extract(reader, destination);
assertEquals(2, destination.size());
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_A1));
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_A2));
}
/** A configured mapping is added to the default mapping when defaults are inherited. */
public void testPropertyMappingMerge() throws Exception
{
Properties props = new Properties();
props.put("namespace.prefix.my", DummyMappingMetadataExtracter.NAMESPACE_MY);
props.put(DummyMappingMetadataExtracter.PROP_A, " my:a3 ");
extracter.setMappingProperties(props);
extracter.setInheritDefaultMapping(true);
// Re-initialize so that the merge takes place
extracter.register();
// Added a3
destination.clear();
extracter.extract(reader, destination);
assertEquals(4, destination.size());
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_A1));
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_A2));
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_A3));
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_B));
}
/**
* A configured mapping may reference properties that are not in the default mapping ('c', 'd')
* as well as properties that are never extracted ('e'); the latter produce no value.
*/
public void testPropertyMappingOverrideExtra() throws Exception
{
Properties props = new Properties();
props.put("namespace.prefix.my", DummyMappingMetadataExtracter.NAMESPACE_MY);
props.put(DummyMappingMetadataExtracter.PROP_C, " my:c ");
props.put(DummyMappingMetadataExtracter.PROP_D, " my:d ");
props.put(DummyMappingMetadataExtracter.PROP_E, " my:e ");
extracter.setMappingProperties(props);
extracter.register();
// Mapped 'c', 'd' and 'e', but 'e' is not extracted
destination.clear();
extracter.extract(reader, destination);
assertEquals(2, destination.size());
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_C));
assertTrue(destination.containsKey(DummyMappingMetadataExtracter.QNAME_D));
}
/** EAGER: every existing value (junk, empty and null) is replaced by the extracted value. */
public void testOverwritePolicyEager()
{
extracter.setOverwritePolicy(OverwritePolicy.EAGER);
extracter.extract(reader, destination);
assertEquals(3, destination.size());
assertEquals(DummyMappingMetadataExtracter.VALUE_A, destination.get(DummyMappingMetadataExtracter.QNAME_A1));
assertEquals(DummyMappingMetadataExtracter.VALUE_A, destination.get(DummyMappingMetadataExtracter.QNAME_A2));
assertEquals(DummyMappingMetadataExtracter.VALUE_B, destination.get(DummyMappingMetadataExtracter.QNAME_B));
}
/** PRAGMATIC: the existing junk value is kept; the empty and null values are replaced. */
public void testOverwritePolicyPragmatic()
{
extracter.setOverwritePolicy(OverwritePolicy.PRAGMATIC);
extracter.extract(reader, destination);
assertEquals(3, destination.size());
assertEquals(JunkValue.INSTANCE, destination.get(DummyMappingMetadataExtracter.QNAME_A1));
assertEquals(DummyMappingMetadataExtracter.VALUE_A, destination.get(DummyMappingMetadataExtracter.QNAME_A2));
assertEquals(DummyMappingMetadataExtracter.VALUE_B, destination.get(DummyMappingMetadataExtracter.QNAME_B));
}
/** CAUTIOUS: every property already present (junk, empty and null) is left untouched. */
public void testOverwritePolicyCautious()
{
extracter.setOverwritePolicy(OverwritePolicy.CAUTIOUS);
extracter.extract(reader, destination);
assertEquals(3, destination.size());
assertEquals(JunkValue.INSTANCE, destination.get(DummyMappingMetadataExtracter.QNAME_A1));
assertEquals("", destination.get(DummyMappingMetadataExtracter.QNAME_A2));
assertEquals(null, destination.get(DummyMappingMetadataExtracter.QNAME_B));
}
/**
* A spoofed-up extracter that extracts the following:
* <pre>
* <b>a:</b> - A --> my:a1, my:a2
* <b>b:</b> - B --> my:b
* <b>c:</b> - C
* <b>d:</b> - D
* </pre>
* The property <b>e</b> is declared so that it can be mapped, but it is never extracted.
*
* @author Derek Hulley
*/
public static class DummyMappingMetadataExtracter extends AbstractMappingMetadataExtracter
{
// Document (raw) property names
public static final String PROP_A = "a";
public static final String PROP_B = "b";
public static final String PROP_C = "c";
public static final String PROP_D = "d";
public static final String PROP_E = "e";
// Raw values returned for the extracted properties
public static final String VALUE_A = "AAA";
public static final String VALUE_B = "BBB";
public static final String VALUE_C = "CCC";
public static final String VALUE_D = "DDD";
// System property names that the raw properties can be mapped to
public static final String NAMESPACE_MY = "http://DummyMappingMetadataExtracter";
public static final QName QNAME_A1 = QName.createQName(NAMESPACE_MY, "a1");
public static final QName QNAME_A2 = QName.createQName(NAMESPACE_MY, "a2");
public static final QName QNAME_A3 = QName.createQName(NAMESPACE_MY, "a3");
public static final QName QNAME_B = QName.createQName(NAMESPACE_MY, "b");
public static final QName QNAME_C = QName.createQName(NAMESPACE_MY, "c");
public static final QName QNAME_D = QName.createQName(NAMESPACE_MY, "d");
public static final QName QNAME_E = QName.createQName(NAMESPACE_MY, "e"); // not extracted
// Mimetypes supported by default
private static final Set<String> MIMETYPES;
static
{
MIMETYPES = new HashSet<String>(5);
MIMETYPES.add(MimetypeMap.MIMETYPE_TEXT_PLAIN);
MIMETYPES.add(MimetypeMap.MIMETYPE_XML);
}
// Built by init(); null until then
Map<String, Set<QName>> defaultMapping;
// Records that init() was called
private boolean initCheck;
public DummyMappingMetadataExtracter()
{
super(MIMETYPES);
initCheck = false;
}
/**
* Builds the default mapping (a to my:a1 and my:a2, b to my:b) before delegating to the
* base implementation, which requests it via {@link #getDefaultMapping()}.
*/
@Override
protected void init()
{
defaultMapping = new HashMap<String, Set<QName>>(7);
defaultMapping.put(PROP_A, new HashSet<QName>(Arrays.asList(QNAME_A1, QNAME_A2)));
defaultMapping.put(PROP_B, new HashSet<QName>(Arrays.asList(QNAME_B)));
initCheck = true;
super.init();
}
/** @return Returns the mapping built by {@link #init()} instead of reading a properties file */
@Override
protected Map<String, Set<QName>> getDefaultMapping()
{
return defaultMapping;
}
/** @return Returns fixed values for the properties a, b, c and d */
@Override
protected Map<String, Serializable> extractRaw(ContentReader reader)
{
// The content is read but its value is not used
reader.getContentString();
Map<String, Serializable> ret = new HashMap<String, Serializable>(7);
ret.put(PROP_A, VALUE_A);
ret.put(PROP_B, VALUE_B);
ret.put(PROP_C, VALUE_C);
ret.put(PROP_D, VALUE_D);
return ret;
}
}
/**
* A serializable placeholder used as a pre-existing, non-empty property value. It does not
* override <tt>equals</tt>, so the assertions against {@link #INSTANCE} compare by identity.
*/
private static class JunkValue implements Serializable
{
private static final JunkValue INSTANCE = new JunkValue();
private static final long serialVersionUID = 1L;
}
}

View File

@@ -38,6 +38,9 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* Support class for metadata extracters.
*
* @deprecated Use the {@link org.alfresco.repo.content.metadata.AbstractMappingMetadataExtracter}
*
* @author Jesper Steen Møller
*/
@@ -123,6 +126,18 @@ abstract public class AbstractMetadataExtracter implements MetadataExtracter
return 0.0;
}
/**
* @inheritDoc
*
* @return Returns <tt>true</tt> if the {@link #getReliability(String) reliability}
* is greater than 0
*/
public boolean isSupported(String mimetype)
{
double reliability = getReliability(mimetype);
return reliability > 0.0;
}
public long getExtractionTime()
{
return extractionTime;
@@ -147,7 +162,10 @@ abstract public class AbstractMetadataExtracter implements MetadataExtracter
}
}
public final void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
/**
* @inheritDoc
*/
public boolean extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
{
// check the reliability
checkReliability(reader);
@@ -180,14 +198,24 @@ abstract public class AbstractMetadataExtracter implements MetadataExtracter
" reader: " + reader + "\n" +
" extracter: " + this);
}
return true;
}
public final void extract(
/**
* @inheritDoc
*
* @param overwritePolicy ignored
* @param propertyMapping ignored
*
* @see #extract(ContentReader, Map)
*/
public final boolean extract(
ContentReader reader,
OverwritePolicy overwritePolicy,
Map<QName, Serializable> destination,
Map<String, QName> propertyMapping) throws ContentIOException
Map<String, Set<QName>> propertyMapping) throws ContentIOException
{
throw new UnsupportedOperationException();
return extract(reader, destination);
}
/**
@@ -197,12 +225,13 @@ abstract public class AbstractMetadataExtracter implements MetadataExtracter
* @param reader the source of the content
* @param destination the property map to fill
* @throws Throwable an exception
*
* @deprecated Consider deriving from the more configurable {@link AbstractMappingMetadataExtracter}
*/
protected abstract void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable;
/**
* Examines a value or string for nulls and adds it to the map (if
* non-empty)
* Examines a value or string for nulls and adds it to the map (if non-empty)
*
* @param prop Alfresco's <code>ContentModel.PROP_</code> to set.
* @param value Value to set it to

View File

@@ -26,6 +26,7 @@ package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.Map;
import java.util.Set;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
@@ -40,16 +41,171 @@ import org.alfresco.service.namespace.QName;
public interface MetadataExtracter
{
/**
* Provides the approximate accuracy with which this extracter can extract
* metadata for the mimetype.
* <p>
* An enumeration of functional property overwrite policies. These determine whether extracted properties are
* written into the property map or not.
*
* @param sourceMimetype the source mimetype
* @return Returns a score 0.0 to 1.0. 0.0 indicates that the extraction
* cannot be performed at all. 1.0 indicates that the extraction can
* be performed perfectly.
* @author Derek Hulley
*/
public double getReliability(String sourceMimetype);
public enum OverwritePolicy
{
    /**
     * This policy puts the new value if:
     * <ul>
     *   <li>the extracted property is not null</li>
     * </ul>
     * Any existing target value for the same key is replaced.
     */
    EAGER
    {
        @Override
        public boolean applyProperties(Map<QName, Serializable> extractedProperties, Map<QName, Serializable> targetProperties)
        {
            boolean modified = false;
            for (Map.Entry<QName, Serializable> entry : extractedProperties.entrySet())
            {
                Serializable extractedValue = entry.getValue();
                // Ignore null extracted value
                if (extractedValue == null)
                {
                    continue;
                }
                targetProperties.put(entry.getKey(), extractedValue);
                modified = true;
            }
            return modified;
        }
    },
    /**
     * This policy puts the new value if the extracted property is not null <b>and</b> one of the
     * following holds:
     * <ul>
     *   <li>there is no target key for the property</li>
     *   <li>the target value is null</li>
     *   <li>the target value is a <code>String</code> of zero length</li>
     * </ul>
     * Any other existing target value (a non-empty string or a non-string object) is kept.
     */
    PRAGMATIC
    {
        @Override
        public boolean applyProperties(Map<QName, Serializable> extractedProperties, Map<QName, Serializable> targetProperties)
        {
            boolean modified = false;
            for (Map.Entry<QName, Serializable> entry : extractedProperties.entrySet())
            {
                Serializable extractedValue = entry.getValue();
                // Ignore null extracted value
                if (extractedValue == null)
                {
                    continue;
                }
                QName propertyQName = entry.getKey();
                // A missing key and a key mapped to null are treated identically, so a single
                // lookup covers both shortcut cases.
                Serializable originalValue = targetProperties.get(propertyQName);
                boolean originalIsTrivial =
                        originalValue == null ||
                        (originalValue instanceof String && ((String) originalValue).length() == 0);
                if (!originalIsTrivial)
                {
                    // The original value is a non-empty string or some other object, so keep it
                    continue;
                }
                targetProperties.put(propertyQName, extractedValue);
                modified = true;
            }
            return modified;
        }
    },
    /**
     * This policy only puts the extracted value if there is no value (null or otherwise) in the properties map.
     * It is assumed that the mere presence of a property key is enough to indicate that the target property
     * is as intended.
     * This policy puts the new value if:
     * <ul>
     *   <li>the extracted property is not null</li>
     *   <li>there is no target key for the property</li>
     * </ul>
     */
    CAUTIOUS
    {
        @Override
        public boolean applyProperties(Map<QName, Serializable> extractedProperties, Map<QName, Serializable> targetProperties)
        {
            boolean modified = false;
            for (Map.Entry<QName, Serializable> entry : extractedProperties.entrySet())
            {
                Serializable extractedValue = entry.getValue();
                // Ignore null extracted value
                if (extractedValue == null)
                {
                    continue;
                }
                QName propertyQName = entry.getKey();
                // Cautiously bypass the value if the key is present at all, even if mapped to null
                if (targetProperties.containsKey(propertyQName))
                {
                    continue;
                }
                targetProperties.put(propertyQName, extractedValue);
                modified = true;
            }
            return modified;
        }
    };

    /**
     * Apply the overwrite policy for the extracted properties.
     * <p>
     * Declared abstract so that the compiler forces every policy constant to supply its own
     * implementation, rather than failing at runtime.
     *
     * @param extractedProperties the properties extracted from the document; entries with null values are ignored
     * @param targetProperties the property map to modify according to the policy
     * @return Returns true if <i>any</i> properties were set on the target properties
     */
    public abstract boolean applyProperties(Map<QName, Serializable> extractedProperties, Map<QName, Serializable> targetProperties);
}
/**
* Get an estimate of the extracter's reliability on a scale from 0.0 to 1.0.
* <p>
* 0.0 indicates that the extraction cannot be performed at all. 1.0 indicates that the
* extraction can be performed perfectly.
*
* @param mimetype the mimetype to check
* @return Returns a reliability indicator from 0.0 to 1.0
*
* @deprecated This method is replaced by {@link #isSupported(String)}
*/
public double getReliability(String mimetype);
/**
* Determines if the extracter works against the given mimetype.
* <p>
* This method replaces the deprecated {@link #getReliability(String)}.
*
* @param mimetype the document mimetype
* @return Returns <tt>true</tt> if the mimetype is supported, otherwise <tt>false</tt>.
*/
public boolean isSupported(String mimetype);
/**
* Provides an estimate, usually a worst case guess, of how long an
@@ -63,41 +219,51 @@ public interface MetadataExtracter
public long getExtractionTime();
/**
* Extracts the metadata from the content provided by the reader and source
* mimetype to the supplied map.
* Extracts the metadata values from the content provided by the reader and source
* mimetype to the supplied map. The internal mapping and {@link OverwritePolicy overwrite policy}
* between document metadata and system metadata will be used.
* <p>
* The extraction viability can be determined by an up front call to
* {@link #getReliability(String)}.
* The extraction viability can be determined by an up front call to {@link #isSupported(String)}.
* <p>
* The source mimetype <b>must</b> be available on the
* {@link org.alfresco.service.cmr.repository.ContentAccessor#getMimetype()} method
* of the reader.
* <p>
* <b>Note:</b> Internally, the extracter may need to perform a mapping of document-specific
* properties to <code>QName</code>. This is an implementation detail that is
* supported in the default abstract implementations.
*
* @param reader the source of the content
* @param destination the map of properties to populate (essentially a return value)
* @return Returns <tt>true</tt> if the destination map was modified
* @throws ContentIOException if a detectable error occurs
*
* @see #extract(ContentReader, Map, Map)
* @see #extract(ContentReader, OverwritePolicy, Map, Map)
*/
public void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException;
public boolean extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException;
/**
*
* Extracts the metadata from the content provided by the reader and source
* mimetype to the supplied map. The mapping from document metadata to system metadata
* is explicitly provided. The {@link OverwritePolicy overwrite policy} is also explicitly
* set.
* <p>
* The extraction viability can be determined by an up front call to
* {@link #isSupported(String)}.
* <p>
* The source mimetype <b>must</b> be available on the
* {@link org.alfresco.service.cmr.repository.ContentAccessor#getMimetype()} method
* of the reader.
*
* @param reader the source of the content
* @param overwritePolicy the policy stipulating how the system properties must be
* overwritten if present
* @param destination the map of properties to populate (essentially a return value)
* @param propertyMapping a mapping of internal (document-specific properties) to system
* properties.
* @param mapping a mapping of document-specific properties to system properties.
* @return Returns <tt>true</tt> if the destination map was modified
* @throws ContentIOException if a detectable error occurs
*
* @see #extract(ContentReader, Map)
*/
public void extract(
public boolean extract(
ContentReader reader,
OverwritePolicy overwritePolicy,
Map<QName, Serializable> destination,
Map<String, QName> propertyMapping) throws ContentIOException;
Map<String, Set<QName>> mapping) throws ContentIOException;
}

View File

@@ -28,14 +28,13 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.SummaryInformation;
@@ -47,9 +46,16 @@ import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
* Office file format Metadata Extracter
*
* @author Jesper Steen Møller
* @author Derek Hulley
*/
public class OfficeMetadataExtracter extends AbstractMetadataExtracter
public class OfficeMetadataExtracter extends AbstractMappingMetadataExtracter
{
public static final String PROP_AUTHOR = "author";
public static final String PROP_TITLE = "title";
public static final String PROP_SUBJECT = "subject";
public static final String PROP_CREATE_DATETIME = "createDateTime";
public static final String PROP_LAST_SAVE_DATETIME = "lastSaveDateTime";
public static String[] SUPPORTED_MIMETYPES = new String[] {
MimetypeMap.MIMETYPE_WORD,
MimetypeMap.MIMETYPE_EXCEL,
@@ -57,11 +63,14 @@ public class OfficeMetadataExtracter extends AbstractMetadataExtracter
public OfficeMetadataExtracter()
{
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)), 1.0, 1000);
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
}
public void extractInternal(ContentReader reader, final Map<QName, Serializable> destination) throws Throwable
@Override
protected Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
{
final Map<String, Serializable> rawProperties = new HashMap<String, Serializable>(17);
POIFSReaderListener readerListener = new POIFSReaderListener()
{
public void processPOIFSReaderEvent(final POIFSReaderEvent event)
@@ -73,14 +82,11 @@ public class OfficeMetadataExtracter extends AbstractMetadataExtracter
{
SummaryInformation si = (SummaryInformation) ps;
// Titled aspect
trimPut(ContentModel.PROP_TITLE, si.getTitle(), destination);
trimPut(ContentModel.PROP_DESCRIPTION, si.getSubject(), destination);
// Auditable aspect
trimPut(ContentModel.PROP_CREATED, si.getCreateDateTime(), destination);
trimPut(ContentModel.PROP_MODIFIED, si.getLastSaveDateTime(), destination);
trimPut(ContentModel.PROP_AUTHOR, si.getAuthor(), destination);
putSafeRawValue(PROP_AUTHOR, si.getAuthor(), rawProperties);
putSafeRawValue(PROP_TITLE, si.getTitle(), rawProperties);
putSafeRawValue(PROP_SUBJECT, si.getSubject(), rawProperties);
putSafeRawValue(PROP_CREATE_DATETIME, si.getCreateDateTime(), rawProperties);
putSafeRawValue(PROP_LAST_SAVE_DATETIME, si.getLastSaveDateTime(), rawProperties);
}
}
catch (Exception ex)
@@ -105,5 +111,6 @@ public class OfficeMetadataExtracter extends AbstractMetadataExtracter
try { is.close(); } catch (IOException e) {}
}
}
return rawProperties;
}
}

View File

@@ -0,0 +1,14 @@
#
# OfficeMetadataExtracter - default mapping
#
# author: Derek Hulley
# Namespaces
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
# Mappings
author=cm:author
title=cm:title
subject=cm:description
createDateTime=cm:created
lastSaveDateTime=cm:modified

View File

@@ -8,13 +8,14 @@ package org.alfresco.repo.content.metadata;
*/
public class OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest
{
private MetadataExtracter extracter;
private OfficeMetadataExtracter extracter;
@Override
public void setUp() throws Exception
{
super.setUp();
extracter = new OfficeMetadataExtracter();
extracter.register();
}
/**

View File

@@ -99,7 +99,7 @@ public abstract class AbstractContentTransformerTest extends BaseSpringTest
* Helper method to load one of the "The quick brown fox" files from the
* classpath.
*
* @param extension the extension of the file required
* @param extension the extension of the file required, e.g. <b>txt</b>
* @return Returns a test resource loaded from the classpath or <tt>null</tt> if
* no resource could be found.
* @throws IOException