> mapping)
{
this.mapping = mapping;
}
/**
* Set the properties that contain the mapping from document metadata to system metadata.
* This is an alternative to the {@link #setMapping(Map)} method. Any mappings already
* present will be cleared out.
*
* The property mapping is of the form:
* <pre>
* # Namespaces prefixes
* namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
* namespace.prefix.my=http://www....com/alfresco/1.0
*
* # Mapping
* editor=cm:author, my:editor
* title=cm:title
* user1=cm:summary
* user2=cm:description
* </pre>
* The mapping can therefore be from a single document property onto several system properties.
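*
* For example, the same mapping could be constructed in code and passed in
* (an illustrative sketch; the extracter instance and property values are assumptions):
* <pre>
* Properties mappingProperties = new Properties();
* mappingProperties.setProperty("namespace.prefix.cm", "http://www.alfresco.org/model/content/1.0");
* mappingProperties.setProperty("title", "cm:title");
* extracter.setMappingProperties(mappingProperties);
* </pre>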
*
* @param mappingProperties the properties that map document properties to system properties
*/
public void setMappingProperties(Properties mappingProperties)
{
mapping = readMappingProperties(mappingProperties);
}
/**
* Helper method for derived classes to obtain the mappings that will be applied to raw
* values. This should be called after initialization in order to guarantee the complete
* map is given.
*
* Normally, the list of properties that can be extracted from a document is fixed and
* well-known - in that case, just extract everything. But some implementations may have
* an extra, indeterminate set of values available for extraction. If the extraction of
* these runtime parameters is expensive, then the keys provided by the return value can
* be used to extract values from the documents. The metadata extraction becomes fully
* configuration-driven, i.e. declaring further mappings will result in more values being
* extracted from the documents.
*
* Most extractors will not be using this method. For an example of its use, see the
* {@linkplain OpenDocumentMetadataExtracter OpenDocument extractor}, which uses the mapping
* to select specific user properties from a document.
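*
* A sketch of such a use (illustrative only, not the OpenDocument implementation itself):
* <pre>
* for (String documentKey : getMapping().keySet())
* {
*     // extract only the values that are actually mapped
* }
* </pre>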
*/
protected final Map<String, Set<QName>> getMapping()
{
if (!initialized)
{
throw new UnsupportedOperationException("The complete mapping is only available after initialization.");
}
return Collections.unmodifiableMap(mapping);
}
/**
* A utility method to read mapping properties from a resource file and convert to the map form.
*
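* For example (an illustrative resource path; the actual location is up to the caller):
* <pre>
* Map<String, Set<QName>> mapping = readMappingProperties("alfresco/metadata/MyExtracter.properties");
* </pre>
*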
* @param propertiesUrl A standard Properties file URL location
*
* @see #setMappingProperties(Properties)
*/
protected Map<String, Set<QName>> readMappingProperties(String propertiesUrl)
{
InputStream is = null;
try
{
is = getClass().getClassLoader().getResourceAsStream(propertiesUrl);
if (is == null)
{
throw new AlfrescoRuntimeException(
"Metadata Extracter mapping properties not found: \n" +
" Extracter: " + this + "\n" +
" Bundle: " + propertiesUrl);
}
Properties props = new Properties();
props.load(is);
// Process it
Map<String, Set<QName>> map = readMappingProperties(props);
// Done
if (logger.isDebugEnabled())
{
logger.debug("Loaded mapping properties from resource: " + propertiesUrl);
}
return map;
}
catch (Throwable e)
{
throw new AlfrescoRuntimeException(
"Unable to load properties file to read extracter mapping properties: \n" +
" Extracter: " + this + "\n" +
" Bundle: " + propertiesUrl,
e);
}
finally
{
if (is != null)
{
try { is.close(); } catch (Throwable e) {}
}
}
}
/**
* A utility method to convert mapping properties to the Map form.
*
* @see #setMappingProperties(Properties)
*/
@SuppressWarnings("unchecked")
protected Map<String, Set<QName>> readMappingProperties(Properties mappingProperties)
{
Map<String, String> namespacesByPrefix = new HashMap<String, String>(5);
// Get the namespaces
for (Map.Entry<Object, Object> entry : mappingProperties.entrySet())
{
String propertyName = (String) entry.getKey();
if (propertyName.startsWith(NAMESPACE_PROPERTY_PREFIX))
{
String prefix = propertyName.substring(NAMESPACE_PROPERTY_PREFIX.length());
String namespace = (String) entry.getValue();
namespacesByPrefix.put(prefix, namespace);
}
}
// Create the mapping
Map<String, Set<QName>> convertedMapping = new HashMap<String, Set<QName>>(17);
for (Map.Entry<Object, Object> entry : mappingProperties.entrySet())
{
String documentProperty = (String) entry.getKey();
String qnamesStr = (String) entry.getValue();
if (documentProperty.startsWith(NAMESPACE_PROPERTY_PREFIX))
{
// Ignore these now
continue;
}
// Create the entry
Set<QName> qnames = new HashSet<QName>(3);
convertedMapping.put(documentProperty, qnames);
// The to value can be a list of QNames
StringTokenizer tokenizer = new StringTokenizer(qnamesStr, ",");
while (tokenizer.hasMoreTokens())
{
String qnameStr = tokenizer.nextToken().trim();
// Check if we need to resolve a namespace reference
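// For example, given "namespace.prefix.cm=http://www.alfresco.org/model/content/1.0",
// the token "cm:title" resolves to "{http://www.alfresco.org/model/content/1.0}title"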
int index = qnameStr.indexOf(QName.NAMESPACE_PREFIX);
if (index > -1 && qnameStr.charAt(0) != QName.NAMESPACE_BEGIN)
{
String prefix = qnameStr.substring(0, index);
String suffix = qnameStr.substring(index + 1);
// It is prefixed
String uri = namespacesByPrefix.get(prefix);
if (uri == null)
{
throw new AlfrescoRuntimeException(
"No prefix mapping for extracter property mapping: \n" +
" Extracter: " + this + "\n" +
" Mapping: " + entry);
}
qnameStr = QName.NAMESPACE_BEGIN + uri + QName.NAMESPACE_END + suffix;
}
try
{
QName qname = QName.createQName(qnameStr);
// Add it to the mapping
qnames.add(qname);
}
catch (InvalidQNameException e)
{
throw new AlfrescoRuntimeException(
"Can't create metadata extracter property mapping: \n" +
" Extracter: " + this + "\n" +
" Mapping: " + entry,
e);
}
}
if (logger.isDebugEnabled())
{
logger.debug("Added mapping from " + documentProperty + " to " + qnames);
}
}
// Done
return convertedMapping;
}
/**
* Registers this instance of the extracter with the registry. This will call the
* {@link #init()} method and then register if the registry is available.
*
* @see #setRegistry(MetadataExtracterRegistry)
* @see #init()
*/
public final void register()
{
init();
// Register the extracter, if necessary
if (registry != null)
{
registry.register(this);
}
}
/**
* Provides a hook point for implementations to perform initialization. The base
* implementation must be invoked or the extracter will fail during extraction.
* The {@link #getDefaultMapping() default mappings} will be requested during
* initialization.
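*
* For example, an overriding implementation typically ends by invoking this base
* implementation (an illustrative sketch):
* <pre>
* protected void init()
* {
*     // implementation-specific setup here
*     super.init();
* }
* </pre>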
*/
protected void init()
{
Map<String, Set<QName>> defaultMapping = getDefaultMapping();
if (defaultMapping == null)
{
throw new AlfrescoRuntimeException("The metadata extracter must provide a default mapping: " + this);
}
// Was a mapping explicitly provided
if (mapping == null)
{
// No mapping, so use the default
mapping = defaultMapping;
}
else if (inheritDefaultMapping)
{
// Merge the default mapping into the configured mapping
for (String documentKey : defaultMapping.keySet())
{
Set<QName> systemQNames = mapping.get(documentKey);
if (systemQNames == null)
{
systemQNames = new HashSet<QName>(3);
mapping.put(documentKey, systemQNames);
}
Set<QName> defaultQNames = defaultMapping.get(documentKey);
systemQNames.addAll(defaultQNames);
}
}
// The configured mappings are empty, but there were default mappings
if (mapping.isEmpty() && !defaultMapping.isEmpty())
{
logger.warn(
"There are no property mappings for the metadata extracter.\n" +
" Nothing will be extracted by: " + this);
}
// Done
initialized = true;
}
/** {@inheritDoc} */
public long getExtractionTime()
{
return 1000L;
}
/**
* Checks if the mimetype is supported.
*
* @param reader the reader to check
* @throws AlfrescoRuntimeException if the mimetype is not supported
*/
protected void checkIsSupported(ContentReader reader)
{
String mimetype = reader.getMimetype();
if (!isSupported(mimetype))
{
throw new AlfrescoRuntimeException(
"Metadata extracter does not support mimetype: \n" +
" reader: " + reader + "\n" +
" supported: " + supportedMimetypes + "\n" +
" extracter: " + this);
}
}
/**
* {@inheritDoc}
*/
public final Map<QName, Serializable> extract(ContentReader reader, Map<QName, Serializable> destination)
{
return extract(reader, this.overwritePolicy, destination, this.mapping);
}
/**
* {@inheritDoc}
*/
public final Map<QName, Serializable> extract(
ContentReader reader,
OverwritePolicy overwritePolicy,
Map<QName, Serializable> destination)
{
return extract(reader, overwritePolicy, destination, this.mapping);
}
/**
* {@inheritDoc}
*/
public Map<QName, Serializable> extract(
ContentReader reader,
OverwritePolicy overwritePolicy,
Map<QName, Serializable> destination,
Map<String, Set<QName>> mapping)
{
if (logger.isDebugEnabled())
{
logger.debug("Starting metadata extraction: \n" +
" reader: " + reader + "\n" +
" extracter: " + this);
}
if (!initialized)
{
throw new AlfrescoRuntimeException(
"Metadata extracter not initialized.\n" +
" Call the 'register' method on: " + this + "\n" +
" Implementations of the 'init' method must call the base implementation.");
}
// check the reliability
checkIsSupported(reader);
Map<QName, Serializable> changedProperties = null;
try
{
Map<String, Serializable> rawMetadata = null;
// Check that the content has some meat
if (reader.getSize() > 0 && reader.exists())
{
rawMetadata = extractRaw(reader);
}
else
{
rawMetadata = new HashMap<String, Serializable>(1);
}
// Convert to system properties (standalone)
Map<QName, Serializable> systemProperties = mapRawToSystem(rawMetadata);
// Convert the properties according to the dictionary types
systemProperties = convertSystemPropertyValues(systemProperties);
// Last chance to filter the system properties map before applying them
filterSystemProperties(systemProperties, destination);
// Now use the proper overwrite policy
changedProperties = overwritePolicy.applyProperties(systemProperties, destination);
}
catch (Throwable e)
{
// Ask Tika to detect the document, and report back on whether
// the claimed mime type is plausible
String typeErrorMessage = "";
String differentType = mimetypeService.getMimetypeIfNotMatches(reader.getReader());
if (differentType != null)
{
typeErrorMessage = "\n" +
" claimed mime type: " + reader.getMimetype() + "\n" +
" detected mime type: " + differentType;
}
if (logger.isDebugEnabled())
{
logger.debug(
"Metadata extraction failed: \n" +
" Extracter: " + this + "\n" +
" Content: " + reader +
typeErrorMessage,
e);
}
else
{
logger.warn(
"Metadata extraction failed (turn on DEBUG for full error): \n" +
" Extracter: " + this + "\n" +
" Content: " + reader + "\n" +
" Failure: " + e.getMessage() +
typeErrorMessage);
}
}
finally
{
// check that the reader was closed (if used)
if (reader.isChannelOpen())
{
logger.error("Content reader not closed by metadata extracter: \n" +
" reader: " + reader + "\n" +
" extracter: " + this);
}
// Make sure that we have something to return
if (changedProperties == null)
{
changedProperties = new HashMap<QName, Serializable>(0);
}
}
// Done
if (logger.isDebugEnabled())
{
logger.debug("Completed metadata extraction: \n" +
" reader: " + reader + "\n" +
" extracter: " + this + "\n" +
" changed: " + changedProperties);
}
return changedProperties;
}
/**
* Converts the raw metadata, keyed by document property name, to system metadata
* keyed by the mapped system property {@link QName}s.
*
* @param rawMetadata Metadata keyed by document properties
* @return Returns the metadata keyed by the system properties
*/
private Map<QName, Serializable> mapRawToSystem(Map<String, Serializable> rawMetadata)
{
Map<QName, Serializable> systemProperties = new HashMap<QName, Serializable>(rawMetadata.size() * 2 + 1);
for (Map.Entry<String, Serializable> entry : rawMetadata.entrySet())
{
String documentKey = entry.getKey();
// Check if there is a mapping for this
if (!mapping.containsKey(documentKey))
{
// No mapping - ignore
continue;
}
Serializable documentValue = entry.getValue();
Set<QName> systemQNames = mapping.get(documentKey);
for (QName systemQName : systemQNames)
{
systemProperties.put(systemQName, documentValue);
}
}
// Done
if (logger.isDebugEnabled())
{
logger.debug(
"Converted extracted raw values to system values: \n" +
" Raw Properties: " + rawMetadata + "\n" +
" System Properties: " + systemProperties);
}
return systemProperties;
}
/**
* Filters the system properties that are going to be applied. Gives the metadata extracter an
* opportunity to remove properties that may not be appropriate in a given context.
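*
* For example, an override might suppress a property that the target already carries
* (an illustrative sketch; ContentModel.PROP_DESCRIPTION is just an example property):
* <pre>
* if (targetProperties.containsKey(ContentModel.PROP_DESCRIPTION))
* {
*     systemProperties.remove(ContentModel.PROP_DESCRIPTION);
* }
* </pre>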
*
* @param systemProperties map of system properties to be applied
* @param targetProperties map of target properties; may be used to provide the required context
*/
protected void filterSystemProperties(Map<QName, Serializable> systemProperties, Map<QName, Serializable> targetProperties)
{
// Default implementation does nothing
}
/**
* Converts all values according to their dictionary-defined type. This uses the
* {@link #setFailOnTypeConversion(boolean) failOnTypeConversion flag} to determine how failures
* are handled i.e. if values fail to convert, the process may discard the property.
*
* @param systemProperties the values keyed to system property names
* @return Returns a modified map of properties that have been converted.
*/
@SuppressWarnings("unchecked")
private Map<QName, Serializable> convertSystemPropertyValues(Map<QName, Serializable> systemProperties)
{
Map<QName, Serializable> convertedProperties = new HashMap<QName, Serializable>(systemProperties.size() + 7);
for (Map.Entry<QName, Serializable> entry : systemProperties.entrySet())
{
QName propertyQName = entry.getKey();
Serializable propertyValue = entry.getValue();
// Get the property definition
PropertyDefinition propertyDef = (dictionaryService == null) ? null : dictionaryService.getProperty(propertyQName);
if (propertyDef == null)
{
// There is nothing in the DD about this so just transfer it
convertedProperties.put(propertyQName, propertyValue);
continue;
}
// It is in the DD, so attempt the conversion
DataTypeDefinition propertyTypeDef = propertyDef.getDataType();
Serializable convertedPropertyValue = null;
try
{
// Attempt to make any date conversions
if (propertyTypeDef.getName().equals(DataTypeDefinition.DATE) || propertyTypeDef.getName().equals(DataTypeDefinition.DATETIME))
{
if (propertyValue instanceof Date)
{
convertedPropertyValue = propertyValue;
}
else if (propertyValue instanceof Collection)
{
convertedPropertyValue = (Serializable) makeDates((Collection<String>) propertyValue);
}
else if (propertyValue instanceof String)
{
convertedPropertyValue = makeDate((String) propertyValue);
}
else
{
if (logger.isWarnEnabled())
{
StringBuilder mesg = new StringBuilder();
mesg.append("Unable to convert Date property: ").append(propertyQName)
.append(", value: ").append(propertyValue).append(", type: ").append(propertyTypeDef.getName());
logger.warn(mesg.toString());
}
}
}
else
{
if (propertyValue instanceof Collection)
{
convertedPropertyValue = (Serializable) DefaultTypeConverter.INSTANCE.convert(
propertyTypeDef,
(Collection<?>) propertyValue);
}
else if (propertyValue instanceof Object[])
{
convertedPropertyValue = (Serializable) DefaultTypeConverter.INSTANCE.convert(
propertyTypeDef,
(Object[]) propertyValue);
}
else
{
convertedPropertyValue = (Serializable) DefaultTypeConverter.INSTANCE.convert(
propertyTypeDef,
propertyValue);
}
}
convertedProperties.put(propertyQName, convertedPropertyValue);
}
catch (TypeConversionException e)
{
// Do we just absorb this or is it a problem?
if (failOnTypeConversion)
{
throw AlfrescoRuntimeException.create(
e,
ERR_TYPE_CONVERSION,
this,
propertyQName,
propertyTypeDef.getName(),
propertyValue);
}
}
}
// Done
return convertedProperties;
}
/**
* Convert a collection of date String to Date objects
*/
private Collection<Date> makeDates(Collection<String> dateStrs)
{
List<Date> dates = new ArrayList<Date>(dateStrs.size());
for (String dateStr : dateStrs)
{
Date date = makeDate(dateStr);
dates.add(date);
}
return dates;
}
/**
* Convert a date String to a Date object
*/
protected Date makeDate(String dateStr)
{
if (dateStr == null || dateStr.length() == 0)
{
return null;
}
Date date = null;
try
{
date = DefaultTypeConverter.INSTANCE.convert(Date.class, dateStr);
}
catch (TypeConversionException e)
{
// Try one of the other formats
for (DateFormat df : this.supportedDateFormats)
{
try
{
date = df.parse(dateStr);
// Parsed successfully; don't let a later format overwrite the value
break;
}
catch (ParseException ee)
{
// This format didn't match - try the next one
}
}
if (date == null)
{
// Still no luck
throw new TypeConversionException("Unable to convert string to date: " + dateStr);
}
}
return date;
}
/**
* Adds a value to the map, conserving null values. Values are converted to null if:
* <ul>
*   <li>it is an empty string value after trimming</li>
*   <li>it is an empty collection</li>
*   <li>it is an empty array</li>
* </ul>
* String values are trimmed before being put into the map.
* Otherwise, it is up to the extracter to ensure that the value is a Serializable.
* It is not appropriate to implicitly convert values in order to make them Serializable
* - the best conversion method will depend on the value's specific meaning.
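*
* For example, from within an {@link #extractRaw(ContentReader)} implementation
* (an illustrative sketch; the document property names and values are assumptions):
* <pre>
* Map<String, Serializable> rawMap = newRawMap();
* putRawValue("title", "Some Title", rawMap);
* putRawValue("editor", "Some Editor", rawMap);
* </pre>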
*
* @param key the destination key
* @param value the serializable value
* @param destination the map to put values into
* @return Returns true if the value was put into the map; this implementation always puts the value (possibly null) and returns true
*/
@SuppressWarnings("unchecked")
protected boolean putRawValue(String key, Serializable value, Map<String, Serializable> destination)
{
if (value == null)
{
// Just keep this
}
else if (value instanceof String)
{
String valueStr = ((String) value).trim();
if (valueStr.length() == 0)
{
value = null;
}
else
{
// Keep the trimmed value
value = valueStr;
}
}
else if (value instanceof Collection)
{
Collection<?> valueCollection = (Collection<?>) value;
if (valueCollection.isEmpty())
{
value = null;
}
}
else if (value.getClass().isArray())
{
if (Array.getLength(value) == 0)
{
value = null;
}
}
// Put the value, which may have been trimmed or converted to null
destination.put(key, value);
return true;
}
/**
* Helper method to fetch a clean map into which raw values can be dumped.
*
* @return Returns an empty map
*/
protected final Map<String, Serializable> newRawMap()
{
return new HashMap<String, Serializable>(17);
}
/**
* This method provides a best guess of where to store the values extracted
* from the documents. The list of properties mapped by default need not
* include all properties extracted from the document; just the obvious set of mappings
* need be supplied.
* Implementations must either provide the default mapping properties in the expected
* location or override the method to provide the default mapping.
*
* The default implementation looks for the default mapping file in the location
* given by the class name and .properties. If the extracter's class is
* x.y.z.MyExtracter then the default properties will be picked up at
* classpath:/x/y/z/MyExtracter.properties.
* Inner classes are supported, but the '$' in the class name is replaced with '-', so
* default properties for x.y.z.MyStuff$MyExtracter will be located using
* x.y.z.MyStuff-MyExtracter.properties.
*
* The default mapping implementation should include thorough Javadocs so that the
* system administrators can accurately determine how to best enhance or override the
* default mapping.
*
* If the default mapping is declared in a properties file other than the one named after
* the class, then the {@link #readMappingProperties(String)} method can be used to quickly
* generate the return value:
*
* <pre>
* protected Map<String, Set<QName>> getDefaultMapping()
* {
*     return readMappingProperties(DEFAULT_MAPPING);
* }
* </pre>
*
* The map can also be created in code either statically or during the call.
*
* @return Returns the default, static mapping. It may not be null.
*
* @see #setInheritDefaultMapping(boolean inherit)
*/
protected Map<String, Set<QName>> getDefaultMapping()
{
String className = this.getClass().getName();
// Replace $
className = className.replace('$', '-');
// Replace .
className = className.replace('.', '/');
// Append .properties
String propertiesUrl = className + ".properties";
// Attempt to load the properties
return readMappingProperties(propertiesUrl);
}
/**
* Override to provide the raw extracted metadata values. An extracter should extract
* as many of the available properties as is realistically possible. Even if the
* {@link #getDefaultMapping() default mapping} doesn't handle all properties, it is
* possible for each instance of the extracter to be configured differently and more or
* less of the properties may be used in different installations.
*
* Raw values must not be trimmed or removed for any reason. Null values and empty
* strings are handled as follows:
* <ul>
*   <li>Null: Removed</li>
*   <li>Empty String: Passed to the {@link OverwritePolicy}</li>
*   <li>Non Serializable: Converted to String or fails if that is not possible</li>
* </ul>
*
*
* Properties extracted and their meanings and types should be thoroughly described in
* the class-level javadocs of the extracter implementation, for example:
* <pre>
* editor: - the document editor        --> cm:author
* title:  - the document title         --> cm:title
* user1:  - the document summary
* user2:  - the document description   --> cm:description
* user3:  -
* user4:  -
* </pre>
*
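* A minimal override might look like the following (an illustrative sketch only;
* real implementations read the values from the document format at hand):
* <pre>
* protected Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
* {
*     Map<String, Serializable> rawProperties = newRawMap();
*     // read the document via the reader and extract the available values
*     putRawValue("title", "Some Title", rawProperties);
*     return rawProperties;
* }
* </pre>
*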
*
* @param reader the document to extract the values from. This stream provided by
* the reader must be closed if accessed directly.
* @return Returns a map of document property values keyed by property name.
* @throws Throwable All exception conditions can be handled.
*
* @see #getDefaultMapping()
*/
protected abstract Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable;
}