Merged BRANCHES/DEV/RGAUSS/4.2-CORE-CHANGES-42861 to HEAD:

42862: Creating new branch from HEAD
   43026: ALF-16403: Create the Basic Interfaces and Implementation for Metadata Embedders
        - Added MetadataEmbedder interface which guarantees an embed method responsible for writing the given metadata into a given content writer
        - Changed AbstractMappingMetadataExtracter to implement MetadataEmbedder
           * Added supportedEmbedMimetypes and constructor which takes it and supportedMimetypes as arguments
           * Added embedMapping
           * Added inheritDefaultEmbedMapping
           * Added isEmbeddingSupported
           * Added setEmbedMappingProperties
           * Added readEmbedMappingProperties for reading classname.embed.properties
           * Added setting of embedMapping in init method
           * Added checkIsEmbedSupported method
           * Added embed method which checks support for the mimetype and calls embedInternal, which implementations should override
           * Added mapSystemToRaw method, essentially a reverse of existing mapRawToSystem
           * Added getDefaultEmbedMapping method which assumes a reverse mapping of extract mapping if no explicit embed overrides are present
           * Added an empty embedInternal method (which does nothing) rather than an abstract method, to minimize changes to existing code
        - Added notion of MetadataEmbedders to MetadataExtracterRegistry
           * Added embedderCache, reusing the existing extracterCache* locks
           * Added findBestEmbedders method
           * Added getEmbedder method
   43164: ALF-16404: Create a Tika Powered Metadata Embedder
        - Added constructors for setting of supported embed types to TikaPoweredMetadataExtracter
        - Changed visibility of getInputStream to protected so subclasses can use it
        - Logging level changes in AbstractMappingMetadataExtracter
   43165: ALF-16481: Create a Content Metadata Embedder Action Executer
        - Added ContentMetadataEmbedder action executer, which looks up an embedder for the node's mimetype (if one is available) and passes the node's content reader and writer to the embedder's embed method (see the usage sketch after this change list)
        - Added embed-metadata action executer bean
        - Added embed-metadata action executer messages
   43262: ALF-16404: Create a Tika Powered Metadata Embedder
        - Updated Tika which now contains implementation of TIKA-775: Embed Capabilities
   43265: ALF-16404: Create a Tika Powered Metadata Embedder
        - Added MetadataEmbedder implementation to TikaPoweredMetadataExtracter which gets a Tika Embedder and calls its embed method
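
Not part of the changeset, but for orientation: a minimal sketch of how the pieces above fit together, assuming the caller already holds the metadataExtracterRegistry bean, the node's properties, and a ContentReader/ContentWriter pair. The class and method names below are illustrative only.

import java.io.Serializable;
import java.util.Map;

import org.alfresco.repo.content.metadata.MetadataEmbedder;
import org.alfresco.repo.content.metadata.MetadataExtracterRegistry;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.namespace.QName;

public class EmbedUsageSketch
{
    /** Looks up an embedder for the reader's mimetype and writes the given properties back into the content. */
    public static void embed(MetadataExtracterRegistry registry, Map<QName, Serializable> properties,
            ContentReader reader, ContentWriter writer)
    {
        MetadataEmbedder embedder = registry.getEmbedder(reader.getMimetype());
        if (embedder == null)
        {
            // No registered extracter supports embedding for this mimetype
            return;
        }
        embedder.embed(properties, reader, writer);
    }
}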


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@43268 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Ray Gauss
2012-10-31 14:33:09 +00:00
parent 2cb4640004
commit 918696927d
7 changed files with 974 additions and 16 deletions

View File

@@ -540,6 +540,23 @@
</property>
</bean>
<bean id="embed-metadata" class="org.alfresco.repo.action.executer.ContentMetadataEmbedder" parent="action-executer">
<property name="nodeService">
<ref bean="NodeService" />
</property>
<property name="contentService">
<ref bean="ContentService" />
</property>
<property name="metadataExtracterRegistry">
<ref bean="metadataExtracterRegistry" />
</property>
<property name="applicableTypes">
<list>
<value>{http://www.alfresco.org/model/content/1.0}content</value>
</list>
</property>
</bean>
<bean id="import" class="org.alfresco.repo.action.executer.ImporterActionExecuter" parent="action-executer"> <bean id="import" class="org.alfresco.repo.action.executer.ImporterActionExecuter" parent="action-executer">
<property name="importerService"> <property name="importerService">
<ref bean="ImporterService"/> <ref bean="ImporterService"/>

View File

@@ -148,6 +148,9 @@ import.destination.display-label=Destination
extract-metadata.title=Extract common metadata fields
extract-metadata.description=Imports title, author and description metadata fields from common content types.
embed-metadata.title=Embed properties as metadata in content
embed-metadata.description=This action attempts to embed the content's properties as metadata in the file itself
specialise-type.title=Specialise type
specialise-type.description=This will specialise the matched item to a given type.
specialise-type.type-name.display-label=Type

View File

@@ -0,0 +1,166 @@
/*
* Copyright (C) 2005-2012 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.action.executer;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.action.executer.ActionExecuterAbstractBase;
import org.alfresco.repo.content.metadata.MetadataEmbedder;
import org.alfresco.repo.content.metadata.MetadataExtracterRegistry;
import org.alfresco.service.cmr.action.Action;
import org.alfresco.service.cmr.action.ParameterDefinition;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentService;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.NodeRef;
import org.alfresco.service.cmr.repository.NodeService;
import org.alfresco.service.namespace.QName;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* Embed metadata in any content.
* <p>
* The metadata is embedded in the content from the current
* property values.
*
* @author Jesper Steen Møller, Ray Gauss II
*/
public class ContentMetadataEmbedder extends ActionExecuterAbstractBase
{
private static Log logger = LogFactory.getLog(ContentMetadataEmbedder.class);
public static final String EXECUTOR_NAME = "embed-metadata";
private NodeService nodeService;
private ContentService contentService;
private MetadataExtracterRegistry metadataExtracterRegistry;
/**
* @param nodeService the node service
*/
public void setNodeService(NodeService nodeService)
{
this.nodeService = nodeService;
}
/**
* @param contentService The contentService to set.
*/
public void setContentService(ContentService contentService)
{
this.contentService = contentService;
}
/**
* @param metadataExtracterRegistry The metadataExtracterRegistry to set.
*/
public void setMetadataExtracterRegistry(MetadataExtracterRegistry metadataExtracterRegistry)
{
this.metadataExtracterRegistry = metadataExtracterRegistry;
}
/**
* @see org.alfresco.repo.action.executer.ActionExecuter#execute(org.alfresco.service.cmr.action.Action,
*      org.alfresco.service.cmr.repository.NodeRef)
*/
public void executeImpl(Action ruleAction, NodeRef actionedUponNodeRef)
{
if (!nodeService.exists(actionedUponNodeRef))
{
// Node is gone
return;
}
ContentReader reader = contentService.getReader(actionedUponNodeRef, ContentModel.PROP_CONTENT);
// The reader may be null, e.g. for folders and the like
if (reader == null || reader.getMimetype() == null)
{
if(logger.isDebugEnabled())
{
logger.debug("no content or mimetype - do nothing");
}
// No content to extract data from
return;
}
String mimetype = reader.getMimetype();
MetadataEmbedder embedder = metadataExtracterRegistry.getEmbedder(mimetype);
if (embedder == null)
{
if(logger.isDebugEnabled())
{
logger.debug("no embedder for mimetype:" + mimetype);
}
// There is no embedder to use
return;
}
ContentWriter writer = contentService.getWriter(actionedUponNodeRef, ContentModel.PROP_CONTENT, true);
// The writer may be null, e.g. for folders and the like
if (writer == null || writer.getMimetype() == null)
{
if(logger.isDebugEnabled())
{
logger.debug("no content or mimetype - do nothing");
}
// No content to embed data in
return;
}
// Get all the node's properties
Map<QName, Serializable> nodeProperties = nodeService.getProperties(actionedUponNodeRef);
try
{
embedder.embed(nodeProperties, reader, writer);
}
catch (Throwable e)
{
// Extracters should attempt to handle all error conditions and embed
// as much as they can. If, however, one should fail, we don't want the
// action itself to fail. We absorb and report the exception here.
if (logger.isDebugEnabled())
{
logger.debug(
"Meetadata embedding failed: \n" +
" Extracter: " + this + "\n" +
" Node: " + actionedUponNodeRef + "\n" +
" Content: " + writer,
e);
}
else
{
logger.warn(
"Metadata embedding failed (turn on DEBUG for full error): \n" +
" Extracter: " + this + "\n" +
" Node: " + actionedUponNodeRef + "\n" +
" Content: " + writer + "\n" +
" Failure: " + e.getMessage());
}
}
}
@Override
protected void addParameterDefinitions(List<ParameterDefinition> arg0)
{
// None!
}
}
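
For context only (not part of this commit): a sketch of triggering this executer programmatically, assuming the standard Alfresco ActionService API and that the caller already has a reference to it.

import org.alfresco.repo.action.executer.ContentMetadataEmbedder;
import org.alfresco.service.cmr.action.Action;
import org.alfresco.service.cmr.action.ActionService;
import org.alfresco.service.cmr.repository.NodeRef;

public class EmbedActionSketch
{
    /** Creates the embed-metadata action and runs it against the given node. */
    public static void runEmbedAction(ActionService actionService, NodeRef nodeRef)
    {
        // EXECUTOR_NAME is "embed-metadata", matching the bean id registered above
        Action action = actionService.createAction(ContentMetadataEmbedder.EXECUTOR_NAME);
        actionService.executeAction(action, nodeRef);
    }
}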

View File

@@ -42,6 +42,7 @@ import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
import org.alfresco.service.cmr.dictionary.DictionaryService;
import org.alfresco.service.cmr.dictionary.PropertyDefinition;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.MimetypeService;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.cmr.repository.datatype.TypeConversionException;
@@ -92,7 +93,7 @@ import org.springframework.extensions.surf.util.ISO8601DateFormat;
* @author Jesper Steen Møller
* @author Derek Hulley
*/
abstract public class AbstractMappingMetadataExtracter implements MetadataExtracter, MetadataEmbedder
{
public static final String NAMESPACE_PROPERTY_PREFIX = "namespace.prefix.";
private static final String ERR_TYPE_CONVERSION = "metadata.extraction.err.type_conversion";
@@ -105,11 +106,14 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
private boolean initialized;
private Set<String> supportedMimetypes;
private Set<String> supportedEmbedMimetypes;
private OverwritePolicy overwritePolicy;
private boolean failOnTypeConversion;
protected Set<DateFormat> supportedDateFormats = new HashSet<DateFormat>(0);
private Map<String, Set<QName>> mapping;
private Map<QName, Set<String>> embedMapping;
private boolean inheritDefaultMapping;
private boolean inheritDefaultEmbedMapping;
/**
* Default constructor. If this is called, then {@link #isSupported(String)} should
@@ -137,10 +141,24 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
overwritePolicy = OverwritePolicy.PRAGMATIC;
failOnTypeConversion = true;
mapping = null; // The default will be fetched
embedMapping = null;
inheritDefaultMapping = false; // Any overrides are complete
inheritDefaultEmbedMapping = false;
initialized = false;
}
/**
* Constructor that can be used when the list of supported extract and embed mimetypes is known up front.
*
* @param supportedMimetypes the set of mimetypes supported for extraction by default
* @param supportedEmbedMimetypes the set of mimetypes supported for embedding by default
*/
protected AbstractMappingMetadataExtracter(Set<String> supportedMimetypes, Set<String> supportedEmbedMimetypes)
{
this(supportedMimetypes);
this.supportedEmbedMimetypes = supportedEmbedMimetypes;
}
/**
* Set the registry to register with. If this is not set, then the default
* initialization will not auto-register the extracter for general use. It
@@ -188,6 +206,17 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
this.supportedMimetypes.addAll(supportedMimetypes);
}
/**
* Set the mimetypes that are supported for embedding.
*
* @param supportedEmbedMimetypes the mimetypes to support embedding in
*/
public void setSupportedEmbedMimetypes(Collection<String> supportedEmbedMimetypes)
{
if (this.supportedEmbedMimetypes == null)
{
// Guard against the default constructor having left this null
this.supportedEmbedMimetypes = new HashSet<String>(7);
}
this.supportedEmbedMimetypes.clear();
this.supportedEmbedMimetypes.addAll(supportedEmbedMimetypes);
}
/**
* {@inheritDoc}
*
@@ -198,6 +227,20 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
return supportedMimetypes.contains(sourceMimetype);
}
/**
* {@inheritDoc}
*
* @see #setSupportedEmbedMimetypes(Collection)
*/
public boolean isEmbeddingSupported(String sourceMimetype)
{
if (supportedEmbedMimetypes == null)
{
return false;
}
return supportedEmbedMimetypes.contains(sourceMimetype);
}
/**
* TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
* @return Returns <code>1.0</code> if the mimetype is supported, otherwise <tt>0.0</tt>
@@ -308,6 +351,23 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
this.inheritDefaultMapping = inheritDefaultMapping;
}
/**
* Set if the embed property mappings augment or override the mapping generically provided by the
* extracter implementation. The default is <tt>false</tt>, i.e. any mapping set completely
* replaces the {@link #getDefaultEmbedMapping() default mappings}.
*
* @param inheritDefaultEmbedMapping <tt>true</tt> to add the configured embed mapping
* to the list of default embed mappings.
*
* @see #getDefaultEmbedMapping()
* @see #setEmbedMapping(Map)
* @see #setEmbedMappingProperties(Properties)
*/
public void setInheritDefaultEmbedMapping(boolean inheritDefaultEmbedMapping)
{
this.inheritDefaultEmbedMapping = inheritDefaultEmbedMapping;
}
/**
* Set the mapping from document metadata to system metadata. It is possible to direct
* an extracted document property to several system properties. The conversion between
@@ -321,6 +381,19 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
this.mapping = mapping;
}
/**
* Set the embed mapping from system model properties to content file metadata. It is possible to direct
* a model property to several content file metadata keys. The conversion between
* the model property types and the content file metadata key types will be done by the
* {@link org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter default converter}.
*
* @param embedMapping an embed mapping from model properties to content file metadata keys
*/
public void setEmbedMapping(Map<QName, Set<String>> embedMapping)
{
this.embedMapping = embedMapping;
}
/**
* Set the properties that contain the mapping from document metadata to system metadata.
* This is an alternative to the {@link #setMapping(Map)} method. Any mappings already
@@ -347,6 +420,32 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
mapping = readMappingProperties(mappingProperties);
}
/**
* Set the properties that contain the embed mapping from model properties to content file metadata.
* This is an alternative to the {@link #setEmbedMapping(Map)} method. Any mappings already
* present will be cleared out.
*
* The property mapping is of the form:
* <pre>
* # Namespaces prefixes
* namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
* namespace.prefix.my=http://www....com/alfresco/1.0
*
* # Mapping
* cm\:author=editor
* cm\:title=title
* cm\:summary=user1
* cm\:description=description,user2
* </pre>
* The embed mapping can therefore be from a model property onto several content file metadata properties.
*
* @param embedMappingProperties the properties that map model properties to content file metadata properties
*/
public void setEmbedMappingProperties(Properties embedMappingProperties)
{
embedMapping = readEmbedMappingProperties(embedMappingProperties);
}
/**
* Helper method for derived classes to obtain the mappings that will be applied to raw
* values. This should be called after initialization in order to guarantee the complete
@@ -373,6 +472,28 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
return Collections.unmodifiableMap(mapping);
}
/**
* Helper method for derived classes to obtain the embed mappings.
* This should be called after initialization in order to guarantee the complete
* map is given.
* <p>
* Normally, the list of properties that can be embedded in a document is fixed and
* well-known, but some implementations may have an extra, indeterminate set of values
* available for embedding. The keys in the returned map show which model properties
* will actually be written, so the metadata embedding is fully configuration-driven:
* declaring further mappings results in more values being embedded in the documents.
*/
protected final Map<QName, Set<String>> getEmbedMapping()
{
if (!initialized)
{
throw new UnsupportedOperationException("The complete embed mapping is only available after initialization.");
}
return Collections.unmodifiableMap(embedMapping);
}
/**
* A utility method to read mapping properties from a resource file and convert to the map form.
*
@@ -490,9 +611,131 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
" Mapping: " + entry); " Mapping: " + entry);
} }
} }
if (logger.isTraceEnabled())
{
logger.trace("Added mapping from " + documentProperty + " to " + qnames);
}
}
// Done
return convertedMapping;
}
/**
* A utility method to read embed mapping properties from a resource file and convert to the map form.
*
* @param propertiesUrl A standard Properties file URL location
*
* @see #setEmbedMappingProperties(Properties)
*/
protected Map<QName, Set<String>> readEmbedMappingProperties(String propertiesUrl)
{
InputStream is = null;
try
{
is = getClass().getClassLoader().getResourceAsStream(propertiesUrl);
if(is == null)
{
return null;
}
Properties props = new Properties();
props.load(is);
// Process it
Map<QName, Set<String>> map = readEmbedMappingProperties(props);
// Done
if (logger.isDebugEnabled())
{
logger.debug("Loaded embed mapping properties from resource: " + propertiesUrl);
}
return map;
}
catch (Throwable e)
{
throw new AlfrescoRuntimeException(
"Unable to load properties file to read extracter embed mapping properties: \n" +
" Extracter: " + this + "\n" +
" Bundle: " + propertiesUrl,
e);
}
finally
{
if (is != null)
{
try { is.close(); } catch (Throwable e) {}
}
}
}
/**
* A utility method to convert mapping properties to the Map form.
* <p>
* Different from readMappingProperties in that keys are the Alfresco QNames
* and values are file metadata properties.
*
* @see #setMappingProperties(Properties)
*/
protected Map<QName, Set<String>> readEmbedMappingProperties(Properties mappingProperties)
{
Map<String, String> namespacesByPrefix = new HashMap<String, String>(5);
// Get the namespaces
for (Map.Entry<Object, Object> entry : mappingProperties.entrySet())
{
String propertyName = (String) entry.getKey();
if (propertyName.startsWith(NAMESPACE_PROPERTY_PREFIX))
{
String prefix = propertyName.substring(17);
String namespace = (String) entry.getValue();
namespacesByPrefix.put(prefix, namespace);
}
}
// Create the mapping
Map<QName, Set<String>> convertedMapping = new HashMap<QName, Set<String>>(17);
for (Map.Entry<Object, Object> entry : mappingProperties.entrySet())
{
String modelProperty = (String) entry.getKey();
String metadataKeysString = (String) entry.getValue();
if (modelProperty.startsWith(NAMESPACE_PROPERTY_PREFIX))
{
// Ignore these now
continue;
}
int index = modelProperty.indexOf(QName.NAMESPACE_PREFIX);
if (index > -1 && modelProperty.charAt(0) != QName.NAMESPACE_BEGIN)
{
String prefix = modelProperty.substring(0, index);
String suffix = modelProperty.substring(index + 1);
// It is prefixed
String uri = namespacesByPrefix.get(prefix);
if (uri == null)
{
throw new AlfrescoRuntimeException(
"No prefix mapping for embed property mapping: \n" +
" Extracter: " + this + "\n" +
" Mapping: " + entry);
}
modelProperty = QName.NAMESPACE_BEGIN + uri + QName.NAMESPACE_END + suffix;
}
try
{
QName qname = QName.createQName(modelProperty);
String[] metadataKeysArray = metadataKeysString.split(",");
Set<String> metadataKeys = new HashSet<String>(metadataKeysArray.length);
for (String metadataKey : metadataKeysArray) {
metadataKeys.add(metadataKey.trim());
}
// Create the entry
convertedMapping.put(qname, metadataKeys);
}
catch (InvalidQNameException e)
{
throw new AlfrescoRuntimeException(
"Can't create metadata embedding property mapping: \n" +
" Extracter: " + this + "\n" +
" Mapping: " + entry);
}
if (logger.isTraceEnabled())
{
logger.trace("Added mapping from " + modelProperty + " to " + metadataKeysString);
}
}
// Done
@@ -560,6 +803,31 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
"There are no property mappings for the metadata extracter.\n" + "There are no property mappings for the metadata extracter.\n" +
" Nothing will be extracted by: " + this); " Nothing will be extracted by: " + this);
} }
Map<QName, Set<String>> defaultEmbedMapping = getDefaultEmbedMapping();
// Was a mapping explicitly provided
if (embedMapping == null)
{
// No mapping, so use the default
embedMapping = defaultEmbedMapping;
}
else if (inheritDefaultEmbedMapping)
{
// Merge the default mapping into the configured mapping
for (QName modelProperty : defaultEmbedMapping.keySet())
{
Set<String> metadataKeys = embedMapping.get(modelProperty);
if (metadataKeys == null)
{
metadataKeys = new HashSet<String>(3);
embedMapping.put(modelProperty, metadataKeys);
}
Set<String> defaultMetadataKeys = defaultEmbedMapping.get(modelProperty);
metadataKeys.addAll(defaultMetadataKeys);
}
}
// Done
initialized = true;
}
@@ -589,6 +857,25 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
}
}
/**
* Checks if embedding for the mimetype is supported.
*
* @param writer the writer to check
* @throws AlfrescoRuntimeException if embedding for the mimetype is not supported
*/
protected void checkIsEmbedSupported(ContentWriter writer)
{
String mimetype = writer.getMimetype();
if (!isEmbeddingSupported(mimetype))
{
throw new AlfrescoRuntimeException(
"Metadata extracter does not support embedding mimetype: \n" +
" writer: " + writer + "\n" +
" supported: " + supportedEmbedMimetypes + "\n" +
" extracter: " + this);
}
}
/**
* {@inheritDoc}
*/
@@ -731,6 +1018,102 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
return changedProperties;
}
/**
* {@inheritDoc}
*/
public final void embed(
Map<QName, Serializable> properties,
ContentReader reader,
ContentWriter writer)
{
// Done
if (logger.isDebugEnabled())
{
logger.debug("Starting metadata embedding: \n" +
" reader: " + reader + "\n" +
" writer: " + writer + "\n" +
" extracter: " + this);
}
if (!initialized)
{
throw new AlfrescoRuntimeException(
"Metadata extracter not initialized.\n" +
" Call the 'register' method on: " + this + "\n" +
" Implementations of the 'init' method must call the base implementation.");
}
// check the reliability
checkIsEmbedSupported(writer);
try
{
embedInternal(mapSystemToRaw(properties), reader, writer);
if(logger.isDebugEnabled())
{
logger.debug("Embedded Metadata into " + writer);
}
}
catch (Throwable e)
{
// Ask Tika to detect the document, and report back on if
// the current mime type is plausible
String typeErrorMessage = null;
String differentType = null;
if(mimetypeService != null)
{
differentType = mimetypeService.getMimetypeIfNotMatches(writer.getReader());
}
else
{
logger.info("Unable to verify mimetype of " + writer.getReader() +
" as no MimetypeService available to " + getClass().getName());
}
if(differentType != null)
{
typeErrorMessage = "\n" +
" claimed mime type: " + writer.getMimetype() + "\n" +
" detected mime type: " + differentType;
}
if (logger.isDebugEnabled())
{
logger.debug(
"Metadata embedding failed: \n" +
" Extracter: " + this + "\n" +
" Content: " + writer +
typeErrorMessage,
e);
}
else
{
logger.warn(
"Metadata embedding failed (turn on DEBUG for full error): \n" +
" Extracter: " + this + "\n" +
" Content: " + writer + "\n" +
" Failure: " + e.getMessage() +
typeErrorMessage);
}
}
finally
{
// check that the writer was closed (if used)
if (writer.isChannelOpen())
{
logger.error("Content writer not closed by metadata extracter: \n" +
" writer: " + writer + "\n" +
" extracter: " + this);
}
}
// Done
if (logger.isDebugEnabled())
{
logger.debug("Completed metadata embedding: \n" +
" writer: " + writer + "\n" +
" extracter: " + this);
}
}
/**
*
* @param rawMetadata Metadata keyed by document properties
@@ -766,6 +1149,41 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
return systemProperties;
}
/**
*
* @param systemMetadata Metadata keyed by system properties
* @return Returns the metadata keyed by the content file metadata properties
*/
private Map<String, Serializable> mapSystemToRaw(Map<QName, Serializable> systemMetadata)
{
Map<String, Serializable> metadataProperties = new HashMap<String, Serializable>(systemMetadata.size() * 2 + 1);
for (Map.Entry<QName, Serializable> entry : systemMetadata.entrySet())
{
QName modelProperty = entry.getKey();
// Check if there is a mapping for this
if (!embedMapping.containsKey(modelProperty))
{
// No mapping - ignore
continue;
}
Serializable documentValue = entry.getValue();
Set<String> metadataKeys = embedMapping.get(modelProperty);
for (String metadataKey : metadataKeys)
{
metadataProperties.put(metadataKey, documentValue);
}
}
// Done
if (logger.isDebugEnabled())
{
logger.debug(
"Converted system model values to metadata values: \n" +
" System Properties: " + systemMetadata + "\n" +
" Metadata Properties: " + metadataProperties);
}
return metadataProperties;
}
/**
* Filters the system properties that are going to be applied. Gives the metadata extracter an
* opportunity to remove properties that may not be appropriate in a given context.
@@ -1055,6 +1473,90 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
return readMappingProperties(propertiesUrl);
}
/**
* This method provides a <i>best guess</i> of what model properties should be embedded
* in content. The list of properties mapped by default need <b>not</b>
* include all properties to be embedded in the document; just the obvious set of mappings
* need be supplied.
* Implementations must either provide the default mapping properties in the expected
* location or override the method to provide the default mapping.
* <p>
* The default implementation looks for the default mapping file in the location
* given by the class name and <i>.embed.properties</i>. If the extracter's class is
* <b>x.y.z.MyExtracter</b> then the default properties will be picked up at
* <b>classpath:/x/y/z/MyExtracter.embed.properties</b>.
* Inner classes are supported, but the '$' in the class name is replaced with '-', so
* default properties for <b>x.y.z.MyStuff$MyExtracter</b> will be located using
* <b>x.y.z.MyStuff-MyExtracter.embed.properties</b>.
* <p>
* The default mapping implementation should include thorough Javadocs so that the
* system administrators can accurately determine how to best enhance or override the
* default mapping.
* <p>
* If the default mapping is declared in a properties file other than the one named after
* the class, then the {@link #readEmbedMappingProperties(String)} method can be used to quickly
* generate the return value:
* <pre><code>
* protected Map<QName, Set<String>> getDefaultEmbedMapping()
* {
* return readEmbedMappingProperties(DEFAULT_MAPPING);
* }
* </code></pre>
* The map can also be created in code either statically or during the call.
* <p>
* If no embed mapping properties file is found, a reverse of the extract
* mapping in {@link #getDefaultMapping()} is assumed: the first QName in each extract
* mapping value becomes the key of the embed mapping, with a last-one-wins approach for duplicates.
*
* @return Returns the default, static embed mapping. It may not be null.
*
* @see #setInheritDefaultMapping(boolean inherit)
*/
protected Map<QName, Set<String>> getDefaultEmbedMapping()
{
String className = this.getClass().getName();
// Replace $
className = className.replace('$', '-');
// Replace .
className = className.replace('.', '/');
// Append .properties
String propertiesUrl = className + ".embed.properties";
// Attempt to load the properties
Map<QName, Set<String>> embedMapping = readEmbedMappingProperties(propertiesUrl);
if (embedMapping == null)
{
if (logger.isDebugEnabled())
{
logger.debug("No explicit embed mapping properties found at: " + propertiesUrl + ", assuming reverse of extract mapping");
}
Map<String, Set<QName>> extractMapping = this.mapping;
if (extractMapping == null || extractMapping.size() == 0)
{
extractMapping = getDefaultMapping();
}
embedMapping = new HashMap<QName, Set<String>>(extractMapping.size());
for (String metadataKey : extractMapping.keySet())
{
if (extractMapping.get(metadataKey) != null && extractMapping.get(metadataKey).size() > 0)
{
QName modelProperty = extractMapping.get(metadataKey).iterator().next();
Set<String> metadataKeys = embedMapping.get(modelProperty);
if (metadataKeys == null)
{
metadataKeys = new HashSet<String>(1);
embedMapping.put(modelProperty, metadataKeys);
}
metadataKeys.add(metadataKey);
if (logger.isTraceEnabled())
{
logger.trace("Added mapping from " + modelProperty + " to " + metadataKeys.toString());
}
}
}
}
return embedMapping;
}
/**
* Override to provide the raw extracted metadata values. An extracter should extract
* as many of the available properties as is realistically possible. Even if the
@@ -1089,4 +1591,25 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
* @see #getDefaultMapping()
*/
protected abstract Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable;
/**
* Override to embed metadata values. An extracter should embed
* as many of the available properties as is realistically possible. Even if the
* {@link #getDefaultEmbedMapping() default mapping} doesn't handle all properties, it is
* possible for each instance of the extracter to be configured differently and more or
* less of the properties may be used in different installations.
*
* @param metadata the metadata keys and values to embed in the content file
* @param reader the reader for the original document. This stream provided by
* the reader must be closed if accessed directly.
* @param writer the writer for the document to embed the values in. This stream provided by
* the writer must be closed if accessed directly.
* @throws Throwable All exception conditions can be handled.
*
* @see #getDefaultEmbedMapping()
*/
protected void embedInternal(Map<String, Serializable> metadata, ContentReader reader, ContentWriter writer) throws Throwable
{
// TODO make this an abstract method once more extracters support embedding
}
}
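
For reference, and not part of the commit: a sketch of building the Map<QName, Set<String>> structure that setEmbedMapping(Map) expects, as an alternative to a classname.embed.properties file. The metadata keys used here ("title", "description", "user2") are only examples.

import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.alfresco.model.ContentModel;
import org.alfresco.service.namespace.QName;

public class EmbedMappingSketch
{
    /** Maps cm:title to the "title" key and cm:description to both "description" and "user2". */
    public static Map<QName, Set<String>> buildEmbedMapping()
    {
        Map<QName, Set<String>> embedMapping = new HashMap<QName, Set<String>>();
        embedMapping.put(ContentModel.PROP_TITLE, Collections.singleton("title"));
        Set<String> descriptionKeys = new HashSet<String>();
        descriptionKeys.add("description");
        descriptionKeys.add("user2");
        embedMapping.put(ContentModel.PROP_DESCRIPTION, descriptionKeys);
        // Pass the result to setEmbedMapping(...) on a concrete extracter before init() runs
        return embedMapping;
    }
}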

View File

@@ -0,0 +1,64 @@
/*
* Copyright (C) 2005-2012 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.Map;
import org.alfresco.repo.content.ContentWorker;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.namespace.QName;
/**
* Interface for writing metadata properties back into the content file.
*
* @author Ray Gauss II
*
*/
public interface MetadataEmbedder extends ContentWorker {
/**
* Determines if the embedder works against the given mimetype.
*
* @param mimetype the document mimetype
* @return Returns <tt>true</tt> if the mimetype is supported, otherwise <tt>false</tt>.
*/
public boolean isEmbeddingSupported(String mimetype);
/**
* Embeds the given properties into the file specified by the given content writer.
* <p>
* The embedding viability can be determined by an up-front call to
* {@link #isEmbeddingSupported(String)}.
* <p>
* The source mimetype <b>must</b> be available on the
* {@link org.alfresco.service.cmr.repository.ContentAccessor#getMimetype()} method
* of the writer.
*
* @param properties the model properties to embed
* @param reader the reader for the original source content file
* @param writer the writer for the content after metadata has been embedded
* @throws ContentIOException
*/
public void embed(Map<QName, Serializable> properties, ContentReader reader, ContentWriter writer) throws ContentIOException;
}
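
A minimal illustrative implementation of the interface (not part of this commit): it claims support for plain text only and simply copies the source content through unchanged, which is where a real embedder would write the supplied properties into the output.

import java.io.Serializable;
import java.util.Map;

import org.alfresco.repo.content.metadata.MetadataEmbedder;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.namespace.QName;

public class PassThroughTextEmbedder implements MetadataEmbedder
{
    public boolean isEmbeddingSupported(String mimetype)
    {
        return "text/plain".equals(mimetype);
    }

    public void embed(Map<QName, Serializable> properties, ContentReader reader, ContentWriter writer)
            throws ContentIOException
    {
        // A real embedder would merge the properties into the output; this skeleton just copies the content
        writer.putContent(reader.getContentInputStream());
    }
}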

View File

@@ -44,6 +44,7 @@ public class MetadataExtracterRegistry
private List<MetadataExtracter> extracters;
private Map<String, List<MetadataExtracter>> extracterCache;
private Map<String, List<MetadataEmbedder>> embedderCache;
/** Controls read access to the cache */
private Lock extracterCacheReadLock;
@@ -55,6 +56,7 @@ public class MetadataExtracterRegistry
// initialise lists
extracters = new ArrayList<MetadataExtracter>(10);
extracterCache = new HashMap<String, List<MetadataExtracter>>(17);
embedderCache = new HashMap<String, List<MetadataEmbedder>>(17);
// create lock objects for access to the cache
ReadWriteLock extractionCacheLock = new ReentrantReadWriteLock();
@@ -72,6 +74,7 @@ public class MetadataExtracterRegistry
try
{
extracterCache.clear();
embedderCache.clear();
}
finally
{
@@ -96,6 +99,7 @@ public class MetadataExtracterRegistry
{
extracters.add(extracter);
extracterCache.clear();
embedderCache.clear();
}
finally
{
@@ -186,4 +190,92 @@ public class MetadataExtracterRegistry
}
return extractors;
}
/**
* Gets the best metadata embedder. This is a combination of the most
* reliable and the most performant embedder.
* <p>
* The result is cached for quicker access next time.
*
* @param sourceMimetype the source mimetype to embed metadata in
* @return Returns a metadata embedder that can embed metadata in the
* chosen MIME type.
*/
public MetadataEmbedder getEmbedder(String sourceMimetype)
{
List<MetadataEmbedder> embedders = null;
extracterCacheReadLock.lock();
try
{
if (embedderCache.containsKey(sourceMimetype))
{
// the translation has been requested before
// it might have been null
embedders = embedderCache.get(sourceMimetype);
}
}
finally
{
extracterCacheReadLock.unlock();
}
if (embedders == null)
{
// No request has been made before
// Get a write lock on the cache
// No double check done as it is not an expensive task
extracterCacheWriteLock.lock();
try
{
// find the most suitable transformer - may be empty list
embedders = findBestEmbedders(sourceMimetype);
// store the result even if it is null
embedderCache.put(sourceMimetype, embedders);
}
finally
{
extracterCacheWriteLock.unlock();
}
}
// We have the list of embedders that supposedly work (as registered).
// Take the last one that still claims to work
MetadataEmbedder liveEmbedder = null;
for (MetadataEmbedder embedder : embedders)
{
// An embedder may dynamically become unavailable
if (!embedder.isEmbeddingSupported(sourceMimetype))
{
continue;
}
liveEmbedder = embedder;
}
return liveEmbedder;
}
/**
* @param sourceMimetype The MIME type under examination
* @return Returns a set of embedders that will work for the given mimetype
*/
private List<MetadataEmbedder> findBestEmbedders(String sourceMimetype)
{
logger.debug("Finding embedders for " + sourceMimetype);
List<MetadataEmbedder> embedders = new ArrayList<MetadataEmbedder>(1);
for (MetadataExtracter extractor : extracters)
{
if (!(extractor instanceof MetadataEmbedder))
{
continue;
}
if (!((MetadataEmbedder) extractor).isEmbeddingSupported(sourceMimetype))
{
// embedding not achievable
continue;
}
embedders.add((MetadataEmbedder)extractor);
}
return embedders;
}
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2005-2012 Alfresco Software Limited.
*
* This file is part of Alfresco
*
@@ -20,11 +20,13 @@ package org.alfresco.repo.content.metadata;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
@@ -35,8 +37,12 @@ import java.util.TimeZone;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.filestore.FileContentReader;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.cmr.repository.datatype.TypeConversionException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.embedder.Embedder;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -72,7 +78,9 @@ import org.xml.sax.SAXException;
* @since 3.4
* @author Nick Burch
*/
public abstract class TikaPoweredMetadataExtracter
extends AbstractMappingMetadataExtracter
implements MetadataEmbedder
{
protected static Log logger = LogFactory.getLog(TikaPoweredMetadataExtracter.class);
@@ -118,11 +126,19 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
public TikaPoweredMetadataExtracter(ArrayList<String> supportedMimeTypes)
{
this(new HashSet<String>(supportedMimeTypes), null);
}
public TikaPoweredMetadataExtracter(ArrayList<String> supportedMimeTypes, ArrayList<String> supportedEmbedMimeTypes)
{
this(new HashSet<String>(supportedMimeTypes), new HashSet<String>(supportedEmbedMimeTypes));
} }
public TikaPoweredMetadataExtracter(HashSet<String> supportedMimeTypes)
{
this(supportedMimeTypes, null);
}
public TikaPoweredMetadataExtracter(HashSet<String> supportedMimeTypes, HashSet<String> supportedEmbedMimeTypes)
{
super(supportedMimeTypes, supportedEmbedMimeTypes);
// TODO Once TIKA-451 is fixed this list will get nicer
this.tikaDateFormats = new DateFormat[] {
@@ -188,6 +204,18 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
*/
protected abstract Parser getParser();
/**
* Returns the Tika Embedder to modify
* the document.
*
* @return the Tika embedder
*/
protected Embedder getEmbedder()
{
// TODO make this an abstract method once more extracters support embedding
return null;
}
/**
* Do we care about the contents of the
* extracted header, or nothing at all?
@@ -215,7 +243,7 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
* For these cases, buffer out to a local file if not
* already there
*/
protected InputStream getInputStream(ContentReader reader) throws IOException {
// Prefer the File if available, it's generally quicker
if(reader instanceof FileContentReader)
{
@@ -338,6 +366,71 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
return rawProperties;
}
@Override
protected void embedInternal(Map<String, Serializable> properties, ContentReader reader, ContentWriter writer) throws Throwable
{
Embedder embedder = getEmbedder();
if (embedder == null)
{
return;
}
InputStream inputStream = null;
OutputStream outputStream = null;
try
{
Metadata metadataToEmbed = new Metadata();
for (String metadataKey : properties.keySet())
{
Serializable value = properties.get(metadataKey);
if (value == null)
{
continue;
}
if (value instanceof Collection<?>)
{
for (Object singleValue : (Collection<?>) value)
{
try
{
// Convert to a string value for Tika
metadataToEmbed.add(metadataKey, DefaultTypeConverter.INSTANCE.convert(String.class, singleValue));
}
catch (TypeConversionException e)
{
logger.info("Could not convert " + metadataKey + ": " + e.getMessage());
}
}
}
else
{
try
{
// Convert to a string value for Tika
metadataToEmbed.add(metadataKey, DefaultTypeConverter.INSTANCE.convert(String.class, value));
}
catch (TypeConversionException e)
{
logger.info("Could not convert " + metadataKey + ": " + e.getMessage());
}
}
}
inputStream = getInputStream(reader);
outputStream = writer.getContentOutputStream();
embedder.embed(metadataToEmbed, inputStream, outputStream, null);
}
catch (Exception e)
{
logger.error(e.getMessage(), e);
}
finally
{
// Release both streams even if embedding fails
if (inputStream != null)
{
try { inputStream.close(); } catch (Throwable e) {}
}
if (outputStream != null)
{
try { outputStream.close(); } catch (Throwable e) {}
}
}
}
/**
* This content handler will capture entries from within
* the header of the Tika content XHTML, but ignore the