diff --git a/config/alfresco/action-services-context.xml b/config/alfresco/action-services-context.xml index 7bbe04d743..249ebbebb0 100644 --- a/config/alfresco/action-services-context.xml +++ b/config/alfresco/action-services-context.xml @@ -539,6 +539,23 @@ true + + + + + + + + + + + + + + {http://www.alfresco.org/model/content/1.0}content + + + diff --git a/config/alfresco/messages/action-config.properties b/config/alfresco/messages/action-config.properties index 8ac3011581..c577fc88db 100644 --- a/config/alfresco/messages/action-config.properties +++ b/config/alfresco/messages/action-config.properties @@ -148,6 +148,9 @@ import.destination.display-label=Destination extract-metadata.title=Extract common metadata fields extract-metadata.description=Imports title, author and description metadata fields from common content types. +embed-metadata.title=Embed properties as metadata in content +embed-metadata.description=This action attempts to embed the content's properties as metadata in the file itself + specialise-type.title=Specialise type specialise-type.description=This will specialise the matched item to a given type. specialise-type.type-name.display-label=Type diff --git a/source/java/org/alfresco/repo/action/executer/ContentMetadataEmbedder.java b/source/java/org/alfresco/repo/action/executer/ContentMetadataEmbedder.java new file mode 100644 index 0000000000..be59ff45be --- /dev/null +++ b/source/java/org/alfresco/repo/action/executer/ContentMetadataEmbedder.java @@ -0,0 +1,166 @@ +/* + * Copyright (C) 2005-2012 Alfresco Software Limited. + * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see . + */ +package org.alfresco.repo.action.executer; + +import java.io.Serializable; +import java.util.List; +import java.util.Map; + +import org.alfresco.model.ContentModel; +import org.alfresco.repo.action.executer.ActionExecuterAbstractBase; +import org.alfresco.repo.content.metadata.MetadataEmbedder; +import org.alfresco.repo.content.metadata.MetadataExtracterRegistry; +import org.alfresco.service.cmr.action.Action; +import org.alfresco.service.cmr.action.ParameterDefinition; +import org.alfresco.service.cmr.repository.ContentReader; +import org.alfresco.service.cmr.repository.ContentService; +import org.alfresco.service.cmr.repository.ContentWriter; +import org.alfresco.service.cmr.repository.NodeRef; +import org.alfresco.service.cmr.repository.NodeService; +import org.alfresco.service.namespace.QName; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * Embed metadata in any content. + *

+ * The metadata is embedded in the content from the current + * property values. + * + * @author Jesper Steen Møller, Ray Gauss II + */ +public class ContentMetadataEmbedder extends ActionExecuterAbstractBase +{ + private static Log logger = LogFactory.getLog(ContentMetadataEmbedder.class); + + public static final String EXECUTOR_NAME = "embed-metadata"; + + private NodeService nodeService; + private ContentService contentService; + private MetadataExtracterRegistry metadataExtracterRegistry; + + /** + * @param nodeService the node service + */ + public void setNodeService(NodeService nodeService) + { + this.nodeService = nodeService; + } + + /** + * @param contentService The contentService to set. + */ + public void setContentService(ContentService contentService) + { + this.contentService = contentService; + } + + /** + * @param metadataExtracterRegistry The metadataExtracterRegistry to set. + */ + public void setMetadataExtracterRegistry(MetadataExtracterRegistry metadataExtracterRegistry) + { + this.metadataExtracterRegistry = metadataExtracterRegistry; + } + + /** + * @see org.alfresco.repo.action.executer.ActionExecuter#execute(org.alfresco.service.cmr.repository.NodeRef, + * NodeRef) + */ + public void executeImpl(Action ruleAction, NodeRef actionedUponNodeRef) + { + if (!nodeService.exists(actionedUponNodeRef)) + { + // Node is gone + return; + } + ContentReader reader = contentService.getReader(actionedUponNodeRef, ContentModel.PROP_CONTENT); + // The reader may be null, e.g. 
for folders and the like + if (reader == null || reader.getMimetype() == null) + { + if(logger.isDebugEnabled()) + { + logger.debug("no content or mimetype - do nothing"); + } + // No content to extract data from + return; + } + String mimetype = reader.getMimetype(); + MetadataEmbedder embedder = metadataExtracterRegistry.getEmbedder(mimetype); + if (embedder == null) + { + if(logger.isDebugEnabled()) + { + logger.debug("no embedder for mimetype:" + mimetype); + } + // There is no embedder to use + return; + } + + ContentWriter writer = contentService.getWriter(actionedUponNodeRef, ContentModel.PROP_CONTENT, true); + // The writer may be null, e.g. for folders and the like + if (writer == null || writer.getMimetype() == null) + { + if(logger.isDebugEnabled()) + { + logger.debug("no content or mimetype - do nothing"); + } + // No content to embed data in + return; + } + + // Get all the node's properties + Map nodeProperties = nodeService.getProperties(actionedUponNodeRef); + + try + { + embedder.embed(nodeProperties, reader, writer); + } + catch (Throwable e) + { + // Extracters should attempt to handle all error conditions and embed + // as much as they can. If, however, one should fail, we don't want the + // action itself to fail. We absorb and report the exception here. + if (logger.isDebugEnabled()) + { + logger.debug( + "Metadata embedding failed: \n" + + " Extracter: " + this + "\n" + + " Node: " + actionedUponNodeRef + "\n" + + " Content: " + writer, + e); + } + else + { + logger.warn( + "Metadata embedding failed (turn on DEBUG for full error): \n" + + " Extracter: " + this + "\n" + + " Node: " + actionedUponNodeRef + "\n" + + " Content: " + writer + "\n" + + " Failure: " + e.getMessage()); + } + } + } + + @Override + protected void addParameterDefinitions(List arg0) + { + // None! 
+ } +} diff --git a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java index dc14a650a7..959524e234 100644 --- a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java @@ -42,6 +42,7 @@ import org.alfresco.service.cmr.dictionary.DataTypeDefinition; import org.alfresco.service.cmr.dictionary.DictionaryService; import org.alfresco.service.cmr.dictionary.PropertyDefinition; import org.alfresco.service.cmr.repository.ContentReader; +import org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.MimetypeService; import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; import org.alfresco.service.cmr.repository.datatype.TypeConversionException; @@ -92,7 +93,7 @@ import org.springframework.extensions.surf.util.ISO8601DateFormat; * @author Jesper Steen Møller * @author Derek Hulley */ -abstract public class AbstractMappingMetadataExtracter implements MetadataExtracter +abstract public class AbstractMappingMetadataExtracter implements MetadataExtracter, MetadataEmbedder { public static final String NAMESPACE_PROPERTY_PREFIX = "namespace.prefix."; private static final String ERR_TYPE_CONVERSION = "metadata.extraction.err.type_conversion"; @@ -105,11 +106,14 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac private boolean initialized; private Set supportedMimetypes; + private Set supportedEmbedMimetypes; private OverwritePolicy overwritePolicy; private boolean failOnTypeConversion; protected Set supportedDateFormats = new HashSet(0); private Map> mapping; + private Map> embedMapping; private boolean inheritDefaultMapping; + private boolean inheritDefaultEmbedMapping; /** * Default constructor. 
If this is called, then {@link #isSupported(String)} should @@ -137,10 +141,24 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac overwritePolicy = OverwritePolicy.PRAGMATIC; failOnTypeConversion = true; mapping = null; // The default will be fetched + embedMapping = null; inheritDefaultMapping = false; // Any overrides are complete + inheritDefaultEmbedMapping = false; initialized = false; } + /** + * Constructor that can be used when the list of supported extract and embed mimetypes is known up front. + * + * @param supportedMimetypes the set of mimetypes supported for extraction by default + * @param supportedEmbedMimetypes the set of mimetypes supported for embedding by default + */ + protected AbstractMappingMetadataExtracter(Set supportedMimetypes, Set supportedEmbedMimetypes) + { + this(supportedMimetypes); + this.supportedEmbedMimetypes = supportedEmbedMimetypes; + } + /** * Set the registry to register with. If this is not set, then the default * initialization will not auto-register the extracter for general use. It @@ -187,7 +205,18 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac this.supportedMimetypes.clear(); this.supportedMimetypes.addAll(supportedMimetypes); } - + + /** + * Set the mimetypes that are supported for embedding. 
+ * + * @param supportedEmbedMimetypes the mimetypes for which embedding is supported; replaces any previously configured set + */ + public void setSupportedEmbedMimetypes(Collection supportedEmbedMimetypes) + { + // The field may still be null (only the two-argument constructor assigns it), so store a fresh copy instead of clear()/addAll() + this.supportedEmbedMimetypes = new HashSet<String>(supportedEmbedMimetypes); + } + /** * {@inheritDoc} * @@ -197,7 +226,21 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac { return supportedMimetypes.contains(sourceMimetype); } - + + /** + * {@inheritDoc} + * + * @see #setSupportedEmbedMimetypes(Collection) + */ + public boolean isEmbeddingSupported(String sourceMimetype) + { + if (supportedEmbedMimetypes == null) + { + return false; + } + return supportedEmbedMimetypes.contains(sourceMimetype); + } + /** * TODO - This doesn't appear to be used, so should be removed / deprecated / replaced * @return Returns 1.0 if the mimetype is supported, otherwise 0.0 @@ -308,6 +351,23 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac this.inheritDefaultMapping = inheritDefaultMapping; } + /** + * Set if the embed property mappings augment or override the mapping generically provided by the + * extracter implementation. The default is false, i.e. any mapping set completely + * replaces the {@link #getDefaultEmbedMapping() default mappings}. + * + * @param inheritDefaultEmbedMapping true to add the configured embed mapping + * to the list of default embed mappings. + * + * @see #getDefaultEmbedMapping() + * @see #setEmbedMapping(Map) + * @see #setEmbedMappingProperties(Properties) + */ + public void setInheritDefaultEmbedMapping(boolean inheritDefaultEmbedMapping) + { + this.inheritDefaultEmbedMapping = inheritDefaultEmbedMapping; + } + /** * Set the mapping from document metadata to system metadata. It is possible to direct * an extracted document property to several system properties. 
The conversion between @@ -321,6 +381,19 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac this.mapping = mapping; } + /** + * Set the embed mapping from system metadata (model properties) to document metadata. It is possible to direct + * a model property to several content file metadata keys. The conversion between + * the model property types and the content file metadata keys types will be done by the + * {@link org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter default converter}. + * + * @param embedMapping an embed mapping from model properties to content file metadata keys + */ + public void setEmbedMapping(Map> embedMapping) + { + this.embedMapping = embedMapping; + } + /** * Set the properties that contain the mapping from document metadata to system metadata. * This is an alternative to the {@link #setMapping(Map)} method. Any mappings already @@ -346,7 +419,33 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac { mapping = readMappingProperties(mappingProperties); } - + + /** + * Set the properties that contain the embed mapping from model properties to content file metadata. + * This is an alternative to the {@link #setEmbedMapping(Map)} method. Any mappings already + * present will be cleared out. + * + * The property mapping is of the form: + *

+     * # Namespaces prefixes
+     * namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
+     * namespace.prefix.my=http://www....com/alfresco/1.0
+     *
+     * # Mapping
+     * cm\:author=editor
+     * cm\:title=title
+     * cm\:summary=user1
+     * cm\:description=description,user2
+     * 
+ * The embed mapping can therefore be from a model property onto several content file metadata properties. + * + * @param embedMappingProperties the properties that map model properties to content file metadata properties + */ + public void setEmbedMappingProperties(Properties embedMappingProperties) + { + embedMapping = readEmbedMappingProperties(embedMappingProperties); + } + /** * Helper method for derived classes to obtain the mappings that will be applied to raw * values. This should be called after initialization in order to guarantee the complete @@ -372,7 +471,29 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac } return Collections.unmodifiableMap(mapping); } - + + /** + * Helper method for derived classes to obtain the embed mappings. + * This should be called after initialization in order to guarantee the complete + * map is given. + *

+ * Normally, the list of properties that can be embedded in a document is fixed and + * well-known.. But some implementations may have + * an extra, indeterminate set of values available for embedding. If the embedding of + * these runtime parameters is expensive, then the keys provided by the return value can + * be used to embed values in the documents. The metadata embedding becomes fully + * configuration-driven, i.e. declaring further mappings will result in more values being + * embedded in the documents. + */ + protected final Map> getEmbedMapping() + { + if (!initialized) + { + throw new UnsupportedOperationException("The complete embed mapping is only available after initialization."); + } + return Collections.unmodifiableMap(embedMapping); + } + /** * A utility method to read mapping properties from a resource file and convert to the map form. * @@ -490,15 +611,137 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac " Mapping: " + entry); } } - if (logger.isDebugEnabled()) + if (logger.isTraceEnabled()) { - logger.debug("Added mapping from " + documentProperty + " to " + qnames); + logger.trace("Added mapping from " + documentProperty + " to " + qnames); } } // Done return convertedMapping; } - + + /** + * A utility method to read embed mapping properties from a resource file and convert to the map form. 
+ * + * @param propertiesUrl A standard Properties file URL location + * + * @see #setEmbedMappingProperties(Properties) + */ + protected Map> readEmbedMappingProperties(String propertiesUrl) + { + InputStream is = null; + try + { + is = getClass().getClassLoader().getResourceAsStream(propertiesUrl); + if(is == null) + { + return null; + } + Properties props = new Properties(); + props.load(is); + // Process it + Map> map = readEmbedMappingProperties(props); + // Done + if (logger.isDebugEnabled()) + { + logger.debug("Loaded embed mapping properties from resource: " + propertiesUrl); + } + return map; + } + catch (Throwable e) + { + throw new AlfrescoRuntimeException( + "Unable to load properties file to read extracter embed mapping properties: \n" + + " Extracter: " + this + "\n" + + " Bundle: " + propertiesUrl, + e); + } + finally + { + if (is != null) + { + try { is.close(); } catch (Throwable e) {} + } + } + } + + /** + * A utility method to convert mapping properties to the Map form. + *

+ * Different from readMappingProperties in that keys are the Alfresco QNames + * and values are file metadata properties. + * + * @see #setMappingProperties(Properties) + */ + protected Map> readEmbedMappingProperties(Properties mappingProperties) + { + Map namespacesByPrefix = new HashMap(5); + // Get the namespaces + for (Map.Entry entry : mappingProperties.entrySet()) + { + String propertyName = (String) entry.getKey(); + if (propertyName.startsWith(NAMESPACE_PROPERTY_PREFIX)) + { + String prefix = propertyName.substring(17); + String namespace = (String) entry.getValue(); + namespacesByPrefix.put(prefix, namespace); + } + } + // Create the mapping + Map> convertedMapping = new HashMap>(17); + for (Map.Entry entry : mappingProperties.entrySet()) + { + String modelProperty = (String) entry.getKey(); + String metadataKeysString = (String) entry.getValue(); + if (modelProperty.startsWith(NAMESPACE_PROPERTY_PREFIX)) + { + // Ignore these now + continue; + } + + int index = modelProperty.indexOf(QName.NAMESPACE_PREFIX); + if (index > -1 && modelProperty.charAt(0) != QName.NAMESPACE_BEGIN) + { + String prefix = modelProperty.substring(0, index); + String suffix = modelProperty.substring(index + 1); + // It is prefixed + String uri = namespacesByPrefix.get(prefix); + if (uri == null) + { + throw new AlfrescoRuntimeException( + "No prefix mapping for embed property mapping: \n" + + " Extracter: " + this + "\n" + + " Mapping: " + entry); + } + modelProperty = QName.NAMESPACE_BEGIN + uri + QName.NAMESPACE_END + suffix; + } + try + { + QName qname = QName.createQName(modelProperty); + String[] metadataKeysArray = metadataKeysString.split(","); + Set metadataKeys = new HashSet(metadataKeysArray.length); + for (String metadataKey : metadataKeysArray) { + metadataKeys.add(metadataKey.trim()); + } + // Create the entry + convertedMapping.put(qname, metadataKeys); + } + catch (InvalidQNameException e) + { + throw new AlfrescoRuntimeException( + "Can't create metadata embedding 
property mapping: \n" + + " Extracter: " + this + "\n" + + " Mapping: " + entry); + } + if (logger.isTraceEnabled()) + { + logger.trace("Added mapping from " + modelProperty + " to " + metadataKeysString); + } + } + // Done + return convertedMapping; + } + /** * Registers this instance of the extracter with the registry. This will call the * {@link #init()} method and then register if the registry is available. @@ -560,6 +803,31 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac "There are no property mappings for the metadata extracter.\n" + " Nothing will be extracted by: " + this); } + + Map> defaultEmbedMapping = getDefaultEmbedMapping(); + + // Was a mapping explicitly provided + if (embedMapping == null) + { + // No mapping, so use the default + embedMapping = defaultEmbedMapping; + } + else if (inheritDefaultEmbedMapping) + { + // Merge the default mapping into the configured mapping + for (QName modelProperty : defaultEmbedMapping.keySet()) + { + Set metadataKeys = embedMapping.get(modelProperty); + if (metadataKeys == null) + { + metadataKeys = new HashSet(3); + embedMapping.put(modelProperty, metadataKeys); + } + Set defaultMetadataKeys = defaultEmbedMapping.get(modelProperty); + metadataKeys.addAll(defaultMetadataKeys); + } + } + // Done initialized = true; } @@ -589,6 +857,25 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac } } + /** + * Checks if embedding for the mimetype is supported. 
+ * + * @param writer the writer to check + * @throws AlfrescoRuntimeException if embedding for the mimetype is not supported + */ + protected void checkIsEmbedSupported(ContentWriter writer) + { + String mimetype = writer.getMimetype(); + if (!isEmbeddingSupported(mimetype)) + { + throw new AlfrescoRuntimeException( + "Metadata extracter does not support embedding mimetype: \n" + + " writer: " + writer + "\n" + + " supported: " + supportedEmbedMimetypes + "\n" + + " extracter: " + this); + } + } + /** * {@inheritDoc} */ @@ -730,7 +1017,103 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac } return changedProperties; } - + + /** + * {@inheritDoc} + */ + public final void embed( + Map properties, + ContentReader reader, + ContentWriter writer) + { + // Done + if (logger.isDebugEnabled()) + { + logger.debug("Starting metadata embedding: \n" + + " reader: " + reader + "\n" + + " writer: " + writer + "\n" + + " extracter: " + this); + } + + if (!initialized) + { + throw new AlfrescoRuntimeException( + "Metadata extracter not initialized.\n" + + " Call the 'register' method on: " + this + "\n" + + " Implementations of the 'init' method must call the base implementation."); + } + // check the reliability + checkIsEmbedSupported(writer); + + try + { + embedInternal(mapSystemToRaw(properties), reader, writer); + if(logger.isDebugEnabled()) + { + logger.debug("Embedded Metadata into " + writer); + } + } + catch (Throwable e) + { + // Ask Tika to detect the document, and report back on if + // the current mime type is plausible + String typeErrorMessage = ""; + String differentType = null; + if(mimetypeService != null) + { + differentType = mimetypeService.getMimetypeIfNotMatches(writer.getReader()); + } + else + { + logger.info("Unable to verify mimetype of " + writer.getReader() + + " as no MimetypeService available to " + getClass().getName()); + } + if(differentType != null) + { + typeErrorMessage = "\n" + + " claimed mime type: " + 
writer.getMimetype() + "\n" + + " detected mime type: " + differentType; + } + + if (logger.isDebugEnabled()) + { + logger.debug( + "Metadata embedding failed: \n" + + " Extracter: " + this + "\n" + + " Content: " + writer + + typeErrorMessage, + e); + } + else + { + logger.warn( + "Metadata embedding failed (turn on DEBUG for full error): \n" + + " Extracter: " + this + "\n" + + " Content: " + writer + "\n" + + " Failure: " + e.getMessage() + + typeErrorMessage); + } + } + finally + { + // check that the writer was closed (if used) + if (writer.isChannelOpen()) + { + logger.error("Content writer not closed by metadata extracter: \n" + + " writer: " + writer + "\n" + + " extracter: " + this); + } + } + + // Done + if (logger.isDebugEnabled()) + { + logger.debug("Completed metadata embedding: \n" + + " writer: " + writer + "\n" + + " extracter: " + this); + } + } + /** * * @param rawMetadata Metadata keyed by document properties @@ -765,7 +1148,42 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac } return systemProperties; } - + + /** + * + * @param systemMetadata Metadata keyed by system properties + * @return Returns the metadata keyed by the content file metadata properties + */ + private Map mapSystemToRaw(Map systemMetadata) + { + Map metadataProperties = new HashMap(systemMetadata.size() * 2 + 1); + for (Map.Entry entry : systemMetadata.entrySet()) + { + QName modelProperty = entry.getKey(); + // Check if there is a mapping for this + if (!embedMapping.containsKey(modelProperty)) + { + // No mapping - ignore + continue; + } + Serializable documentValue = entry.getValue(); + Set metadataKeys = embedMapping.get(modelProperty); + for (String metadataKey : metadataKeys) + { + metadataProperties.put(metadataKey, documentValue); + } + } + // Done + if (logger.isDebugEnabled()) + { + logger.debug( + "Converted system model values to metadata values: \n" + + " System Properties: " + systemMetadata + "\n" + + " Metadata Properties: " + 
metadataProperties); + } + return metadataProperties; + } + /** * Filters the system properties that are going to be applied. Gives the metadata extracter an * opportunity to remove properties that may not be appropriate in a given context. @@ -1054,7 +1472,91 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac // Attempt to load the properties return readMappingProperties(propertiesUrl); } - + + /** + * This method provides a best guess of what model properties should be embedded + * in content. The list of properties mapped by default need not + * include all properties to be embedded in the document; just the obvious set of mappings + * need be supplied. + * Implementations must either provide the default mapping properties in the expected + * location or override the method to provide the default mapping. + *

+ * The default implementation looks for the default mapping file in the location + * given by the class name and .embed.properties. If the extracter's class is + * x.y.z.MyExtracter then the default properties will be picked up at + * classpath:/x/y/z/MyExtracter.embed.properties. + * Inner classes are supported, but the '$' in the class name is replaced with '-', so + * default properties for x.y.z.MyStuff$MyExtracter will be located using + * x.y.z.MyStuff-MyExtracter.embed.properties. + *

+ * The default mapping implementation should include thorough Javadocs so that the + * system administrators can accurately determine how to best enhance or override the + * default mapping. + *

+ * If the default mapping is declared in a properties file other than the one named after + * the class, then the {@link #readEmbedMappingProperties(String)} method can be used to quickly + * generate the return value: + *


+     *      protected Map&lt;QName, Set&lt;String&gt;&gt; getDefaultEmbedMapping()
+     *      {
+     *          return readEmbedMappingProperties(DEFAULT_MAPPING);
+     *      }
+     * 
+ * The map can also be created in code either statically or during the call. + *

+ * If no embed mapping properties file is found, a reverse of the extract mapping (the configured + * mapping if one is set, otherwise {@link #getDefaultMapping()}) is assumed: the first QName in each + * value becomes the key, and every metadata key that shares that QName is collected under it. + * + * @return Returns the default, static embed mapping. It may not be null. + * + * @see #setInheritDefaultEmbedMapping(boolean) + */ + protected Map> getDefaultEmbedMapping() + { + String className = this.getClass().getName(); + // Replace $ + className = className.replace('$', '-'); + // Replace . + className = className.replace('.', '/'); + // Append .embed.properties + String propertiesUrl = className + ".embed.properties"; + // Attempt to load the properties + Map> embedMapping = readEmbedMappingProperties(propertiesUrl); + if (embedMapping == null) + { + if (logger.isDebugEnabled()) + { + logger.debug("No explicit embed mapping properties found at: " + propertiesUrl + ", assuming reverse of extract mapping"); + } + Map> extractMapping = this.mapping; + if (extractMapping == null || extractMapping.size() == 0) + { + extractMapping = getDefaultMapping(); + } + embedMapping = new HashMap>(extractMapping.size()); + for (String metadataKey : extractMapping.keySet()) + { + if (extractMapping.get(metadataKey) != null && extractMapping.get(metadataKey).size() > 0) + { + QName modelProperty = extractMapping.get(metadataKey).iterator().next(); + Set metadataKeys = embedMapping.get(modelProperty); + if (metadataKeys == null) + { + metadataKeys = new HashSet(1); + embedMapping.put(modelProperty, metadataKeys); + } + metadataKeys.add(metadataKey); + if (logger.isTraceEnabled()) + { + logger.trace("Added mapping from " + modelProperty + " to " + metadataKeys.toString()); + } + } + } + } + return embedMapping; + } + /** * Override to provide the raw extracted metadata values. An extracter should extract * as many of the available properties as is realistically possible. 
Even if the @@ -1089,4 +1591,25 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac * @see #getDefaultMapping() */ protected abstract Map extractRaw(ContentReader reader) throws Throwable; + + /** + * Override to embed metadata values. An extracter should embed + * as many of the available properties as is realistically possible. Even if the + * {@link #getDefaultEmbedMapping() default mapping} doesn't handle all properties, it is + * possible for each instance of the extracter to be configured differently and more or + * less of the properties may be used in different installations. + * + * @param metadata the metadata keys and values to embed in the content file + * @param reader the reader for the original document. This stream provided by + * the reader must be closed if accessed directly. + * @param writer the writer for the document to embed the values in. This stream provided by + * the writer must be closed if accessed directly. + * @throws All exception conditions can be handled. + * + * @see #getDefaultEmbedMapping() + */ + protected void embedInternal(Map metadata, ContentReader reader, ContentWriter writer) throws Throwable + { + // TODO make this an abstract method once more extracters support embedding + } } diff --git a/source/java/org/alfresco/repo/content/metadata/MetadataEmbedder.java b/source/java/org/alfresco/repo/content/metadata/MetadataEmbedder.java new file mode 100644 index 0000000000..6218c31dae --- /dev/null +++ b/source/java/org/alfresco/repo/content/metadata/MetadataEmbedder.java @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2005-2012 Alfresco Software Limited. + * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see . + */ +package org.alfresco.repo.content.metadata; + +import java.io.Serializable; +import java.util.Map; + +import org.alfresco.repo.content.ContentWorker; +import org.alfresco.service.cmr.repository.ContentIOException; +import org.alfresco.service.cmr.repository.ContentReader; +import org.alfresco.service.cmr.repository.ContentWriter; +import org.alfresco.service.namespace.QName; + +/** + * Interface for writing metadata properties back into the content file. + * + * @author Ray Gauss II + * + */ +public interface MetadataEmbedder extends ContentWorker { + + /** + * Determines if the extracter works against the given mimetype. + * + * @param mimetype the document mimetype + * @return Returns true if the mimetype is supported, otherwise false. + */ + public boolean isEmbeddingSupported(String mimetype); + + /** + * Embeds the given properties into the file specified by the given content writer. + * *

+ * The embedding viability can be determined by an up front call to + * {@link #isEmbeddingSupported(String)}. + *

+ * The source mimetype must be available on the + * {@link org.alfresco.service.cmr.repository.ContentAccessor#getMimetype()} method + * of the writer. + * + * @param properties the model properties to embed + * @param reader the reader for the original source content file + * @param writer the writer for the content after metadata has been embedded + * @throws ContentIOException + */ + public void embed(Map properties, ContentReader reader, ContentWriter writer) throws ContentIOException; + + +} diff --git a/source/java/org/alfresco/repo/content/metadata/MetadataExtracterRegistry.java b/source/java/org/alfresco/repo/content/metadata/MetadataExtracterRegistry.java index 97b4c02709..4566c84252 100644 --- a/source/java/org/alfresco/repo/content/metadata/MetadataExtracterRegistry.java +++ b/source/java/org/alfresco/repo/content/metadata/MetadataExtracterRegistry.java @@ -44,6 +44,7 @@ public class MetadataExtracterRegistry private List extracters; private Map> extracterCache; + private Map> embedderCache; /** Controls read access to the cache */ private Lock extracterCacheReadLock; @@ -55,6 +56,7 @@ public class MetadataExtracterRegistry // initialise lists extracters = new ArrayList(10); extracterCache = new HashMap>(17); + embedderCache = new HashMap>(17); // create lock objects for access to the cache ReadWriteLock extractionCacheLock = new ReentrantReadWriteLock(); @@ -72,6 +74,7 @@ public class MetadataExtracterRegistry try { extracterCache.clear(); + embedderCache.clear(); } finally { @@ -96,6 +99,7 @@ public class MetadataExtracterRegistry { extracters.add(extracter); extracterCache.clear(); + embedderCache.clear(); } finally { @@ -186,4 +190,92 @@ public class MetadataExtracterRegistry } return extractors; } + + /** + * Gets the best metadata embedder. This is a combination of the most + * reliable and the most performant embedder. + *

+ * The result is cached for quicker access next time. + * + * @param mimetype the source MIME of the extraction + * @return Returns a metadata embedder that can embed metadata in the + * chosen MIME type. + */ + public MetadataEmbedder getEmbedder(String sourceMimetype) + { + List embedders = null; + extracterCacheReadLock.lock(); + try + { + if (embedderCache.containsKey(sourceMimetype)) + { + // the translation has been requested before + // it might have been null + embedders = embedderCache.get(sourceMimetype); + } + } + finally + { + extracterCacheReadLock.unlock(); + } + + if (embedders == null) + { + // No request has been made before + // Get a write lock on the cache + // No double check done as it is not an expensive task + extracterCacheWriteLock.lock(); + try + { + // find the most suitable transformer - may be empty list + embedders = findBestEmbedders(sourceMimetype); + // store the result even if it is null + embedderCache.put(sourceMimetype, embedders); + } + finally + { + extracterCacheWriteLock.unlock(); + } + } + + // We have the list of embedders that supposedly work (as registered). 
+ // Take the last one that still claims to work + MetadataEmbedder liveEmbedder = null; + for (MetadataEmbedder embedder : embedders) + { + // An extractor may dynamically become unavailable + if (!embedder.isEmbeddingSupported(sourceMimetype)) + { + continue; + } + liveEmbedder = embedder; + } + return liveEmbedder; + } + + /** + * @param sourceMimetype The MIME type under examination + * @return Returns a set of embedders that will work for the given mimetype + */ + private List findBestEmbedders(String sourceMimetype) + { + logger.debug("Finding embedders for " + sourceMimetype); + + List embedders = new ArrayList(1); + + for (MetadataExtracter extractor : extracters) + { + if (!(extractor instanceof MetadataEmbedder)) + { + continue; + } + if (!((MetadataEmbedder) extractor).isEmbeddingSupported(sourceMimetype)) + { + // extraction not achievable + continue; + } + embedders.add((MetadataEmbedder)extractor); + } + return embedders; + } } diff --git a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java index 122ae6af3e..3217a49c0d 100644 --- a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2010 Alfresco Software Limited. + * Copyright (C) 2005-2012 Alfresco Software Limited. 
* * This file is part of Alfresco * @@ -20,11 +20,13 @@ package org.alfresco.repo.content.metadata; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import java.io.Serializable; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; +import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.HashSet; @@ -35,8 +37,12 @@ import java.util.TimeZone; import org.alfresco.repo.content.MimetypeMap; import org.alfresco.repo.content.filestore.FileContentReader; import org.alfresco.service.cmr.repository.ContentReader; +import org.alfresco.service.cmr.repository.ContentWriter; +import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; +import org.alfresco.service.cmr.repository.datatype.TypeConversionException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.tika.embedder.Embedder; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -72,7 +78,9 @@ import org.xml.sax.SAXException; * @since 3.4 * @author Nick Burch */ -public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetadataExtracter +public abstract class TikaPoweredMetadataExtracter + extends AbstractMappingMetadataExtracter + implements MetadataEmbedder { protected static Log logger = LogFactory.getLog(TikaPoweredMetadataExtracter.class); @@ -118,11 +126,19 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada public TikaPoweredMetadataExtracter(ArrayList supportedMimeTypes) { - this(new HashSet(supportedMimeTypes)); + this(new HashSet(supportedMimeTypes), null); + } + public TikaPoweredMetadataExtracter(ArrayList supportedMimeTypes, ArrayList supportedEmbedMimeTypes) + { + this(new HashSet(supportedMimeTypes), new HashSet(supportedEmbedMimeTypes)); } public 
TikaPoweredMetadataExtracter(HashSet supportedMimeTypes) { - super(supportedMimeTypes); + this(supportedMimeTypes, null); + } + public TikaPoweredMetadataExtracter(HashSet supportedMimeTypes, HashSet supportedEmbedMimeTypes) + { + super(supportedMimeTypes, supportedEmbedMimeTypes); // TODO Once TIKA-451 is fixed this list will get nicer this.tikaDateFormats = new DateFormat[] { @@ -188,6 +204,18 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada */ protected abstract Parser getParser(); + /** + * Returns the Tika Embedder to modify + * the document. + * + * @return the Tika embedder + */ + protected Embedder getEmbedder() + { + // TODO make this an abstract method once more extracters support embedding + return null; + } + /** * Do we care about the contents of the * extracted header, or nothing at all? @@ -215,7 +243,7 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada * For these cases, buffer out to a local file if not * already there */ - private InputStream getInputStream(ContentReader reader) throws IOException { + protected InputStream getInputStream(ContentReader reader) throws IOException { // Prefer the File if available, it's generally quicker if(reader instanceof FileContentReader) { @@ -338,6 +366,71 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada return rawProperties; } + @Override + protected void embedInternal(Map properties, ContentReader reader, ContentWriter writer) throws Throwable + { + Embedder embedder = getEmbedder(); + if (embedder == null) + { + return; + } + OutputStream outputStream = null; + try + { + Metadata metadataToEmbed = new Metadata(); + for (String metadataKey : properties.keySet()) + { + Serializable value = properties.get(metadataKey); + if (value == null) + { + continue; + } + if (value instanceof Collection) + { + for (Object singleValue : (Collection) value) + { + try + { + // Convert to a string value for Tika + 
metadataToEmbed.add(metadataKey, DefaultTypeConverter.INSTANCE.convert(String.class, singleValue)); + } + catch (TypeConversionException e) + { + logger.info("Could not convert " + metadataKey + ": " + e.getMessage()); + } + } + } + else + { + try + { + // Convert to a string value for Tika + metadataToEmbed.add(metadataKey, DefaultTypeConverter.INSTANCE.convert(String.class, value)); + } + catch (TypeConversionException e) + { + logger.info("Could not convert " + metadataKey + ": " + e.getMessage()); + } + } + } + InputStream inputStream = getInputStream(reader); + outputStream = writer.getContentOutputStream(); + embedder.embed(metadataToEmbed, inputStream, outputStream, null); + } + catch (Exception e) + { + logger.error(e.getMessage(), e); + } + finally + { + if (outputStream != null) + { + try { outputStream.close(); } catch (Throwable e) {} + } + } + + } + /** * This content handler will capture entries from within * the header of the Tika content XHTML, but ignore the