mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Merged BRANCHES/DEV/RGAUSS/4.2-CORE-CHANGES-42861 to HEAD:
42862: Creating new branch from HEAD 43026: ALF-16403: Create the Basic Interfaces and Implementation for Metadata Embedders - Added MetadataEmbedder interface which guarantees an embed method responsible for writing the given metadata into a given content writer - Changed AbstractMappingMetadataExtracter to implement MetadataEmbedder * Added supportedEmbedMimetypes and constructor which takes it and supportedMimetypes as arguments * Added embedMapping * Added inheritDefaultEmbedMapping * Added isEmbeddingSupported * Added setEmbedMappingProperties * Added readEmbedMappingProperties for reading classname.embed.properties * Added setting of embedMapping in init method * Added checkIsEmbedSupported method * Added embed method which checks support for the mimetype, and calls embedInternal which implementations should override * Added mapSystemToRaw method, essentially a reverse of existing mapRawToSystem * Added getDefaultEmbedMapping method which assumes a reverse mapping of extract mapping if no explicit embed overrides are present * Added empty embedInternal method which does nothing rather than abstract method to minimize changes to existing code - Added notion of MetadataEmbedders to MetadataExtracterRegistry * Added embedderCache but use the existing extracterCache* locks * Added findBestEmbedders method * Added getEmbedder method 43164: ALF-16404: Create a Tika Powered Metadata Embedder - Added constructors for setting of supported embed types to TikaPoweredMetadataExtracter - Changed visibility of getInputStream to protected so subclasses can use it - Logging level changes in AbstractMappingMetadataExtracter 43165: ALF-16481: Create a Content Metadata Embedder Action Executer - Added ContentMetadataEmbedder action executer which gets an embedder for the noderef if available and sends the content reader and writer for the node ref to the embedder's embed method - Added embed-metadata action executer bean - Added embed-metadata action executer messages 43262: 
ALF-16404: Create a Tika Powered Metadata Embedder - Updated Tika which now contains implementation of TIKA-775: Embed Capabilities 43265: ALF-16404: Create a Tika Powered Metadata Embedder - Added MetadataEmbedder implementation to TikaPoweredMetadataExtracter which gets a Tika Embedder and calls its embed method git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@43268 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -540,6 +540,23 @@
|
|||||||
</property>
|
</property>
|
||||||
</bean>
|
</bean>
|
||||||
|
|
||||||
|
<bean id="embed-metadata" class="org.alfresco.repo.action.executer.ContentMetadataEmbedder" parent="action-executer">
|
||||||
|
<property name="nodeService">
|
||||||
|
<ref bean="NodeService" />
|
||||||
|
</property>
|
||||||
|
<property name="contentService">
|
||||||
|
<ref bean="ContentService" />
|
||||||
|
</property>
|
||||||
|
<property name="metadataExtracterRegistry">
|
||||||
|
<ref bean="metadataExtracterRegistry" />
|
||||||
|
</property>
|
||||||
|
<property name="applicableTypes">
|
||||||
|
<list>
|
||||||
|
<value>{http://www.alfresco.org/model/content/1.0}content</value>
|
||||||
|
</list>
|
||||||
|
</property>
|
||||||
|
</bean>
|
||||||
|
|
||||||
<bean id="import" class="org.alfresco.repo.action.executer.ImporterActionExecuter" parent="action-executer">
|
<bean id="import" class="org.alfresco.repo.action.executer.ImporterActionExecuter" parent="action-executer">
|
||||||
<property name="importerService">
|
<property name="importerService">
|
||||||
<ref bean="ImporterService"/>
|
<ref bean="ImporterService"/>
|
||||||
|
@@ -148,6 +148,9 @@ import.destination.display-label=Destination
|
|||||||
extract-metadata.title=Extract common metadata fields
|
extract-metadata.title=Extract common metadata fields
|
||||||
extract-metadata.description=Imports title, author and description metadata fields from common content types.
|
extract-metadata.description=Imports title, author and description metadata fields from common content types.
|
||||||
|
|
||||||
|
embed-metadata.title=Embed properties as metadata in content
|
||||||
|
embed-metadata.description=This action attempts to embed the content's properties as metadata in the file itself
|
||||||
|
|
||||||
specialise-type.title=Specialise type
|
specialise-type.title=Specialise type
|
||||||
specialise-type.description=This will specialise the matched item to a given type.
|
specialise-type.description=This will specialise the matched item to a given type.
|
||||||
specialise-type.type-name.display-label=Type
|
specialise-type.type-name.display-label=Type
|
||||||
|
@@ -0,0 +1,166 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2005-2012 Alfresco Software Limited.
|
||||||
|
*
|
||||||
|
* This file is part of Alfresco
|
||||||
|
*
|
||||||
|
* Alfresco is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Alfresco is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
package org.alfresco.repo.action.executer;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.alfresco.model.ContentModel;
|
||||||
|
import org.alfresco.repo.action.executer.ActionExecuterAbstractBase;
|
||||||
|
import org.alfresco.repo.content.metadata.MetadataEmbedder;
|
||||||
|
import org.alfresco.repo.content.metadata.MetadataExtracterRegistry;
|
||||||
|
import org.alfresco.service.cmr.action.Action;
|
||||||
|
import org.alfresco.service.cmr.action.ParameterDefinition;
|
||||||
|
import org.alfresco.service.cmr.repository.ContentReader;
|
||||||
|
import org.alfresco.service.cmr.repository.ContentService;
|
||||||
|
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||||
|
import org.alfresco.service.cmr.repository.NodeRef;
|
||||||
|
import org.alfresco.service.cmr.repository.NodeService;
|
||||||
|
import org.alfresco.service.namespace.QName;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Embed metadata in any content.
|
||||||
|
* <p>
|
||||||
|
* The metadata is embedded in the content from the current
|
||||||
|
* property values.
|
||||||
|
*
|
||||||
|
* @author Jesper Steen Møller, Ray Gauss II
|
||||||
|
*/
|
||||||
|
public class ContentMetadataEmbedder extends ActionExecuterAbstractBase
|
||||||
|
{
|
||||||
|
private static Log logger = LogFactory.getLog(ContentMetadataEmbedder.class);
|
||||||
|
|
||||||
|
public static final String EXECUTOR_NAME = "embed-metadata";
|
||||||
|
|
||||||
|
private NodeService nodeService;
|
||||||
|
private ContentService contentService;
|
||||||
|
private MetadataExtracterRegistry metadataExtracterRegistry;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param nodeService the node service
|
||||||
|
*/
|
||||||
|
public void setNodeService(NodeService nodeService)
|
||||||
|
{
|
||||||
|
this.nodeService = nodeService;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param contentService The contentService to set.
|
||||||
|
*/
|
||||||
|
public void setContentService(ContentService contentService)
|
||||||
|
{
|
||||||
|
this.contentService = contentService;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param metadataExtracterRegistry The metadataExtracterRegistry to set.
|
||||||
|
*/
|
||||||
|
public void setMetadataExtracterRegistry(MetadataExtracterRegistry metadataExtracterRegistry)
|
||||||
|
{
|
||||||
|
this.metadataExtracterRegistry = metadataExtracterRegistry;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @see org.alfresco.repo.action.executer.ActionExecuter#execute(org.alfresco.service.cmr.repository.NodeRef,
|
||||||
|
* NodeRef)
|
||||||
|
*/
|
||||||
|
public void executeImpl(Action ruleAction, NodeRef actionedUponNodeRef)
|
||||||
|
{
|
||||||
|
if (!nodeService.exists(actionedUponNodeRef))
|
||||||
|
{
|
||||||
|
// Node is gone
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
ContentReader reader = contentService.getReader(actionedUponNodeRef, ContentModel.PROP_CONTENT);
|
||||||
|
// The reader may be null, e.g. for folders and the like
|
||||||
|
if (reader == null || reader.getMimetype() == null)
|
||||||
|
{
|
||||||
|
if(logger.isDebugEnabled())
|
||||||
|
{
|
||||||
|
logger.debug("no content or mimetype - do nothing");
|
||||||
|
}
|
||||||
|
// No content to extract data from
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
String mimetype = reader.getMimetype();
|
||||||
|
MetadataEmbedder embedder = metadataExtracterRegistry.getEmbedder(mimetype);
|
||||||
|
if (embedder == null)
|
||||||
|
{
|
||||||
|
if(logger.isDebugEnabled())
|
||||||
|
{
|
||||||
|
logger.debug("no embedder for mimetype:" + mimetype);
|
||||||
|
}
|
||||||
|
// There is no embedder to use
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
ContentWriter writer = contentService.getWriter(actionedUponNodeRef, ContentModel.PROP_CONTENT, true);
|
||||||
|
// The writer may be null, e.g. for folders and the like
|
||||||
|
if (writer == null || writer.getMimetype() == null)
|
||||||
|
{
|
||||||
|
if(logger.isDebugEnabled())
|
||||||
|
{
|
||||||
|
logger.debug("no content or mimetype - do nothing");
|
||||||
|
}
|
||||||
|
// No content to embed data in
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get all the node's properties
|
||||||
|
Map<QName, Serializable> nodeProperties = nodeService.getProperties(actionedUponNodeRef);
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
embedder.embed(nodeProperties, reader, writer);
|
||||||
|
}
|
||||||
|
catch (Throwable e)
|
||||||
|
{
|
||||||
|
// Extracters should attempt to handle all error conditions and embed
|
||||||
|
// as much as they can. If, however, one should fail, we don't want the
|
||||||
|
// action itself to fail. We absorb and report the exception here.
|
||||||
|
if (logger.isDebugEnabled())
|
||||||
|
{
|
||||||
|
logger.debug(
|
||||||
|
"Meetadata embedding failed: \n" +
|
||||||
|
" Extracter: " + this + "\n" +
|
||||||
|
" Node: " + actionedUponNodeRef + "\n" +
|
||||||
|
" Content: " + writer,
|
||||||
|
e);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
logger.warn(
|
||||||
|
"Metadata embedding failed (turn on DEBUG for full error): \n" +
|
||||||
|
" Extracter: " + this + "\n" +
|
||||||
|
" Node: " + actionedUponNodeRef + "\n" +
|
||||||
|
" Content: " + writer + "\n" +
|
||||||
|
" Failure: " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void addParameterDefinitions(List<ParameterDefinition> arg0)
|
||||||
|
{
|
||||||
|
// None!
|
||||||
|
}
|
||||||
|
}
|
@@ -42,6 +42,7 @@ import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
|
|||||||
import org.alfresco.service.cmr.dictionary.DictionaryService;
|
import org.alfresco.service.cmr.dictionary.DictionaryService;
|
||||||
import org.alfresco.service.cmr.dictionary.PropertyDefinition;
|
import org.alfresco.service.cmr.dictionary.PropertyDefinition;
|
||||||
import org.alfresco.service.cmr.repository.ContentReader;
|
import org.alfresco.service.cmr.repository.ContentReader;
|
||||||
|
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||||
import org.alfresco.service.cmr.repository.MimetypeService;
|
import org.alfresco.service.cmr.repository.MimetypeService;
|
||||||
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
|
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
|
||||||
import org.alfresco.service.cmr.repository.datatype.TypeConversionException;
|
import org.alfresco.service.cmr.repository.datatype.TypeConversionException;
|
||||||
@@ -92,7 +93,7 @@ import org.springframework.extensions.surf.util.ISO8601DateFormat;
|
|||||||
* @author Jesper Steen Møller
|
* @author Jesper Steen Møller
|
||||||
* @author Derek Hulley
|
* @author Derek Hulley
|
||||||
*/
|
*/
|
||||||
abstract public class AbstractMappingMetadataExtracter implements MetadataExtracter
|
abstract public class AbstractMappingMetadataExtracter implements MetadataExtracter, MetadataEmbedder
|
||||||
{
|
{
|
||||||
public static final String NAMESPACE_PROPERTY_PREFIX = "namespace.prefix.";
|
public static final String NAMESPACE_PROPERTY_PREFIX = "namespace.prefix.";
|
||||||
private static final String ERR_TYPE_CONVERSION = "metadata.extraction.err.type_conversion";
|
private static final String ERR_TYPE_CONVERSION = "metadata.extraction.err.type_conversion";
|
||||||
@@ -105,11 +106,14 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
private boolean initialized;
|
private boolean initialized;
|
||||||
|
|
||||||
private Set<String> supportedMimetypes;
|
private Set<String> supportedMimetypes;
|
||||||
|
private Set<String> supportedEmbedMimetypes;
|
||||||
private OverwritePolicy overwritePolicy;
|
private OverwritePolicy overwritePolicy;
|
||||||
private boolean failOnTypeConversion;
|
private boolean failOnTypeConversion;
|
||||||
protected Set<DateFormat> supportedDateFormats = new HashSet<DateFormat>(0);
|
protected Set<DateFormat> supportedDateFormats = new HashSet<DateFormat>(0);
|
||||||
private Map<String, Set<QName>> mapping;
|
private Map<String, Set<QName>> mapping;
|
||||||
|
private Map<QName, Set<String>> embedMapping;
|
||||||
private boolean inheritDefaultMapping;
|
private boolean inheritDefaultMapping;
|
||||||
|
private boolean inheritDefaultEmbedMapping;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Default constructor. If this is called, then {@link #isSupported(String)} should
|
* Default constructor. If this is called, then {@link #isSupported(String)} should
|
||||||
@@ -137,10 +141,24 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
overwritePolicy = OverwritePolicy.PRAGMATIC;
|
overwritePolicy = OverwritePolicy.PRAGMATIC;
|
||||||
failOnTypeConversion = true;
|
failOnTypeConversion = true;
|
||||||
mapping = null; // The default will be fetched
|
mapping = null; // The default will be fetched
|
||||||
|
embedMapping = null;
|
||||||
inheritDefaultMapping = false; // Any overrides are complete
|
inheritDefaultMapping = false; // Any overrides are complete
|
||||||
|
inheritDefaultEmbedMapping = false;
|
||||||
initialized = false;
|
initialized = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Creates an extracter for which both the extraction mimetypes and the embedding
 * mimetypes are known at construction time.
 * 
 * @param supportedMimetypes        mimetypes supported for extraction by default
 * @param supportedEmbedMimetypes   mimetypes supported for embedding by default
 */
protected AbstractMappingMetadataExtracter(Set<String> supportedMimetypes, Set<String> supportedEmbedMimetypes)
{
    // Extraction setup is delegated to the single-set constructor
    this(supportedMimetypes);
    // The supplied set is kept by reference; no copy is made
    this.supportedEmbedMimetypes = supportedEmbedMimetypes;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set the registry to register with. If this is not set, then the default
|
* Set the registry to register with. If this is not set, then the default
|
||||||
* initialization will not auto-register the extracter for general use. It
|
* initialization will not auto-register the extracter for general use. It
|
||||||
@@ -188,6 +206,17 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
this.supportedMimetypes.addAll(supportedMimetypes);
|
this.supportedMimetypes.addAll(supportedMimetypes);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the mimetypes that are supported for embedding.
|
||||||
|
*
|
||||||
|
* @param supportedEmbedMimetypes
|
||||||
|
*/
|
||||||
|
public void setSupportedEmbedMimetypes(Collection<String> supportedEmbedMimetypes)
|
||||||
|
{
|
||||||
|
this.supportedEmbedMimetypes.clear();
|
||||||
|
this.supportedEmbedMimetypes.addAll(supportedEmbedMimetypes);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* {@inheritDoc}
|
* {@inheritDoc}
|
||||||
*
|
*
|
||||||
@@ -198,6 +227,20 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
return supportedMimetypes.contains(sourceMimetype);
|
return supportedMimetypes.contains(sourceMimetype);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@inheritDoc}
|
||||||
|
*
|
||||||
|
* @see #setSupportedEmbedMimetypes(Collection)
|
||||||
|
*/
|
||||||
|
public boolean isEmbeddingSupported(String sourceMimetype)
|
||||||
|
{
|
||||||
|
if (supportedEmbedMimetypes == null)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return supportedEmbedMimetypes.contains(sourceMimetype);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
|
* TODO - This doesn't appear to be used, so should be removed / deprecated / replaced
|
||||||
* @return Returns <code>1.0</code> if the mimetype is supported, otherwise <tt>0.0</tt>
|
* @return Returns <code>1.0</code> if the mimetype is supported, otherwise <tt>0.0</tt>
|
||||||
@@ -308,6 +351,23 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
this.inheritDefaultMapping = inheritDefaultMapping;
|
this.inheritDefaultMapping = inheritDefaultMapping;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set if the embed property mappings augment or override the mapping generically provided by the
|
||||||
|
* extracter implementation. The default is <tt>false</tt>, i.e. any mapping set completely
|
||||||
|
* replaces the {@link #getDefaultEmbedMapping() default mappings}.
|
||||||
|
*
|
||||||
|
* @param inheritDefaultEmbedMapping <tt>true</tt> to add the configured embed mapping
|
||||||
|
* to the list of default embed mappings.
|
||||||
|
*
|
||||||
|
* @see #getDefaultEmbedMapping()
|
||||||
|
* @see #setEmbedMapping(Map)
|
||||||
|
* @see #setEmbedMappingProperties(Properties)
|
||||||
|
*/
|
||||||
|
public void setInheritDefaultEmbedMapping(boolean inheritDefaultEmbedMapping)
|
||||||
|
{
|
||||||
|
this.inheritDefaultEmbedMapping = inheritDefaultEmbedMapping;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set the mapping from document metadata to system metadata. It is possible to direct
|
* Set the mapping from document metadata to system metadata. It is possible to direct
|
||||||
* an extracted document property to several system properties. The conversion between
|
* an extracted document property to several system properties. The conversion between
|
||||||
@@ -321,6 +381,19 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
this.mapping = mapping;
|
this.mapping = mapping;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the embed mapping from document metadata to system metadata. It is possible to direct
|
||||||
|
* an model properties to several content file metadata keys. The conversion between
|
||||||
|
* the model property types and the content file metadata keys types will be done by the
|
||||||
|
* {@link org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter default converter}.
|
||||||
|
*
|
||||||
|
* @param embedMapping an embed mapping from model properties to content file metadata keys
|
||||||
|
*/
|
||||||
|
public void setEmbedMapping(Map<QName, Set<String>> embedMapping)
|
||||||
|
{
|
||||||
|
this.embedMapping = embedMapping;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set the properties that contain the mapping from document metadata to system metadata.
|
* Set the properties that contain the mapping from document metadata to system metadata.
|
||||||
* This is an alternative to the {@link #setMapping(Map)} method. Any mappings already
|
* This is an alternative to the {@link #setMapping(Map)} method. Any mappings already
|
||||||
@@ -347,6 +420,32 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
mapping = readMappingProperties(mappingProperties);
|
mapping = readMappingProperties(mappingProperties);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the properties that contain the embed mapping from model properties to content file metadata.
|
||||||
|
* This is an alternative to the {@link #setEmbedMapping(Map)} method. Any mappings already
|
||||||
|
* present will be cleared out.
|
||||||
|
*
|
||||||
|
* The property mapping is of the form:
|
||||||
|
* <pre>
|
||||||
|
* # Namespaces prefixes
|
||||||
|
* namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||||
|
* namespace.prefix.my=http://www....com/alfresco/1.0
|
||||||
|
*
|
||||||
|
* # Mapping
|
||||||
|
* cm\:author=editor
|
||||||
|
* cm\:title=title
|
||||||
|
* cm\:summary=user1
|
||||||
|
* cm\:description=description,user2
|
||||||
|
* </pre>
|
||||||
|
* The embed mapping can therefore be from a model property onto several content file metadata properties.
|
||||||
|
*
|
||||||
|
* @param embedMappingProperties the properties that map model properties to content file metadata properties
|
||||||
|
*/
|
||||||
|
public void setEmbedMappingProperties(Properties embedMappingProperties)
|
||||||
|
{
|
||||||
|
embedMapping = readEmbedMappingProperties(embedMappingProperties);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper method for derived classes to obtain the mappings that will be applied to raw
|
* Helper method for derived classes to obtain the mappings that will be applied to raw
|
||||||
* values. This should be called after initialization in order to guarantee the complete
|
* values. This should be called after initialization in order to guarantee the complete
|
||||||
@@ -373,6 +472,28 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
return Collections.unmodifiableMap(mapping);
|
return Collections.unmodifiableMap(mapping);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper method for derived classes to obtain the embed mappings.
|
||||||
|
* This should be called after initialization in order to guarantee the complete
|
||||||
|
* map is given.
|
||||||
|
* <p>
|
||||||
|
* Normally, the list of properties that can be embedded in a document is fixed and
|
||||||
|
* well-known.. But some implementations may have
|
||||||
|
* an extra, indeterminate set of values available for embedding. If the embedding of
|
||||||
|
* these runtime parameters is expensive, then the keys provided by the return value can
|
||||||
|
* be used to embed values in the documents. The metadata embedding becomes fully
|
||||||
|
* configuration-driven, i.e. declaring further mappings will result in more values being
|
||||||
|
* embedded in the documents.
|
||||||
|
*/
|
||||||
|
protected final Map<QName, Set<String>> getEmbedMapping()
|
||||||
|
{
|
||||||
|
if (!initialized)
|
||||||
|
{
|
||||||
|
throw new UnsupportedOperationException("The complete embed mapping is only available after initialization.");
|
||||||
|
}
|
||||||
|
return Collections.unmodifiableMap(embedMapping);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A utility method to read mapping properties from a resource file and convert to the map form.
|
* A utility method to read mapping properties from a resource file and convert to the map form.
|
||||||
*
|
*
|
||||||
@@ -490,9 +611,131 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
" Mapping: " + entry);
|
" Mapping: " + entry);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (logger.isTraceEnabled())
|
||||||
|
{
|
||||||
|
logger.trace("Added mapping from " + documentProperty + " to " + qnames);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Done
|
||||||
|
return convertedMapping;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A utility method to read embed mapping properties from a resource file and convert to the map form.
|
||||||
|
*
|
||||||
|
* @param propertiesUrl A standard Properties file URL location
|
||||||
|
*
|
||||||
|
* @see #setEmbedMappingProperties(Properties)
|
||||||
|
*/
|
||||||
|
protected Map<QName, Set<String>> readEmbedMappingProperties(String propertiesUrl)
|
||||||
|
{
|
||||||
|
InputStream is = null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
is = getClass().getClassLoader().getResourceAsStream(propertiesUrl);
|
||||||
|
if(is == null)
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
Properties props = new Properties();
|
||||||
|
props.load(is);
|
||||||
|
// Process it
|
||||||
|
Map<QName, Set<String>> map = readEmbedMappingProperties(props);
|
||||||
|
// Done
|
||||||
if (logger.isDebugEnabled())
|
if (logger.isDebugEnabled())
|
||||||
{
|
{
|
||||||
logger.debug("Added mapping from " + documentProperty + " to " + qnames);
|
logger.debug("Loaded embed mapping properties from resource: " + propertiesUrl);
|
||||||
|
}
|
||||||
|
return map;
|
||||||
|
}
|
||||||
|
catch (Throwable e)
|
||||||
|
{
|
||||||
|
throw new AlfrescoRuntimeException(
|
||||||
|
"Unable to load properties file to read extracter embed mapping properties: \n" +
|
||||||
|
" Extracter: " + this + "\n" +
|
||||||
|
" Bundle: " + propertiesUrl,
|
||||||
|
e);
|
||||||
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
if (is != null)
|
||||||
|
{
|
||||||
|
try { is.close(); } catch (Throwable e) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A utility method to convert mapping properties to the Map form.
|
||||||
|
* <p>
|
||||||
|
* Different from readMappingProperties in that keys are the Alfresco QNames
|
||||||
|
* and values are file metadata properties.
|
||||||
|
*
|
||||||
|
* @see #setMappingProperties(Properties)
|
||||||
|
*/
|
||||||
|
protected Map<QName, Set<String>> readEmbedMappingProperties(Properties mappingProperties)
|
||||||
|
{
|
||||||
|
Map<String, String> namespacesByPrefix = new HashMap<String, String>(5);
|
||||||
|
// Get the namespaces
|
||||||
|
for (Map.Entry<Object, Object> entry : mappingProperties.entrySet())
|
||||||
|
{
|
||||||
|
String propertyName = (String) entry.getKey();
|
||||||
|
if (propertyName.startsWith(NAMESPACE_PROPERTY_PREFIX))
|
||||||
|
{
|
||||||
|
String prefix = propertyName.substring(17);
|
||||||
|
String namespace = (String) entry.getValue();
|
||||||
|
namespacesByPrefix.put(prefix, namespace);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Create the mapping
|
||||||
|
Map<QName, Set<String>> convertedMapping = new HashMap<QName, Set<String>>(17);
|
||||||
|
for (Map.Entry<Object, Object> entry : mappingProperties.entrySet())
|
||||||
|
{
|
||||||
|
String modelProperty = (String) entry.getKey();
|
||||||
|
String metadataKeysString = (String) entry.getValue();
|
||||||
|
if (modelProperty.startsWith(NAMESPACE_PROPERTY_PREFIX))
|
||||||
|
{
|
||||||
|
// Ignore these now
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
int index = modelProperty.indexOf(QName.NAMESPACE_PREFIX);
|
||||||
|
if (index > -1 && modelProperty.charAt(0) != QName.NAMESPACE_BEGIN)
|
||||||
|
{
|
||||||
|
String prefix = modelProperty.substring(0, index);
|
||||||
|
String suffix = modelProperty.substring(index + 1);
|
||||||
|
// It is prefixed
|
||||||
|
String uri = namespacesByPrefix.get(prefix);
|
||||||
|
if (uri == null)
|
||||||
|
{
|
||||||
|
throw new AlfrescoRuntimeException(
|
||||||
|
"No prefix mapping for embed property mapping: \n" +
|
||||||
|
" Extracter: " + this + "\n" +
|
||||||
|
" Mapping: " + entry);
|
||||||
|
}
|
||||||
|
modelProperty = QName.NAMESPACE_BEGIN + uri + QName.NAMESPACE_END + suffix;
|
||||||
|
}
|
||||||
|
try
|
||||||
|
{
|
||||||
|
QName qname = QName.createQName(modelProperty);
|
||||||
|
String[] metadataKeysArray = metadataKeysString.split(",");
|
||||||
|
Set<String> metadataKeys = new HashSet<String>(metadataKeysArray.length);
|
||||||
|
for (String metadataKey : metadataKeysArray) {
|
||||||
|
metadataKeys.add(metadataKey.trim());
|
||||||
|
}
|
||||||
|
// Create the entry
|
||||||
|
convertedMapping.put(qname, metadataKeys);
|
||||||
|
}
|
||||||
|
catch (InvalidQNameException e)
|
||||||
|
{
|
||||||
|
throw new AlfrescoRuntimeException(
|
||||||
|
"Can't create metadata embedding property mapping: \n" +
|
||||||
|
" Extracter: " + this + "\n" +
|
||||||
|
" Mapping: " + entry);
|
||||||
|
}
|
||||||
|
if (logger.isTraceEnabled())
|
||||||
|
{
|
||||||
|
logger.trace("Added mapping from " + modelProperty + " to " + metadataKeysString);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Done
|
// Done
|
||||||
@@ -560,6 +803,31 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
"There are no property mappings for the metadata extracter.\n" +
|
"There are no property mappings for the metadata extracter.\n" +
|
||||||
" Nothing will be extracted by: " + this);
|
" Nothing will be extracted by: " + this);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Map<QName, Set<String>> defaultEmbedMapping = getDefaultEmbedMapping();
|
||||||
|
|
||||||
|
// Was a mapping explicitly provided
|
||||||
|
if (embedMapping == null)
|
||||||
|
{
|
||||||
|
// No mapping, so use the default
|
||||||
|
embedMapping = defaultEmbedMapping;
|
||||||
|
}
|
||||||
|
else if (inheritDefaultEmbedMapping)
|
||||||
|
{
|
||||||
|
// Merge the default mapping into the configured mapping
|
||||||
|
for (QName modelProperty : defaultEmbedMapping.keySet())
|
||||||
|
{
|
||||||
|
Set<String> metadataKeys = embedMapping.get(modelProperty);
|
||||||
|
if (metadataKeys == null)
|
||||||
|
{
|
||||||
|
metadataKeys = new HashSet<String>(3);
|
||||||
|
embedMapping.put(modelProperty, metadataKeys);
|
||||||
|
}
|
||||||
|
Set<String> defaultMetadataKeys = defaultEmbedMapping.get(modelProperty);
|
||||||
|
metadataKeys.addAll(defaultMetadataKeys);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Done
|
// Done
|
||||||
initialized = true;
|
initialized = true;
|
||||||
}
|
}
|
||||||
@@ -589,6 +857,25 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks if embedding for the mimetype is supported.
|
||||||
|
*
|
||||||
|
* @param writer the writer to check
|
||||||
|
* @throws AlfrescoRuntimeException if embedding for the mimetype is not supported
|
||||||
|
*/
|
||||||
|
protected void checkIsEmbedSupported(ContentWriter writer)
|
||||||
|
{
|
||||||
|
String mimetype = writer.getMimetype();
|
||||||
|
if (!isSupported(mimetype))
|
||||||
|
{
|
||||||
|
throw new AlfrescoRuntimeException(
|
||||||
|
"Metadata extracter does not support embedding mimetype: \n" +
|
||||||
|
" writer: " + writer + "\n" +
|
||||||
|
" supported: " + supportedEmbedMimetypes + "\n" +
|
||||||
|
" extracter: " + this);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* {@inheritDoc}
|
* {@inheritDoc}
|
||||||
*/
|
*/
|
||||||
@@ -731,6 +1018,102 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
return changedProperties;
|
return changedProperties;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@inheritDoc}
|
||||||
|
*/
|
||||||
|
public final void embed(
|
||||||
|
Map<QName, Serializable> properties,
|
||||||
|
ContentReader reader,
|
||||||
|
ContentWriter writer)
|
||||||
|
{
|
||||||
|
// Done
|
||||||
|
if (logger.isDebugEnabled())
|
||||||
|
{
|
||||||
|
logger.debug("Starting metadata embedding: \n" +
|
||||||
|
" reader: " + reader + "\n" +
|
||||||
|
" writer: " + writer + "\n" +
|
||||||
|
" extracter: " + this);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!initialized)
|
||||||
|
{
|
||||||
|
throw new AlfrescoRuntimeException(
|
||||||
|
"Metadata extracter not initialized.\n" +
|
||||||
|
" Call the 'register' method on: " + this + "\n" +
|
||||||
|
" Implementations of the 'init' method must call the base implementation.");
|
||||||
|
}
|
||||||
|
// check the reliability
|
||||||
|
checkIsEmbedSupported(writer);
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
embedInternal(mapSystemToRaw(properties), reader, writer);
|
||||||
|
if(logger.isDebugEnabled())
|
||||||
|
{
|
||||||
|
logger.debug("Embedded Metadata into " + writer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (Throwable e)
|
||||||
|
{
|
||||||
|
// Ask Tika to detect the document, and report back on if
|
||||||
|
// the current mime type is plausible
|
||||||
|
String typeErrorMessage = null;
|
||||||
|
String differentType = null;
|
||||||
|
if(mimetypeService != null)
|
||||||
|
{
|
||||||
|
differentType = mimetypeService.getMimetypeIfNotMatches(writer.getReader());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
logger.info("Unable to verify mimetype of " + writer.getReader() +
|
||||||
|
" as no MimetypeService available to " + getClass().getName());
|
||||||
|
}
|
||||||
|
if(differentType != null)
|
||||||
|
{
|
||||||
|
typeErrorMessage = "\n" +
|
||||||
|
" claimed mime type: " + writer.getMimetype() + "\n" +
|
||||||
|
" detected mime type: " + differentType;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (logger.isDebugEnabled())
|
||||||
|
{
|
||||||
|
logger.debug(
|
||||||
|
"Metadata embedding failed: \n" +
|
||||||
|
" Extracter: " + this + "\n" +
|
||||||
|
" Content: " + writer +
|
||||||
|
typeErrorMessage,
|
||||||
|
e);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
logger.warn(
|
||||||
|
"Metadata embedding failed (turn on DEBUG for full error): \n" +
|
||||||
|
" Extracter: " + this + "\n" +
|
||||||
|
" Content: " + writer + "\n" +
|
||||||
|
" Failure: " + e.getMessage() +
|
||||||
|
typeErrorMessage);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
// check that the writer was closed (if used)
|
||||||
|
if (writer.isChannelOpen())
|
||||||
|
{
|
||||||
|
logger.error("Content writer not closed by metadata extracter: \n" +
|
||||||
|
" writer: " + writer + "\n" +
|
||||||
|
" extracter: " + this);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Done
|
||||||
|
if (logger.isDebugEnabled())
|
||||||
|
{
|
||||||
|
logger.debug("Completed metadata embedding: \n" +
|
||||||
|
" writer: " + writer + "\n" +
|
||||||
|
" extracter: " + this);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @param rawMetadata Metadata keyed by document properties
|
* @param rawMetadata Metadata keyed by document properties
|
||||||
@@ -766,6 +1149,41 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
return systemProperties;
|
return systemProperties;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param systemMetadata Metadata keyed by system properties
|
||||||
|
* @return Returns the metadata keyed by the content file metadata properties
|
||||||
|
*/
|
||||||
|
private Map<String, Serializable> mapSystemToRaw(Map<QName, Serializable> systemMetadata)
|
||||||
|
{
|
||||||
|
Map<String, Serializable> metadataProperties = new HashMap<String, Serializable>(systemMetadata.size() * 2 + 1);
|
||||||
|
for (Map.Entry<QName, Serializable> entry : systemMetadata.entrySet())
|
||||||
|
{
|
||||||
|
QName modelProperty = entry.getKey();
|
||||||
|
// Check if there is a mapping for this
|
||||||
|
if (!embedMapping.containsKey(modelProperty))
|
||||||
|
{
|
||||||
|
// No mapping - ignore
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Serializable documentValue = entry.getValue();
|
||||||
|
Set<String> metadataKeys = embedMapping.get(modelProperty);
|
||||||
|
for (String metadataKey : metadataKeys)
|
||||||
|
{
|
||||||
|
metadataProperties.put(metadataKey, documentValue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Done
|
||||||
|
if (logger.isDebugEnabled())
|
||||||
|
{
|
||||||
|
logger.debug(
|
||||||
|
"Converted system model values to metadata values: \n" +
|
||||||
|
" System Properties: " + systemMetadata + "\n" +
|
||||||
|
" Metadata Properties: " + metadataProperties);
|
||||||
|
}
|
||||||
|
return metadataProperties;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Filters the system properties that are going to be applied. Gives the metadata extracter an
|
* Filters the system properties that are going to be applied. Gives the metadata extracter an
|
||||||
* opportunity to remove properties that may not be appropriate in a given context.
|
* opportunity to remove properties that may not be appropriate in a given context.
|
||||||
@@ -1055,6 +1473,90 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
return readMappingProperties(propertiesUrl);
|
return readMappingProperties(propertiesUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This method provides a <i>best guess</i> of what model properties should be embedded
|
||||||
|
* in content. The list of properties mapped by default need <b>not</b>
|
||||||
|
* include all properties to be embedded in the document; just the obvious set of mappings
|
||||||
|
* need be supplied.
|
||||||
|
* Implementations must either provide the default mapping properties in the expected
|
||||||
|
* location or override the method to provide the default mapping.
|
||||||
|
* <p>
|
||||||
|
* The default implementation looks for the default mapping file in the location
|
||||||
|
* given by the class name and <i>.embed.properties</i>. If the extracter's class is
|
||||||
|
* <b>x.y.z.MyExtracter</b> then the default properties will be picked up at
|
||||||
|
* <b>classpath:/x/y/z/MyExtracter.embed.properties</b>.
|
||||||
|
* Inner classes are supported, but the '$' in the class name is replaced with '-', so
|
||||||
|
* default properties for <b>x.y.z.MyStuff$MyExtracter</b> will be located using
|
||||||
|
* <b>x.y.z.MyStuff-MyExtracter.embed.properties</b>.
|
||||||
|
* <p>
|
||||||
|
* The default mapping implementation should include thorough Javadocs so that the
|
||||||
|
* system administrators can accurately determine how to best enhance or override the
|
||||||
|
* default mapping.
|
||||||
|
* <p>
|
||||||
|
* If the default mapping is declared in a properties file other than the one named after
|
||||||
|
* the class, then the {@link #readEmbedMappingProperties(String)} method can be used to quickly
|
||||||
|
* generate the return value:
|
||||||
|
* <pre><code>
|
||||||
|
* protected Map<<String, Set<QName>> getDefaultMapping()
|
||||||
|
* {
|
||||||
|
* return readEmbedMappingProperties(DEFAULT_MAPPING);
|
||||||
|
* }
|
||||||
|
* </code></pre>
|
||||||
|
* The map can also be created in code either statically or during the call.
|
||||||
|
* <p>
|
||||||
|
* If no embed mapping properties file is found a reverse of the extract
|
||||||
|
* mapping in {@link #getDefaultMapping()} will be assumed with the first QName in each
|
||||||
|
* value used as the key for this mapping and a last win approach for duplicates.
|
||||||
|
*
|
||||||
|
* @return Returns the default, static embed mapping. It may not be null.
|
||||||
|
*
|
||||||
|
* @see #setInheritDefaultMapping(boolean inherit)
|
||||||
|
*/
|
||||||
|
protected Map<QName, Set<String>> getDefaultEmbedMapping()
|
||||||
|
{
|
||||||
|
String className = this.getClass().getName();
|
||||||
|
// Replace $
|
||||||
|
className = className.replace('$', '-');
|
||||||
|
// Replace .
|
||||||
|
className = className.replace('.', '/');
|
||||||
|
// Append .properties
|
||||||
|
String propertiesUrl = className + ".embed.properties";
|
||||||
|
// Attempt to load the properties
|
||||||
|
Map<QName, Set<String>> embedMapping = readEmbedMappingProperties(propertiesUrl);
|
||||||
|
if (embedMapping == null)
|
||||||
|
{
|
||||||
|
if (logger.isDebugEnabled())
|
||||||
|
{
|
||||||
|
logger.debug("No explicit embed mapping properties found at: " + propertiesUrl + ", assuming reverse of extract mapping");
|
||||||
|
}
|
||||||
|
Map<String, Set<QName>> extractMapping = this.mapping;
|
||||||
|
if (extractMapping == null || extractMapping.size() == 0)
|
||||||
|
{
|
||||||
|
extractMapping = getDefaultMapping();
|
||||||
|
}
|
||||||
|
embedMapping = new HashMap<QName, Set<String>>(extractMapping.size());
|
||||||
|
for (String metadataKey : extractMapping.keySet())
|
||||||
|
{
|
||||||
|
if (extractMapping.get(metadataKey) != null && extractMapping.get(metadataKey).size() > 0)
|
||||||
|
{
|
||||||
|
QName modelProperty = extractMapping.get(metadataKey).iterator().next();
|
||||||
|
Set<String> metadataKeys = embedMapping.get(modelProperty);
|
||||||
|
if (metadataKeys == null)
|
||||||
|
{
|
||||||
|
metadataKeys = new HashSet<String>(1);
|
||||||
|
embedMapping.put(modelProperty, metadataKeys);
|
||||||
|
}
|
||||||
|
metadataKeys.add(metadataKey);
|
||||||
|
if (logger.isTraceEnabled())
|
||||||
|
{
|
||||||
|
logger.trace("Added mapping from " + modelProperty + " to " + metadataKeys.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return embedMapping;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Override to provide the raw extracted metadata values. An extracter should extract
|
* Override to provide the raw extracted metadata values. An extracter should extract
|
||||||
* as many of the available properties as is realistically possible. Even if the
|
* as many of the available properties as is realistically possible. Even if the
|
||||||
@@ -1089,4 +1591,25 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
* @see #getDefaultMapping()
|
* @see #getDefaultMapping()
|
||||||
*/
|
*/
|
||||||
protected abstract Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable;
|
protected abstract Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Override to embed metadata values. An extracter should embed
|
||||||
|
* as many of the available properties as is realistically possible. Even if the
|
||||||
|
* {@link #getDefaultEmbedMapping() default mapping} doesn't handle all properties, it is
|
||||||
|
* possible for each instance of the extracter to be configured differently and more or
|
||||||
|
* less of the properties may be used in different installations.
|
||||||
|
*
|
||||||
|
* @param metadata the metadata keys and values to embed in the content file
|
||||||
|
* @param reader the reader for the original document. This stream provided by
|
||||||
|
* the reader must be closed if accessed directly.
|
||||||
|
* @param writer the writer for the document to embed the values in. This stream provided by
|
||||||
|
* the writer must be closed if accessed directly.
|
||||||
|
* @throws All exception conditions can be handled.
|
||||||
|
*
|
||||||
|
* @see #getDefaultEmbedMapping()
|
||||||
|
*/
|
||||||
|
protected void embedInternal(Map<String, Serializable> metadata, ContentReader reader, ContentWriter writer) throws Throwable
|
||||||
|
{
|
||||||
|
// TODO make this an abstract method once more extracters support embedding
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@@ -0,0 +1,64 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2005-2012 Alfresco Software Limited.
|
||||||
|
*
|
||||||
|
* This file is part of Alfresco
|
||||||
|
*
|
||||||
|
* Alfresco is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Alfresco is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
|
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
package org.alfresco.repo.content.metadata;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.alfresco.repo.content.ContentWorker;
|
||||||
|
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||||
|
import org.alfresco.service.cmr.repository.ContentReader;
|
||||||
|
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||||
|
import org.alfresco.service.namespace.QName;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Interface for writing metadata properties back into the content file.
|
||||||
|
*
|
||||||
|
* @author Ray Gauss II
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public interface MetadataEmbedder extends ContentWorker {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines if the extracter works against the given mimetype.
|
||||||
|
*
|
||||||
|
* @param mimetype the document mimetype
|
||||||
|
* @return Returns <tt>true</tt> if the mimetype is supported, otherwise <tt>false</tt>.
|
||||||
|
*/
|
||||||
|
public boolean isEmbeddingSupported(String mimetype);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Embeds the given properties into the file specified by the given content writer.
|
||||||
|
* * <p>
|
||||||
|
* The embedding viability can be determined by an up front call to
|
||||||
|
* {@link #isSupported(String)}.
|
||||||
|
* <p>
|
||||||
|
* The source mimetype <b>must</b> be available on the
|
||||||
|
* {@link org.alfresco.service.cmr.repository.ContentAccessor#getMimetype()} method
|
||||||
|
* of the writer.
|
||||||
|
*
|
||||||
|
* @param properties the model properties to embed
|
||||||
|
* @param reader the reader for the original source content file
|
||||||
|
* @param writer the writer for the content after metadata has been embedded
|
||||||
|
* @throws ContentIOException
|
||||||
|
*/
|
||||||
|
public void embed(Map<QName, Serializable> properties, ContentReader reader, ContentWriter writer) throws ContentIOException;
|
||||||
|
|
||||||
|
|
||||||
|
}
|
@@ -44,6 +44,7 @@ public class MetadataExtracterRegistry
|
|||||||
|
|
||||||
private List<MetadataExtracter> extracters;
|
private List<MetadataExtracter> extracters;
|
||||||
private Map<String, List<MetadataExtracter>> extracterCache;
|
private Map<String, List<MetadataExtracter>> extracterCache;
|
||||||
|
private Map<String, List<MetadataEmbedder>> embedderCache;
|
||||||
|
|
||||||
/** Controls read access to the cache */
|
/** Controls read access to the cache */
|
||||||
private Lock extracterCacheReadLock;
|
private Lock extracterCacheReadLock;
|
||||||
@@ -55,6 +56,7 @@ public class MetadataExtracterRegistry
|
|||||||
// initialise lists
|
// initialise lists
|
||||||
extracters = new ArrayList<MetadataExtracter>(10);
|
extracters = new ArrayList<MetadataExtracter>(10);
|
||||||
extracterCache = new HashMap<String, List<MetadataExtracter>>(17);
|
extracterCache = new HashMap<String, List<MetadataExtracter>>(17);
|
||||||
|
embedderCache = new HashMap<String, List<MetadataEmbedder>>(17);
|
||||||
|
|
||||||
// create lock objects for access to the cache
|
// create lock objects for access to the cache
|
||||||
ReadWriteLock extractionCacheLock = new ReentrantReadWriteLock();
|
ReadWriteLock extractionCacheLock = new ReentrantReadWriteLock();
|
||||||
@@ -72,6 +74,7 @@ public class MetadataExtracterRegistry
|
|||||||
try
|
try
|
||||||
{
|
{
|
||||||
extracterCache.clear();
|
extracterCache.clear();
|
||||||
|
embedderCache.clear();
|
||||||
}
|
}
|
||||||
finally
|
finally
|
||||||
{
|
{
|
||||||
@@ -96,6 +99,7 @@ public class MetadataExtracterRegistry
|
|||||||
{
|
{
|
||||||
extracters.add(extracter);
|
extracters.add(extracter);
|
||||||
extracterCache.clear();
|
extracterCache.clear();
|
||||||
|
embedderCache.clear();
|
||||||
}
|
}
|
||||||
finally
|
finally
|
||||||
{
|
{
|
||||||
@@ -186,4 +190,92 @@ public class MetadataExtracterRegistry
|
|||||||
}
|
}
|
||||||
return extractors;
|
return extractors;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the best metadata embedder. This is a combination of the most
|
||||||
|
* reliable and the most performant embedder.
|
||||||
|
* <p>
|
||||||
|
* The result is cached for quicker access next time.
|
||||||
|
*
|
||||||
|
* @param mimetype the source MIME of the extraction
|
||||||
|
* @return Returns a metadata embedder that can embed metadata in the
|
||||||
|
* chosen MIME type.
|
||||||
|
*/
|
||||||
|
public MetadataEmbedder getEmbedder(String sourceMimetype)
|
||||||
|
{
|
||||||
|
List<MetadataEmbedder> embedders = null;
|
||||||
|
extracterCacheReadLock.lock();
|
||||||
|
try
|
||||||
|
{
|
||||||
|
if (embedderCache.containsKey(sourceMimetype))
|
||||||
|
{
|
||||||
|
// the translation has been requested before
|
||||||
|
// it might have been null
|
||||||
|
embedders = embedderCache.get(sourceMimetype);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
extracterCacheReadLock.unlock();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (embedders == null)
|
||||||
|
{
|
||||||
|
// No request has been made before
|
||||||
|
// Get a write lock on the cache
|
||||||
|
// No double check done as it is not an expensive task
|
||||||
|
extracterCacheWriteLock.lock();
|
||||||
|
try
|
||||||
|
{
|
||||||
|
// find the most suitable transformer - may be empty list
|
||||||
|
embedders = findBestEmbedders(sourceMimetype);
|
||||||
|
// store the result even if it is null
|
||||||
|
embedderCache.put(sourceMimetype, embedders);
|
||||||
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
extracterCacheWriteLock.unlock();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We have the list of embedders that supposedly work (as registered).
|
||||||
|
// Take the last one that still claims to work
|
||||||
|
MetadataEmbedder liveEmbedder = null;
|
||||||
|
for (MetadataEmbedder embedder : embedders)
|
||||||
|
{
|
||||||
|
// An extractor may dynamically become unavailable
|
||||||
|
if (!embedder.isEmbeddingSupported(sourceMimetype))
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
liveEmbedder = embedder;
|
||||||
|
}
|
||||||
|
return liveEmbedder;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param sourceMimetype The MIME type under examination
|
||||||
|
* @return Returns a set of embedders that will work for the given mimetype
|
||||||
|
*/
|
||||||
|
private List<MetadataEmbedder> findBestEmbedders(String sourceMimetype)
|
||||||
|
{
|
||||||
|
logger.debug("Finding embedders for " + sourceMimetype);
|
||||||
|
|
||||||
|
List<MetadataEmbedder> embedders = new ArrayList<MetadataEmbedder>(1);
|
||||||
|
|
||||||
|
for (MetadataExtracter extractor : extracters)
|
||||||
|
{
|
||||||
|
if (!(extractor instanceof MetadataEmbedder))
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!((MetadataEmbedder) extractor).isEmbeddingSupported(sourceMimetype))
|
||||||
|
{
|
||||||
|
// extraction not achievable
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
embedders.add((MetadataEmbedder)extractor);
|
||||||
|
}
|
||||||
|
return embedders;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
* Copyright (C) 2005-2012 Alfresco Software Limited.
|
||||||
*
|
*
|
||||||
* This file is part of Alfresco
|
* This file is part of Alfresco
|
||||||
*
|
*
|
||||||
@@ -20,11 +20,13 @@ package org.alfresco.repo.content.metadata;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.io.OutputStream;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.text.DateFormat;
|
import java.text.DateFormat;
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
@@ -35,8 +37,12 @@ import java.util.TimeZone;
|
|||||||
import org.alfresco.repo.content.MimetypeMap;
|
import org.alfresco.repo.content.MimetypeMap;
|
||||||
import org.alfresco.repo.content.filestore.FileContentReader;
|
import org.alfresco.repo.content.filestore.FileContentReader;
|
||||||
import org.alfresco.service.cmr.repository.ContentReader;
|
import org.alfresco.service.cmr.repository.ContentReader;
|
||||||
|
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||||
|
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
|
||||||
|
import org.alfresco.service.cmr.repository.datatype.TypeConversionException;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.tika.embedder.Embedder;
|
||||||
import org.apache.tika.io.TemporaryResources;
|
import org.apache.tika.io.TemporaryResources;
|
||||||
import org.apache.tika.io.TikaInputStream;
|
import org.apache.tika.io.TikaInputStream;
|
||||||
import org.apache.tika.metadata.Metadata;
|
import org.apache.tika.metadata.Metadata;
|
||||||
@@ -72,7 +78,9 @@ import org.xml.sax.SAXException;
|
|||||||
* @since 3.4
|
* @since 3.4
|
||||||
* @author Nick Burch
|
* @author Nick Burch
|
||||||
*/
|
*/
|
||||||
public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetadataExtracter
|
public abstract class TikaPoweredMetadataExtracter
|
||||||
|
extends AbstractMappingMetadataExtracter
|
||||||
|
implements MetadataEmbedder
|
||||||
{
|
{
|
||||||
protected static Log logger = LogFactory.getLog(TikaPoweredMetadataExtracter.class);
|
protected static Log logger = LogFactory.getLog(TikaPoweredMetadataExtracter.class);
|
||||||
|
|
||||||
@@ -118,11 +126,19 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
|
|||||||
|
|
||||||
public TikaPoweredMetadataExtracter(ArrayList<String> supportedMimeTypes)
|
public TikaPoweredMetadataExtracter(ArrayList<String> supportedMimeTypes)
|
||||||
{
|
{
|
||||||
this(new HashSet<String>(supportedMimeTypes));
|
this(new HashSet<String>(supportedMimeTypes), null);
|
||||||
|
}
|
||||||
|
public TikaPoweredMetadataExtracter(ArrayList<String> supportedMimeTypes, ArrayList<String> supportedEmbedMimeTypes)
|
||||||
|
{
|
||||||
|
this(new HashSet<String>(supportedMimeTypes), new HashSet<String>(supportedEmbedMimeTypes));
|
||||||
}
|
}
|
||||||
public TikaPoweredMetadataExtracter(HashSet<String> supportedMimeTypes)
|
public TikaPoweredMetadataExtracter(HashSet<String> supportedMimeTypes)
|
||||||
{
|
{
|
||||||
super(supportedMimeTypes);
|
this(supportedMimeTypes, null);
|
||||||
|
}
|
||||||
|
public TikaPoweredMetadataExtracter(HashSet<String> supportedMimeTypes, HashSet<String> supportedEmbedMimeTypes)
|
||||||
|
{
|
||||||
|
super(supportedMimeTypes, supportedEmbedMimeTypes);
|
||||||
|
|
||||||
// TODO Once TIKA-451 is fixed this list will get nicer
|
// TODO Once TIKA-451 is fixed this list will get nicer
|
||||||
this.tikaDateFormats = new DateFormat[] {
|
this.tikaDateFormats = new DateFormat[] {
|
||||||
@@ -188,6 +204,18 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
|
|||||||
*/
|
*/
|
||||||
protected abstract Parser getParser();
|
protected abstract Parser getParser();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the Tika Embedder to modify
|
||||||
|
* the document.
|
||||||
|
*
|
||||||
|
* @return the Tika embedder
|
||||||
|
*/
|
||||||
|
protected Embedder getEmbedder()
|
||||||
|
{
|
||||||
|
// TODO make this an abstract method once more extracters support embedding
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Do we care about the contents of the
|
* Do we care about the contents of the
|
||||||
* extracted header, or nothing at all?
|
* extracted header, or nothing at all?
|
||||||
@@ -215,7 +243,7 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
|
|||||||
* For these cases, buffer out to a local file if not
|
* For these cases, buffer out to a local file if not
|
||||||
* already there
|
* already there
|
||||||
*/
|
*/
|
||||||
private InputStream getInputStream(ContentReader reader) throws IOException {
|
protected InputStream getInputStream(ContentReader reader) throws IOException {
|
||||||
// Prefer the File if available, it's generally quicker
|
// Prefer the File if available, it's generally quicker
|
||||||
if(reader instanceof FileContentReader)
|
if(reader instanceof FileContentReader)
|
||||||
{
|
{
|
||||||
@@ -338,6 +366,71 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
|
|||||||
return rawProperties;
|
return rawProperties;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void embedInternal(Map<String, Serializable> properties, ContentReader reader, ContentWriter writer) throws Throwable
|
||||||
|
{
|
||||||
|
Embedder embedder = getEmbedder();
|
||||||
|
if (embedder == null)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
OutputStream outputStream = null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
Metadata metadataToEmbed = new Metadata();
|
||||||
|
for (String metadataKey : properties.keySet())
|
||||||
|
{
|
||||||
|
Serializable value = properties.get(metadataKey);
|
||||||
|
if (value == null)
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (value instanceof Collection<?>)
|
||||||
|
{
|
||||||
|
for (Object singleValue : (Collection<?>) value)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
// Convert to a string value for Tika
|
||||||
|
metadataToEmbed.add(metadataKey, DefaultTypeConverter.INSTANCE.convert(String.class, singleValue));
|
||||||
|
}
|
||||||
|
catch (TypeConversionException e)
|
||||||
|
{
|
||||||
|
logger.info("Could not convert " + metadataKey + ": " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
// Convert to a string value for Tika
|
||||||
|
metadataToEmbed.add(metadataKey, DefaultTypeConverter.INSTANCE.convert(String.class, value));
|
||||||
|
}
|
||||||
|
catch (TypeConversionException e)
|
||||||
|
{
|
||||||
|
logger.info("Could not convert " + metadataKey + ": " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
InputStream inputStream = getInputStream(reader);
|
||||||
|
outputStream = writer.getContentOutputStream();
|
||||||
|
embedder.embed(metadataToEmbed, inputStream, outputStream, null);
|
||||||
|
}
|
||||||
|
catch (Exception e)
|
||||||
|
{
|
||||||
|
logger.error(e.getMessage(), e);
|
||||||
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
if (outputStream != null)
|
||||||
|
{
|
||||||
|
try { outputStream.close(); } catch (Throwable e) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This content handler will capture entries from within
|
* This content handler will capture entries from within
|
||||||
* the header of the Tika content XHTML, but ignore the
|
* the header of the Tika content XHTML, but ignore the
|
||||||
|
Reference in New Issue
Block a user