Merged BRANCHES/DEV/RGAUSS/4.2-CORE-CHANGES-42861 to HEAD:

42862: Creating new branch from HEAD
   43026: ALF-16403: Create the Basic Interfaces and Implementation for Metadata Embedders
        - Added MetadataEmbedder interface which guarantees an embed method responsible for writing the given metadata into a given content writer
        - Changed AbstractMappingMetadataExtracter to implement MetadataEmbedder
           * Added supportedEmbedMimetypes and constructor which takes it and supportedMimetypes as arguments
           * Added embedMapping
           * Added inheritDefaultEmbedMapping
           * Added isEmbeddingSupported
           * Added setEmbedMappingProperties
           * Added readEmbedMappingProperties for reading classname.embed.properties
           * Added setting of embedMapping in init method
           * Added checkIsEmbedSupported method
           * Added embed method which checks support for the mimetype, and calls embedInternal which implementations should override
           * Added mapSystemToRaw method, essentially a reverse of existing mapRawToSystem
           * Added getDefaultEmbedMapping method which assumes a reverse mapping of extract mapping if no explicit embed overrides are present
           * Added empty embedInternal method which does nothing rather than abstract method to minimize changes to existing code
        - Added notion of MetadataEmbedders to MetadataExtracterRegistry
           * Added embedderCache but use the existing extracterCache* locks
           * Added findBestEmbedders method
           * Added getEmbedder method
   43164: ALF-16404: Create a Tika Powered Metadata Embedder
        - Added constructors for setting of supported embed types to TikaPoweredMetadataExtracter
        - Changed visibility of getInputStream to protected so subclasses can use it
        - Logging level changes in AbstractMappingMetadataExtracter
   43165: ALF-16481: Create a Content Metadata Embedder Action Executer
        - Added ContentMetadataEmbedder action executer which gets an embedder for the noderef if available and sends the content reader and writer for the node ref to the embedder's embed method
        - Added embed-metadata action executer bean
        - Added embed-metadata action executer messages
   43262: ALF-16404: Create a Tika Powered Metadata Embedder
        - Updated Tika which now contains implementation of TIKA-775: Embed Capabilities
   43265: ALF-16404: Create a Tika Powered Metadata Embedder
        - Added MetadataEmbedder implementation to TikaPoweredMetadataExtracter which gets a Tika Embedder and calls its embed method


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@43268 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Ray Gauss
2012-10-31 14:33:09 +00:00
parent 2cb4640004
commit 918696927d
7 changed files with 974 additions and 16 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
* Copyright (C) 2005-2012 Alfresco Software Limited.
*
* This file is part of Alfresco
*
@@ -20,11 +20,13 @@ package org.alfresco.repo.content.metadata;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
@@ -35,8 +37,12 @@ import java.util.TimeZone;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.filestore.FileContentReader;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.cmr.repository.datatype.TypeConversionException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.embedder.Embedder;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -72,7 +78,9 @@ import org.xml.sax.SAXException;
* @since 3.4
* @author Nick Burch
*/
public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetadataExtracter
public abstract class TikaPoweredMetadataExtracter
extends AbstractMappingMetadataExtracter
implements MetadataEmbedder
{
protected static Log logger = LogFactory.getLog(TikaPoweredMetadataExtracter.class);
@@ -118,11 +126,19 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
public TikaPoweredMetadataExtracter(ArrayList<String> supportedMimeTypes)
{
this(new HashSet<String>(supportedMimeTypes));
this(new HashSet<String>(supportedMimeTypes), null);
}
/**
 * Creates an extracter from list-based mimetype declarations, delegating to the
 * set-based constructor.
 *
 * @param supportedMimeTypes      the mimetypes supported for extraction
 * @param supportedEmbedMimeTypes the mimetypes supported for embedding; may be
 *                                <code>null</code>, which is passed through unchanged
 *                                (the single-argument constructors also pass
 *                                <code>null</code> for this value)
 */
public TikaPoweredMetadataExtracter(ArrayList<String> supportedMimeTypes, ArrayList<String> supportedEmbedMimeTypes)
{
    // new HashSet(null) would throw a NullPointerException, so only copy a non-null list
    this(new HashSet<String>(supportedMimeTypes),
         supportedEmbedMimeTypes == null ? null : new HashSet<String>(supportedEmbedMimeTypes));
}
public TikaPoweredMetadataExtracter(HashSet<String> supportedMimeTypes)
{
super(supportedMimeTypes);
this(supportedMimeTypes, null);
}
public TikaPoweredMetadataExtracter(HashSet<String> supportedMimeTypes, HashSet<String> supportedEmbedMimeTypes)
{
super(supportedMimeTypes, supportedEmbedMimeTypes);
// TODO Once TIKA-451 is fixed this list will get nicer
this.tikaDateFormats = new DateFormat[] {
@@ -188,6 +204,18 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
*/
protected abstract Parser getParser();
/**
* Returns the Tika Embedder to modify
* the document.
* <p>
* This default implementation returns <code>null</code>. When this method
* yields <code>null</code>, embedInternal returns immediately without
* reading or writing any content, so subclasses that want embedding
* must override this method.
*
* @return the Tika embedder, or <code>null</code> in this default implementation
*/
protected Embedder getEmbedder()
{
// TODO make this an abstract method once more extracters support embedding
return null;
}
/**
* Do we care about the contents of the
* extracted header, or nothing at all?
@@ -215,7 +243,7 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
* For these cases, buffer out to a local file if not
* already there
*/
private InputStream getInputStream(ContentReader reader) throws IOException {
protected InputStream getInputStream(ContentReader reader) throws IOException {
// Prefer the File if available, it's generally quicker
if(reader instanceof FileContentReader)
{
@@ -338,6 +366,71 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
return rawProperties;
}
/**
 * Writes the given properties into the content using the Tika {@link Embedder}
 * supplied by {@link #getEmbedder()}.
 * <p>
 * Every non-null property value is converted to a String and added to a Tika
 * {@link Metadata} object; collection values contribute one entry per element.
 * Values that cannot be converted are logged and skipped. The metadata is then
 * handed to the embedder together with the reader's input stream and the
 * writer's output stream.
 * <p>
 * If no embedder is available the method returns without touching the content.
 * Exceptions raised while embedding are logged rather than rethrown, and both
 * streams are closed before returning.
 *
 * @param properties the metadata keys and values to embed
 * @param reader     the source of the existing content
 * @param writer     the destination for the content with embedded metadata
 * @throws Throwable declared by the overridden method
 */
@Override
protected void embedInternal(Map<String, Serializable> properties, ContentReader reader, ContentWriter writer) throws Throwable
{
    Embedder embedder = getEmbedder();
    if (embedder == null)
    {
        // No embedder available, so there is nothing to do
        return;
    }
    InputStream inputStream = null;
    OutputStream outputStream = null;
    try
    {
        Metadata metadataToEmbed = new Metadata();
        for (Map.Entry<String, Serializable> property : properties.entrySet())
        {
            String metadataKey = property.getKey();
            Serializable value = property.getValue();
            if (value == null)
            {
                continue;
            }
            if (value instanceof Collection<?>)
            {
                // Multi-valued property: add each element as its own entry
                for (Object singleValue : (Collection<?>) value)
                {
                    addMetadataValue(metadataToEmbed, metadataKey, singleValue);
                }
            }
            else
            {
                addMetadataValue(metadataToEmbed, metadataKey, value);
            }
        }
        inputStream = getInputStream(reader);
        outputStream = writer.getContentOutputStream();
        embedder.embed(metadataToEmbed, inputStream, outputStream, null);
    }
    catch (Exception e)
    {
        // Embedding is best-effort: record the failure but do not propagate it
        logger.error(e.getMessage(), e);
    }
    finally
    {
        if (outputStream != null)
        {
            try
            {
                outputStream.close();
            }
            catch (Throwable e)
            {
                logger.debug("Failed to close output stream after embedding: " + e.getMessage());
            }
        }
        // The input stream was previously never closed, leaking the underlying resource
        if (inputStream != null)
        {
            try
            {
                inputStream.close();
            }
            catch (Throwable e)
            {
                logger.debug("Failed to close input stream after embedding: " + e.getMessage());
            }
        }
    }
}

/**
 * Converts a single value to a String and adds it to the metadata under the
 * given key; a value that cannot be converted is logged and skipped.
 */
private void addMetadataValue(Metadata metadata, String metadataKey, Object value)
{
    try
    {
        // Convert to a string value for Tika
        metadata.add(metadataKey, DefaultTypeConverter.INSTANCE.convert(String.class, value));
    }
    catch (TypeConversionException e)
    {
        logger.info("Could not convert " + metadataKey + ": " + e.getMessage());
    }
}
/**
* This content handler will capture entries from within
* the header of the Tika content XHTML, but ignore the