mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Merged BRANCHES/DEV/RGAUSS/4.2-CORE-CHANGES-42861 to HEAD:
42862: Creating new branch from HEAD 43026: ALF-16403: Create the Basic Interfaces and Implementation for Metadata Embedders - Added MetadataEmbedder interface which guarantees an embed method responsible for writing the given metadata into a given content writer - Changed AbstractMappingMetadataExtracter to implement MetadataEmbedder * Added supportedEmbedMimetypes and constructor which takes it and supportedMimetypes as arguments * Added embedMapping * Added inheritDefaultEmbedMapping * Added isEmbeddingSupported * Added setEmbedMappingProperties * Added readEmbedMappingProperties for reading classname.embed.properties * Added setting of embedMapping in init method * Added checkIsEmbedSupported method * Added embed method which checks support for the mimetype, and calls embedInternal which implementations should override * Added mapSystemToRaw method, essentially a reverse of existing mapRawToSystem * Added getDefaultEmbedMapping method which assumes a reverse mapping of extract mapping if no explicit embed overrides are present * Added empty embedInternal method which does nothing rather than abstract method to minimize changes to existing code - Added notion of MetadataEmbedders to MetadataExtracterRegistry * Added embedderCache but use the existing extracterCache* locks * Added findBestEmbedders method * Added getEmbedder method 43164: ALF-16404: Create a Tika Powered Metadata Embedder - Added constructors for setting of supported embed types to TikaPoweredMetadataExtracter - Changed visibility of getInputStream to protected so subclasses can use it - Logging level changes in AbstractMappingMetadataExtracter 43165: ALF-16481: Create a Content Metadata Embedder Action Executer - Added ContentMetadataEmbedder action executer which gets an embedder for the noderef if available and sends the content reader and writer for the node ref to the embedder's embed method - Added embed-metadata action executer bean - Added embed-metadata action executer messages 43262: 
ALF-16404: Create a Tika Powered Metadata Embedder - Updated Tika which now contains implementation of TIKA-775: Embed Capabilities 43265: ALF-16404: Create a Tika Powered Metadata Embedder - Added MetadataEmbedder implementation to TikaPoweredMetadataExtracter which gets a Tika Embedder and calls its embed method git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@43268 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||
* Copyright (C) 2005-2012 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
@@ -20,11 +20,13 @@ package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.Serializable;
|
||||
import java.text.DateFormat;
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
@@ -35,8 +37,12 @@ import java.util.TimeZone;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.repo.content.filestore.FileContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
|
||||
import org.alfresco.service.cmr.repository.datatype.TypeConversionException;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.tika.embedder.Embedder;
|
||||
import org.apache.tika.io.TemporaryResources;
|
||||
import org.apache.tika.io.TikaInputStream;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
@@ -72,7 +78,9 @@ import org.xml.sax.SAXException;
|
||||
* @since 3.4
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
public abstract class TikaPoweredMetadataExtracter
|
||||
extends AbstractMappingMetadataExtracter
|
||||
implements MetadataEmbedder
|
||||
{
|
||||
protected static Log logger = LogFactory.getLog(TikaPoweredMetadataExtracter.class);
|
||||
|
||||
@@ -118,11 +126,19 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
|
||||
|
||||
public TikaPoweredMetadataExtracter(ArrayList<String> supportedMimeTypes)
|
||||
{
|
||||
this(new HashSet<String>(supportedMimeTypes));
|
||||
this(new HashSet<String>(supportedMimeTypes), null);
|
||||
}
|
||||
/**
 * Creates an extracter from list-based mimetype configuration by converting
 * both lists to sets and delegating to the set-based constructor.
 *
 * @param supportedMimeTypes      the mimetypes this extracter can extract from
 * @param supportedEmbedMimeTypes the mimetypes this extracter can embed into;
 *                                may be <code>null</code>, in which case
 *                                <code>null</code> is passed on, matching what
 *                                the single-argument constructors do
 */
public TikaPoweredMetadataExtracter(ArrayList<String> supportedMimeTypes, ArrayList<String> supportedEmbedMimeTypes)
{
    // new HashSet<String>(null) would throw a NullPointerException, so a null
    // embed list is forwarded as null rather than copied
    this(new HashSet<String>(supportedMimeTypes),
         supportedEmbedMimeTypes == null ? null : new HashSet<String>(supportedEmbedMimeTypes));
}
|
||||
public TikaPoweredMetadataExtracter(HashSet<String> supportedMimeTypes)
|
||||
{
|
||||
super(supportedMimeTypes);
|
||||
this(supportedMimeTypes, null);
|
||||
}
|
||||
public TikaPoweredMetadataExtracter(HashSet<String> supportedMimeTypes, HashSet<String> supportedEmbedMimeTypes)
|
||||
{
|
||||
super(supportedMimeTypes, supportedEmbedMimeTypes);
|
||||
|
||||
// TODO Once TIKA-451 is fixed this list will get nicer
|
||||
this.tikaDateFormats = new DateFormat[] {
|
||||
@@ -188,6 +204,18 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
|
||||
*/
|
||||
protected abstract Parser getParser();
|
||||
|
||||
/**
|
||||
* Returns the Tika Embedder to modify
|
||||
* the document.
|
||||
*
|
||||
* @return the Tika embedder
|
||||
*/
|
||||
protected Embedder getEmbedder()
|
||||
{
|
||||
// TODO make this an abstract method once more extracters support embedding
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Do we care about the contents of the
|
||||
* extracted header, or nothing at all?
|
||||
@@ -215,7 +243,7 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
|
||||
* For these cases, buffer out to a local file if not
|
||||
* already there
|
||||
*/
|
||||
private InputStream getInputStream(ContentReader reader) throws IOException {
|
||||
protected InputStream getInputStream(ContentReader reader) throws IOException {
|
||||
// Prefer the File if available, it's generally quicker
|
||||
if(reader instanceof FileContentReader)
|
||||
{
|
||||
@@ -338,6 +366,71 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada
|
||||
return rawProperties;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void embedInternal(Map<String, Serializable> properties, ContentReader reader, ContentWriter writer) throws Throwable
|
||||
{
|
||||
Embedder embedder = getEmbedder();
|
||||
if (embedder == null)
|
||||
{
|
||||
return;
|
||||
}
|
||||
OutputStream outputStream = null;
|
||||
try
|
||||
{
|
||||
Metadata metadataToEmbed = new Metadata();
|
||||
for (String metadataKey : properties.keySet())
|
||||
{
|
||||
Serializable value = properties.get(metadataKey);
|
||||
if (value == null)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
if (value instanceof Collection<?>)
|
||||
{
|
||||
for (Object singleValue : (Collection<?>) value)
|
||||
{
|
||||
try
|
||||
{
|
||||
// Convert to a string value for Tika
|
||||
metadataToEmbed.add(metadataKey, DefaultTypeConverter.INSTANCE.convert(String.class, singleValue));
|
||||
}
|
||||
catch (TypeConversionException e)
|
||||
{
|
||||
logger.info("Could not convert " + metadataKey + ": " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
try
|
||||
{
|
||||
// Convert to a string value for Tika
|
||||
metadataToEmbed.add(metadataKey, DefaultTypeConverter.INSTANCE.convert(String.class, value));
|
||||
}
|
||||
catch (TypeConversionException e)
|
||||
{
|
||||
logger.info("Could not convert " + metadataKey + ": " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
InputStream inputStream = getInputStream(reader);
|
||||
outputStream = writer.getContentOutputStream();
|
||||
embedder.embed(metadataToEmbed, inputStream, outputStream, null);
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
logger.error(e.getMessage(), e);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (outputStream != null)
|
||||
{
|
||||
try { outputStream.close(); } catch (Throwable e) {}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* This content handler will capture entries from within
|
||||
* the header of the Tika content XHTML, but ignore the
|
||||
|
Reference in New Issue
Block a user