ALF-17703: TikaPoweredMetadataExtracter Destroys Content on Failed Embed

- Removed catch of exception and closing of output stream in TikaPoweredMetadataExtracter to allow AbstractMappingMetadataExtracter to better handle the error
   - Added catch of ContentIOException during construction of error details in AbstractMappingMetadataExtracter
   - Added ContentMetadataEmbedderTest to test a failing embedder

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@45949 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Ray Gauss
2013-01-28 19:30:31 +00:00
parent c7aea42dc2
commit adef628ee9
5 changed files with 248 additions and 43 deletions

View File

@@ -26,6 +26,7 @@ import org.alfresco.repo.action.evaluator.ComparePropertyValueEvaluatorTest;
import org.alfresco.repo.action.evaluator.HasAspectEvaluatorTest; import org.alfresco.repo.action.evaluator.HasAspectEvaluatorTest;
import org.alfresco.repo.action.evaluator.IsSubTypeEvaluatorTest; import org.alfresco.repo.action.evaluator.IsSubTypeEvaluatorTest;
import org.alfresco.repo.action.executer.AddFeaturesActionExecuterTest; import org.alfresco.repo.action.executer.AddFeaturesActionExecuterTest;
import org.alfresco.repo.action.executer.ContentMetadataEmbedderTest;
import org.alfresco.repo.action.executer.ContentMetadataExtracterTest; import org.alfresco.repo.action.executer.ContentMetadataExtracterTest;
import org.alfresco.repo.action.executer.RemoveFeaturesActionExecuterTest; import org.alfresco.repo.action.executer.RemoveFeaturesActionExecuterTest;
import org.alfresco.repo.action.executer.SetPropertyValueActionExecuterTest; import org.alfresco.repo.action.executer.SetPropertyValueActionExecuterTest;
@@ -66,6 +67,7 @@ public class ActionTestSuite extends TestSuite
suite.addTestSuite(SetPropertyValueActionExecuterTest.class); suite.addTestSuite(SetPropertyValueActionExecuterTest.class);
suite.addTestSuite(AddFeaturesActionExecuterTest.class); suite.addTestSuite(AddFeaturesActionExecuterTest.class);
suite.addTestSuite(ContentMetadataExtracterTest.class); suite.addTestSuite(ContentMetadataExtracterTest.class);
suite.addTestSuite(ContentMetadataEmbedderTest.class);
suite.addTestSuite(SpecialiseTypeActionExecuterTest.class); suite.addTestSuite(SpecialiseTypeActionExecuterTest.class);
suite.addTestSuite(RemoveFeaturesActionExecuterTest.class); suite.addTestSuite(RemoveFeaturesActionExecuterTest.class);
suite.addTestSuite(ActionTrackingServiceImplTest.class); suite.addTestSuite(ActionTrackingServiceImplTest.class);

View File

@@ -140,7 +140,7 @@ public class ContentMetadataEmbedder extends ActionExecuterAbstractBase
if (logger.isDebugEnabled()) if (logger.isDebugEnabled())
{ {
logger.debug( logger.debug(
"Meetadata embedding failed: \n" + "Metadata embedding failed: \n" +
" Extracter: " + this + "\n" + " Extracter: " + this + "\n" +
" Node: " + actionedUponNodeRef + "\n" + " Node: " + actionedUponNodeRef + "\n" +
" Content: " + writer, " Content: " + writer,

View File

@@ -0,0 +1,210 @@
/*
* Copyright (C) 2005-2012 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.action.executer;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.action.ActionImpl;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.metadata.MetadataExtracterRegistry;
import org.alfresco.repo.content.metadata.TikaPoweredMetadataExtracter;
import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
import org.alfresco.repo.security.authentication.AuthenticationComponent;
import org.alfresco.service.cmr.dictionary.DictionaryService;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentService;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.MimetypeService;
import org.alfresco.service.cmr.repository.NodeRef;
import org.alfresco.service.cmr.repository.NodeService;
import org.alfresco.service.cmr.repository.StoreRef;
import org.alfresco.service.namespace.QName;
import org.alfresco.util.BaseSpringTest;
import org.alfresco.util.GUID;
import org.apache.tika.embedder.Embedder;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
/**
 * Test of the ActionExecuter for embedding metadata.
 * <p>
 * Each test runs against a freshly created store containing a single content
 * node populated with the "quick" PDF test file.
 *
 * @author Ray Gauss II
 */
public class ContentMetadataEmbedderTest extends BaseSpringTest
{
    /** Identifier handed to the {@link ActionImpl} created by the tests. */
    private static final String ID = GUID.generate();

    private NodeService nodeService;
    private ContentService contentService;
    private DictionaryService dictionaryService;
    private MimetypeService mimetypeService;

    private StoreRef testStoreRef;
    private NodeRef rootNodeRef;
    private NodeRef nodeRef;

    private ContentMetadataEmbedder executer;

    /**
     * Looks up the required services, switches to the system user and creates
     * the PDF content node that the tests act upon.
     */
    @Override
    protected void onSetUpInTransaction() throws Exception
    {
        this.nodeService = (NodeService) this.applicationContext.getBean("nodeService");
        this.contentService = (ContentService) this.applicationContext.getBean("contentService");
        this.dictionaryService = (DictionaryService) this.applicationContext.getBean("dictionaryService");
        this.mimetypeService = (MimetypeService) this.applicationContext.getBean("mimetypeService");

        AuthenticationComponent authenticationComponent =
                (AuthenticationComponent) applicationContext.getBean("authenticationComponent");
        authenticationComponent.setSystemUserAsCurrentUser();

        // Create the store and get the root node
        this.testStoreRef = this.nodeService.createStore(
                StoreRef.PROTOCOL_WORKSPACE,
                "Test_" + System.currentTimeMillis());
        this.rootNodeRef = this.nodeService.getRootNode(this.testStoreRef);

        // Create the node used for tests
        this.nodeRef = this.nodeService.createNode(
                this.rootNodeRef, ContentModel.ASSOC_CHILDREN,
                QName.createQName("{test}testnode"),
                ContentModel.TYPE_CONTENT).getChildRef();

        // Setup the content from the PDF test data
        ContentWriter cw = this.contentService.getWriter(nodeRef, ContentModel.PROP_CONTENT, true);
        cw.setMimetype(MimetypeMap.MIMETYPE_PDF);
        cw.putContent(AbstractContentTransformerTest.loadQuickTestFile("pdf"));

        // Get the executer instance
        this.executer = (ContentMetadataEmbedder) this.applicationContext.getBean("embed-metadata");
    }

    /**
     * Test that a failing embedder does not destroy the original content.
     * <p>
     * A {@link FailingEmbedder} is registered for PDF, the embed action is
     * executed, and the size of the node's content is compared before and after.
     */
    public void testFailingEmbedder()
    {
        MetadataExtracterRegistry registry =
                (MetadataExtracterRegistry) applicationContext.getBean("metadataExtracterRegistry");

        FailingEmbedder embedder = new FailingEmbedder(Arrays.asList(MimetypeMap.MIMETYPE_PDF));
        embedder.setRegistry(registry);
        embedder.setDictionaryService(this.dictionaryService);
        embedder.setMimetypeService(this.mimetypeService);
        embedder.register();

        String myCreator = "Embedded creator";

        // Get the old props and give the node an author value to embed
        Map<QName, Serializable> props = this.nodeService.getProperties(this.nodeRef);
        props.put(ContentModel.PROP_AUTHOR, myCreator);
        this.nodeService.setProperties(this.nodeRef, props);

        // Execute the action
        ActionImpl action = new ActionImpl(null, ID, SetPropertyValueActionExecuter.NAME, null);

        ContentReader origReader = this.contentService.getReader(this.nodeRef, ContentModel.PROP_CONTENT);
        // Fail with a clear message rather than a NullPointerException below
        assertNotNull("The test node should have content before embedding", origReader);
        long origSize = origReader.getSize();
        assertTrue(origSize > 0);

        this.executer.execute(action, this.nodeRef);

        ContentReader embeddedReader = this.contentService.getReader(this.nodeRef, ContentModel.PROP_CONTENT);
        // Missing content after a failed embed is exactly the regression being guarded against
        assertNotNull("The original content should still be readable after an embed failure", embeddedReader);
        assertEquals("The original content should remain unchanged on embed failures", origSize, embeddedReader.getSize());
    }

    /**
     * Tika-powered embedder which fails upon calling embed on its {@link FailingTikaEmbedder}.
     * <p>
     * Declared static because it uses no state of the enclosing test instance.
     */
    private static class FailingEmbedder extends TikaPoweredMetadataExtracter
    {
        /**
         * Constructor for setting supported extract and embed mimetypes
         *
         * @param mimetypes the supported extract and embed mimetypes
         */
        public FailingEmbedder(Collection<String> mimetypes)
        {
            super(
                    new HashSet<String>(mimetypes),
                    new HashSet<String>(mimetypes));
        }

        /**
         * @return always <code>null</code>; this test class supplies no parser
         */
        @Override
        protected Parser getParser()
        {
            return null;
        }

        /**
         * @return a new embedder whose embed call always throws
         */
        @Override
        protected Embedder getEmbedder()
        {
            return new FailingTikaEmbedder();
        }

        /**
         * @return always <code>null</code>; the supplied URL is ignored
         */
        @Override
        protected Map<String, Set<QName>> readMappingProperties(String propertiesUrl)
        {
            return null;
        }

        /**
         * @return a single-entry mapping of the "author" key to {@link ContentModel#PROP_AUTHOR}
         */
        @Override
        protected Map<String, Set<QName>> getDefaultMapping()
        {
            Map<String, Set<QName>> mapping = new HashMap<String, Set<QName>>(1);
            Set<QName> qnames = new HashSet<QName>(1);
            qnames.add(ContentModel.PROP_AUTHOR);
            mapping.put("author", qnames);
            return mapping;
        }
    }

    /**
     * Tika metadata embedder which fails on a call to embed.
     * <p>
     * Declared static so that instances carry no hidden reference to the
     * enclosing test instance.
     */
    private static class FailingTikaEmbedder implements Embedder
    {
        private static final long serialVersionUID = -4954679684941467571L;

        /**
         * @return always <code>null</code>
         */
        @Override
        public Set<MediaType> getSupportedEmbedTypes(ParseContext context)
        {
            return null;
        }

        /**
         * Always fails without touching either stream.
         *
         * @throws IOException always, with the message "Forced failure"
         */
        @Override
        public void embed(Metadata metadata, InputStream originalStream, OutputStream outputStream, ParseContext context)
                throws IOException, TikaException
        {
            throw new IOException("Forced failure");
        }
    }
}

View File

@@ -42,6 +42,7 @@ import org.alfresco.model.ContentModel;
import org.alfresco.service.cmr.dictionary.DataTypeDefinition; import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
import org.alfresco.service.cmr.dictionary.DictionaryService; import org.alfresco.service.cmr.dictionary.DictionaryService;
import org.alfresco.service.cmr.dictionary.PropertyDefinition; import org.alfresco.service.cmr.dictionary.PropertyDefinition;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.MalformedNodeRefException; import org.alfresco.service.cmr.repository.MalformedNodeRefException;
@@ -1115,11 +1116,18 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
{ {
// Ask Tika to detect the document, and report back on if // Ask Tika to detect the document, and report back on if
// the current mime type is plausible // the current mime type is plausible
String typeErrorMessage = null; String typeErrorMessage = "";
String differentType = null; String differentType = null;
if(mimetypeService != null) if(mimetypeService != null)
{ {
differentType = mimetypeService.getMimetypeIfNotMatches(writer.getReader()); try
{
differentType = mimetypeService.getMimetypeIfNotMatches(writer.getReader());
}
catch (ContentIOException cioe)
{
// Embedding failed and writer is empty
}
} }
else else
{ {
@@ -1144,7 +1152,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
} }
else else
{ {
logger.warn( logger.error(
"Metadata embedding failed (turn on DEBUG for full error): \n" + "Metadata embedding failed (turn on DEBUG for full error): \n" +
" Extracter: " + this + "\n" + " Extracter: " + this + "\n" +
" Content: " + writer + "\n" + " Content: " + writer + "\n" +

View File

@@ -375,38 +375,23 @@ public abstract class TikaPoweredMetadataExtracter
{ {
return; return;
} }
OutputStream outputStream = null;
try Metadata metadataToEmbed = new Metadata();
for (String metadataKey : properties.keySet())
{ {
Metadata metadataToEmbed = new Metadata(); Serializable value = properties.get(metadataKey);
for (String metadataKey : properties.keySet()) if (value == null)
{ {
Serializable value = properties.get(metadataKey); continue;
if (value == null) }
{ if (value instanceof Collection<?>)
continue; {
} for (Object singleValue : (Collection<?>) value)
if (value instanceof Collection<?>)
{
for (Object singleValue : (Collection<?>) value)
{
try
{
// Convert to a string value for Tika
metadataToEmbed.add(metadataKey, DefaultTypeConverter.INSTANCE.convert(String.class, singleValue));
}
catch (TypeConversionException e)
{
logger.info("Could not convert " + metadataKey + ": " + e.getMessage());
}
}
}
else
{ {
try try
{ {
// Convert to a string value for Tika // Convert to a string value for Tika
metadataToEmbed.add(metadataKey, DefaultTypeConverter.INSTANCE.convert(String.class, value)); metadataToEmbed.add(metadataKey, DefaultTypeConverter.INSTANCE.convert(String.class, singleValue));
} }
catch (TypeConversionException e) catch (TypeConversionException e)
{ {
@@ -414,22 +399,22 @@ public abstract class TikaPoweredMetadataExtracter
} }
} }
} }
InputStream inputStream = getInputStream(reader); else
outputStream = writer.getContentOutputStream();
embedder.embed(metadataToEmbed, inputStream, outputStream, null);
}
catch (Exception e)
{
logger.error(e.getMessage(), e);
}
finally
{
if (outputStream != null)
{ {
try { outputStream.close(); } catch (Throwable e) {} try
{
// Convert to a string value for Tika
metadataToEmbed.add(metadataKey, DefaultTypeConverter.INSTANCE.convert(String.class, value));
}
catch (TypeConversionException e)
{
logger.info("Could not convert " + metadataKey + ": " + e.getMessage());
}
} }
} }
InputStream inputStream = getInputStream(reader);
OutputStream outputStream = writer.getContentOutputStream();
embedder.embed(metadataToEmbed, inputStream, outputStream, null);
} }
/** /**