From 0105f86127520bd1b7eedacf515a37756eebc71e Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Wed, 29 Sep 2010 17:09:09 +0000 Subject: [PATCH] ALF-4448 - If a transformation fails, ask Tika if the mime type looks plausible, and if Tika thinks not, include the suggested real mime type in the exception git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@22777 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261 --- .../AbstractContentTransformer2.java | 83 +++++++++++++++++-- 1 file changed, 78 insertions(+), 5 deletions(-) diff --git a/source/java/org/alfresco/repo/content/transform/AbstractContentTransformer2.java b/source/java/org/alfresco/repo/content/transform/AbstractContentTransformer2.java index ba84ed3407..506099f6c7 100644 --- a/source/java/org/alfresco/repo/content/transform/AbstractContentTransformer2.java +++ b/source/java/org/alfresco/repo/content/transform/AbstractContentTransformer2.java @@ -27,6 +27,9 @@ import org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.TransformationOptions; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; /** * Provides basic services for {@link org.alfresco.repo.content.transform.ContentTransformer} @@ -46,6 +49,8 @@ public abstract class AbstractContentTransformer2 extends ContentTransformerHelp private double averageTime = 0.0; private long count = 0L; + private TikaConfig tikaConfig; + /** * All transformers start with an average transformation time of 0.0ms. */ @@ -167,11 +172,29 @@ public abstract class AbstractContentTransformer2 extends ContentTransformerHelp // will be prejudiced against transformers that tend to fail recordTime(60 * 1000); // 1 minute, i.e. rubbish - throw new ContentIOException("Content conversion failed: \n" + - " reader: " + reader + "\n" + - " writer: " + writer + "\n" + - " options: " + options, - e); + // Ask Tika to detect the document, and report back on if + // the current mime type is plausible + String differentType = checkMimeTypeMatches(reader.getReader()); + + // Report the error + if(differentType == null) + { + throw new ContentIOException("Content conversion failed: \n" + + " reader: " + reader + "\n" + + " writer: " + writer + "\n" + + " options: " + options, + e); + } + else + { + throw new ContentIOException("Content conversion failed: \n" + + " reader: " + reader + "\n" + + " writer: " + writer + "\n" + + " options: " + options + "\n" + + " claimed mime type: " + reader.getMimetype() + "\n" + + " detected mime type: " + differentType, + e); + } } finally { @@ -246,4 +269,54 @@ public abstract class AbstractContentTransformer2 extends ContentTransformerHelp double diffTime = ((double) transformationTime) - averageTime; averageTime += diffTime / (double) count; } + + /** + * Use Apache Tika to check if the mime type of the document really matches + * what it claims to be. + * This is typically used when a transformation fails, and you want to know + * if someone has renamed a file and consequently it has the wrong mime type. + * @return Null if the mime type seems ok, otherwise the mime type it probably is + */ + protected String checkMimeTypeMatches(ContentReader reader) + { + if(tikaConfig == null) + { + try { + tikaConfig = TikaConfig.getDefaultConfig(); + } catch(Exception e) { + logger.warn("Error creating Tika detector", e); + return null; + } + } + + Metadata metadata = new Metadata(); + MediaType type; + try { + type = tikaConfig.getMimeRepository().detect( + reader.getContentInputStream(), metadata + ); + logger.debug(reader + " detected by Tika as being " + type.toString()); + } catch(Exception e) { + logger.warn("Error identifying content type of problem document", e); + return null; + } + + // Is it a good match? + if(type.toString().equals(reader.getMimetype())) + { + return null; + } + + // Is it close? + MediaType claimed = MediaType.parse(reader.getMimetype()); + if(tikaConfig.getMediaTypeRegistry().isSpecializationOf(claimed, type) || + tikaConfig.getMediaTypeRegistry().isSpecializationOf(type, claimed)) + { + // Probably close enough + return null; + } + + // If we get here, then most likely the type is wrong + return type.toString(); + } }