diff --git a/source/java/org/alfresco/repo/action/executer/TransformActionExecuterTest.java b/source/java/org/alfresco/repo/action/executer/TransformActionExecuterTest.java index 11cc4d0c97..50d013943d 100644 --- a/source/java/org/alfresco/repo/action/executer/TransformActionExecuterTest.java +++ b/source/java/org/alfresco/repo/action/executer/TransformActionExecuterTest.java @@ -25,6 +25,7 @@ import java.util.Map; import org.alfresco.repo.content.MimetypeMap; import org.alfresco.repo.content.encoding.ContentCharsetFinder; +import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.MimetypeService; import org.junit.Test; @@ -106,4 +107,5 @@ class DummyMimetypeService implements MimetypeService public Map getMimetypesByExtension() { return null; } public String guessMimetype(String filename) { return null; } public boolean isText(String mimetype) { return false;} + public String getMimetypeIfNotMatches(ContentReader reader) { return null; } } \ No newline at end of file diff --git a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java index 9e8c7a71b9..8dcb9c1672 100644 --- a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java @@ -660,12 +660,24 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac } catch (Throwable e) { + // Ask Tika to detect the document and report whether the claimed mime type + // is plausible; the suffix stays empty (never null) so the logs below never end in "null" + String typeErrorMessage = ""; + String differentType = mimetypeService.getMimetypeIfNotMatches(reader.getReader()); + if(differentType != null) + { + typeErrorMessage = "\n" + + " claimed mime type: " + reader.getMimetype() + "\n" + + " detected mime type: " + differentType; + } + if (logger.isDebugEnabled()) { logger.debug( "Metadata 
extraction failed: \n" + " Extracter: " + this + "\n" + - " Content: " + reader, + " Content: " + reader + + typeErrorMessage, e); } else @@ -674,7 +686,8 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac "Metadata extraction failed (turn on DEBUG for full error): \n" + " Extracter: " + this + "\n" + " Content: " + reader + "\n" + - " Failure: " + e.getMessage()); + " Failure: " + e.getMessage() + + typeErrorMessage); } } finally diff --git a/source/java/org/alfresco/repo/content/transform/AbstractContentTransformer2.java b/source/java/org/alfresco/repo/content/transform/AbstractContentTransformer2.java index 506099f6c7..d5a65121d2 100644 --- a/source/java/org/alfresco/repo/content/transform/AbstractContentTransformer2.java +++ b/source/java/org/alfresco/repo/content/transform/AbstractContentTransformer2.java @@ -27,9 +27,6 @@ import org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.TransformationOptions; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.tika.config.TikaConfig; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; /** * Provides basic services for {@link org.alfresco.repo.content.transform.ContentTransformer} @@ -49,8 +46,6 @@ public abstract class AbstractContentTransformer2 extends ContentTransformerHelp private double averageTime = 0.0; private long count = 0L; - private TikaConfig tikaConfig; - /** * All transformers start with an average transformation time of 0.0ms. 
*/ @@ -174,7 +169,7 @@ public abstract class AbstractContentTransformer2 extends ContentTransformerHelp // Ask Tika to detect the document, and report back on if // the current mime type is plausible - String differentType = checkMimeTypeMatches(reader.getReader()); + String differentType = getMimetypeService().getMimetypeIfNotMatches(reader.getReader()); // Report the error if(differentType == null) @@ -269,54 +264,4 @@ public abstract class AbstractContentTransformer2 extends ContentTransformerHelp double diffTime = ((double) transformationTime) - averageTime; averageTime += diffTime / (double) count; } - - /** - * Use Apache Tika to check if the mime type of the document really matches - * what it claims to be. - * This is typically used when a transformation fails, and you want to know - * if someone has renamed a file and consequently it has the wrong mime type. - * @return Null if the mime type seems ok, otherwise the mime type it probably is - */ - protected String checkMimeTypeMatches(ContentReader reader) - { - if(tikaConfig == null) - { - try { - tikaConfig = TikaConfig.getDefaultConfig(); - } catch(Exception e) { - logger.warn("Error creating Tika detector", e); - return null; - } - } - - Metadata metadata = new Metadata(); - MediaType type; - try { - type = tikaConfig.getMimeRepository().detect( - reader.getContentInputStream(), metadata - ); - logger.debug(reader + " detected by Tika as being " + type.toString()); - } catch(Exception e) { - logger.warn("Error identifying content type of problem document", e); - return null; - } - - // Is it a good match? - if(type.toString().equals(reader.getMimetype())) - { - return null; - } - - // Is it close? 
- MediaType claimed = MediaType.parse(reader.getMimetype()); - if(tikaConfig.getMediaTypeRegistry().isSpecializationOf(claimed, type) || - tikaConfig.getMediaTypeRegistry().isSpecializationOf(type, claimed)) - { - // Probably close enough - return null; - } - - // If we get here, then most likely the type is wrong - return type.toString(); - } }