When metadata extraction fails, perform the same mime type check that content transformation now does, and give a more helpful error message if the content's claimed mime type is wrong

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@22878 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-10-05 11:04:55 +00:00
parent f62629d5d4
commit e08fd4e940
3 changed files with 18 additions and 58 deletions

View File

@@ -25,6 +25,7 @@ import java.util.Map;
import org.alfresco.repo.content.MimetypeMap; import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.encoding.ContentCharsetFinder; import org.alfresco.repo.content.encoding.ContentCharsetFinder;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.MimetypeService; import org.alfresco.service.cmr.repository.MimetypeService;
import org.junit.Test; import org.junit.Test;
@@ -106,4 +107,5 @@ class DummyMimetypeService implements MimetypeService
public Map<String, String> getMimetypesByExtension() { return null; } public Map<String, String> getMimetypesByExtension() { return null; }
public String guessMimetype(String filename) { return null; } public String guessMimetype(String filename) { return null; }
public boolean isText(String mimetype) { return false;} public boolean isText(String mimetype) { return false;}
public String getMimetypeIfNotMatches(ContentReader reader) { return null; }
} }

View File

@@ -660,12 +660,24 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
} }
catch (Throwable e) catch (Throwable e)
{ {
// Ask Tika to detect the document, and report back on if
// the current mime type is plausible
String typeErrorMessage = null;
String differentType = mimetypeService.getMimetypeIfNotMatches(reader.getReader());
if(differentType != null)
{
typeErrorMessage = "\n" +
" claimed mime type: " + reader.getMimetype() + "\n" +
" detected mime type: " + differentType;
}
if (logger.isDebugEnabled()) if (logger.isDebugEnabled())
{ {
logger.debug( logger.debug(
"Metadata extraction failed: \n" + "Metadata extraction failed: \n" +
" Extracter: " + this + "\n" + " Extracter: " + this + "\n" +
" Content: " + reader, " Content: " + reader +
typeErrorMessage,
e); e);
} }
else else
@@ -674,7 +686,8 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
"Metadata extraction failed (turn on DEBUG for full error): \n" + "Metadata extraction failed (turn on DEBUG for full error): \n" +
" Extracter: " + this + "\n" + " Extracter: " + this + "\n" +
" Content: " + reader + "\n" + " Content: " + reader + "\n" +
" Failure: " + e.getMessage()); " Failure: " + e.getMessage() +
typeErrorMessage);
} }
} }
finally finally

View File

@@ -27,9 +27,6 @@ import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions; import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
/** /**
* Provides basic services for {@link org.alfresco.repo.content.transform.ContentTransformer} * Provides basic services for {@link org.alfresco.repo.content.transform.ContentTransformer}
@@ -49,8 +46,6 @@ public abstract class AbstractContentTransformer2 extends ContentTransformerHelp
private double averageTime = 0.0; private double averageTime = 0.0;
private long count = 0L; private long count = 0L;
private TikaConfig tikaConfig;
/** /**
* All transformers start with an average transformation time of 0.0ms. * All transformers start with an average transformation time of 0.0ms.
*/ */
@@ -174,7 +169,7 @@ public abstract class AbstractContentTransformer2 extends ContentTransformerHelp
// Ask Tika to detect the document, and report back on if // Ask Tika to detect the document, and report back on if
// the current mime type is plausible // the current mime type is plausible
String differentType = checkMimeTypeMatches(reader.getReader()); String differentType = getMimetypeService().getMimetypeIfNotMatches(reader.getReader());
// Report the error // Report the error
if(differentType == null) if(differentType == null)
@@ -269,54 +264,4 @@ public abstract class AbstractContentTransformer2 extends ContentTransformerHelp
double diffTime = ((double) transformationTime) - averageTime; double diffTime = ((double) transformationTime) - averageTime;
averageTime += diffTime / (double) count; averageTime += diffTime / (double) count;
} }
/**
 * Use Apache Tika to check if the mime type of the document really matches
 * what it claims to be.
 * This is typically used when a transformation fails, and you want to know
 * if someone has renamed a file and consequently it has the wrong mime type.
 * <p>
 * This is a best-effort diagnostic: any problem creating the Tika
 * configuration or detecting the type is logged as a warning and reported
 * as "no mismatch", so that it never hides the failure being diagnosed.
 *
 * @param reader the content to inspect; its input stream is opened,
 *               read by the detector, and closed again before returning
 * @return Null if the mime type seems ok (or detection was not possible),
 *         otherwise the mime type it probably is
 */
protected String checkMimeTypeMatches(ContentReader reader)
{
    // Lazily create the Tika configuration on first use
    if(tikaConfig == null)
    {
        try {
            tikaConfig = TikaConfig.getDefaultConfig();
        } catch(Exception e) {
            logger.warn("Error creating Tika detector", e);
            return null;
        }
    }

    Metadata metadata = new Metadata();
    MediaType type;
    java.io.InputStream stream = null;
    try {
        stream = reader.getContentInputStream();
        type = tikaConfig.getMimeRepository().detect(stream, metadata);
        logger.debug(reader + " detected by Tika as being " + type.toString());
    } catch(Exception e) {
        logger.warn("Error identifying content type of problem document", e);
        return null;
    } finally {
        // Always release the stream we opened; previously it was leaked
        if(stream != null)
        {
            try {
                stream.close();
            } catch(Exception e) {
                // Nothing useful can be done about a failed close here
                logger.debug("Error closing content stream after type detection", e);
            }
        }
    }

    // Is it a good match?
    if(type.toString().equals(reader.getMimetype()))
    {
        return null;
    }

    // Is it close?
    MediaType claimed = MediaType.parse(reader.getMimetype());
    if(claimed == null)
    {
        // The claimed type is missing or not a parseable media type, so it
        // cannot be related to the detected one; report what was detected
        // rather than passing null on to the registry
        return type.toString();
    }
    if(tikaConfig.getMediaTypeRegistry().isSpecializationOf(claimed, type) ||
       tikaConfig.getMediaTypeRegistry().isSpecializationOf(type, claimed))
    {
        // Probably close enough
        return null;
    }

    // If we get here, then most likely the type is wrong
    return type.toString();
}
} }