mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
When metadata extraction fails, perform the same check that content transformation now does, and give a more helpful error message if the content's claimed mime type is wrong
git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@22878 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -25,6 +25,7 @@ import java.util.Map;
|
|||||||
|
|
||||||
import org.alfresco.repo.content.MimetypeMap;
|
import org.alfresco.repo.content.MimetypeMap;
|
||||||
import org.alfresco.repo.content.encoding.ContentCharsetFinder;
|
import org.alfresco.repo.content.encoding.ContentCharsetFinder;
|
||||||
|
import org.alfresco.service.cmr.repository.ContentReader;
|
||||||
import org.alfresco.service.cmr.repository.MimetypeService;
|
import org.alfresco.service.cmr.repository.MimetypeService;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
@@ -106,4 +107,5 @@ class DummyMimetypeService implements MimetypeService
|
|||||||
public Map<String, String> getMimetypesByExtension() { return null; }
|
public Map<String, String> getMimetypesByExtension() { return null; }
|
||||||
public String guessMimetype(String filename) { return null; }
|
public String guessMimetype(String filename) { return null; }
|
||||||
public boolean isText(String mimetype) { return false;}
|
public boolean isText(String mimetype) { return false;}
|
||||||
|
public String getMimetypeIfNotMatches(ContentReader reader) { return null; }
|
||||||
}
|
}
|
@@ -660,12 +660,24 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
}
|
}
|
||||||
catch (Throwable e)
|
catch (Throwable e)
|
||||||
{
|
{
|
||||||
|
// Ask Tika to detect the document, and report back on if
|
||||||
|
// the current mime type is plausible
|
||||||
|
String typeErrorMessage = null;
|
||||||
|
String differentType = mimetypeService.getMimetypeIfNotMatches(reader.getReader());
|
||||||
|
if(differentType != null)
|
||||||
|
{
|
||||||
|
typeErrorMessage = "\n" +
|
||||||
|
" claimed mime type: " + reader.getMimetype() + "\n" +
|
||||||
|
" detected mime type: " + differentType;
|
||||||
|
}
|
||||||
|
|
||||||
if (logger.isDebugEnabled())
|
if (logger.isDebugEnabled())
|
||||||
{
|
{
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Metadata extraction failed: \n" +
|
"Metadata extraction failed: \n" +
|
||||||
" Extracter: " + this + "\n" +
|
" Extracter: " + this + "\n" +
|
||||||
" Content: " + reader,
|
" Content: " + reader +
|
||||||
|
typeErrorMessage,
|
||||||
e);
|
e);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -674,7 +686,8 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
"Metadata extraction failed (turn on DEBUG for full error): \n" +
|
"Metadata extraction failed (turn on DEBUG for full error): \n" +
|
||||||
" Extracter: " + this + "\n" +
|
" Extracter: " + this + "\n" +
|
||||||
" Content: " + reader + "\n" +
|
" Content: " + reader + "\n" +
|
||||||
" Failure: " + e.getMessage());
|
" Failure: " + e.getMessage() +
|
||||||
|
typeErrorMessage);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
finally
|
finally
|
||||||
|
@@ -27,9 +27,6 @@ import org.alfresco.service.cmr.repository.ContentWriter;
|
|||||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.tika.config.TikaConfig;
|
|
||||||
import org.apache.tika.metadata.Metadata;
|
|
||||||
import org.apache.tika.mime.MediaType;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides basic services for {@link org.alfresco.repo.content.transform.ContentTransformer}
|
* Provides basic services for {@link org.alfresco.repo.content.transform.ContentTransformer}
|
||||||
@@ -49,8 +46,6 @@ public abstract class AbstractContentTransformer2 extends ContentTransformerHelp
|
|||||||
private double averageTime = 0.0;
|
private double averageTime = 0.0;
|
||||||
private long count = 0L;
|
private long count = 0L;
|
||||||
|
|
||||||
private TikaConfig tikaConfig;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* All transformers start with an average transformation time of 0.0ms.
|
* All transformers start with an average transformation time of 0.0ms.
|
||||||
*/
|
*/
|
||||||
@@ -174,7 +169,7 @@ public abstract class AbstractContentTransformer2 extends ContentTransformerHelp
|
|||||||
|
|
||||||
// Ask Tika to detect the document, and report back on if
|
// Ask Tika to detect the document, and report back on if
|
||||||
// the current mime type is plausible
|
// the current mime type is plausible
|
||||||
String differentType = checkMimeTypeMatches(reader.getReader());
|
String differentType = getMimetypeService().getMimetypeIfNotMatches(reader.getReader());
|
||||||
|
|
||||||
// Report the error
|
// Report the error
|
||||||
if(differentType == null)
|
if(differentType == null)
|
||||||
@@ -269,54 +264,4 @@ public abstract class AbstractContentTransformer2 extends ContentTransformerHelp
|
|||||||
double diffTime = ((double) transformationTime) - averageTime;
|
double diffTime = ((double) transformationTime) - averageTime;
|
||||||
averageTime += diffTime / (double) count;
|
averageTime += diffTime / (double) count;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Use Apache Tika to check if the mime type of the document really matches
|
|
||||||
* what it claims to be.
|
|
||||||
* This is typically used when a transformation fails, and you want to know
|
|
||||||
* if someone has renamed a file and consequently it has the wrong mime type.
|
|
||||||
* @return Null if the mime type seems ok, otherwise the mime type it probably is
|
|
||||||
*/
|
|
||||||
protected String checkMimeTypeMatches(ContentReader reader)
|
|
||||||
{
|
|
||||||
if(tikaConfig == null)
|
|
||||||
{
|
|
||||||
try {
|
|
||||||
tikaConfig = TikaConfig.getDefaultConfig();
|
|
||||||
} catch(Exception e) {
|
|
||||||
logger.warn("Error creating Tika detector", e);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Metadata metadata = new Metadata();
|
|
||||||
MediaType type;
|
|
||||||
try {
|
|
||||||
type = tikaConfig.getMimeRepository().detect(
|
|
||||||
reader.getContentInputStream(), metadata
|
|
||||||
);
|
|
||||||
logger.debug(reader + " detected by Tika as being " + type.toString());
|
|
||||||
} catch(Exception e) {
|
|
||||||
logger.warn("Error identifying content type of problem document", e);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Is it a good match?
|
|
||||||
if(type.toString().equals(reader.getMimetype()))
|
|
||||||
{
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Is it close?
|
|
||||||
MediaType claimed = MediaType.parse(reader.getMimetype());
|
|
||||||
if(tikaConfig.getMediaTypeRegistry().isSpecializationOf(claimed, type) ||
|
|
||||||
tikaConfig.getMediaTypeRegistry().isSpecializationOf(type, claimed))
|
|
||||||
{
|
|
||||||
// Probably close enough
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
// If we get here, then most likely the type is wrong
|
|
||||||
return type.toString();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user