MNT-18275 Change detected mimetype: pdf->ai ps->eps if extension correct

This commit is contained in:
Alan Davis
2017-09-28 21:44:41 +01:00
parent 900f8f4529
commit 3e54ada31e
2 changed files with 80 additions and 33 deletions

View File

@@ -25,20 +25,6 @@
*/
package org.alfresco.repo.content;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import org.alfresco.repo.content.encoding.ContentCharsetFinder;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.FileContentReader;
@@ -57,6 +43,20 @@ import org.springframework.extensions.config.ConfigElement;
import org.springframework.extensions.config.ConfigLookupContext;
import org.springframework.extensions.config.ConfigService;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
/**
* Provides a bidirectional mapping between well-known mimetypes and the
* registered file extensions. All mimetypes and extensions are stored and
@@ -719,6 +719,7 @@ public class MimetypeMap implements MimetypeService
try
{
type = detector.detect(inp, metadata);
type = typeBasedOnDetectedTypeAndExtension(type, filename);
logger.debug(input + " detected by Tika as being " + type.toString());
}
catch (Exception e)
@@ -743,6 +744,36 @@ public class MimetypeMap implements MimetypeService
return type;
}
/**
 * Refines a detected media type using the file name's extension.
 * <p>
 * We have a problem with .ai files, as Tika detects them as .pdf, but if we can use the
 * filename we can correct that. There is a similar problem with .eps and .ps. When the
 * detected type is one of the generic types below and the file name ends with the
 * extension registered for the matching specific type, the specific type is returned.
 * <p>
 * The extension comparison ignores case, so <code>DRAWING.AI</code> is treated the same
 * as <code>drawing.ai</code>.
 *
 * @param type     the type that was detected from the content; may be <code>null</code>
 * @param filename the name of the file the content came from; may be <code>null</code>
 * @return the more specific type when the extension matches, otherwise <code>type</code>
 *         unchanged (including when either argument is <code>null</code>)
 */
private MediaType typeBasedOnDetectedTypeAndExtension(MediaType type, String filename)
{
    if (filename == null || type == null)
    {
        return type;
    }

    // Flattened pairs: [detected type, more specific type it may really be].
    final String[] detectedAndPossibleTypes = new String[]
    {
        MIMETYPE_PDF, MIMETYPE_APPLICATION_ILLUSTRATOR,
        MIMETYPE_APPLICATION_PS, MIMETYPE_APPLICATION_EPS
    };

    final String detected = type.toString();
    for (int i = 0; i + 1 < detectedAndPossibleTypes.length; i += 2)
    {
        if (!detectedAndPossibleTypes[i].equals(detected))
        {
            continue;
        }

        final String possibleType = detectedAndPossibleTypes[i + 1];
        final String suffix = "." + getExtension(possibleType);

        // Case-insensitive "endsWith": file names often carry upper-case extensions.
        // regionMatches returns false for a negative offset, so a file name shorter
        // than the suffix simply does not match.
        final int offset = filename.length() - suffix.length();
        if (filename.regionMatches(true, offset, suffix, 0, suffix.length()))
        {
            return MediaType.parse(possibleType);
        }
    }
    return type;
}
/**
* Use Apache Tika to check if the mime type of the document really matches
* what it claims to be. This is typically used when a transformation or

View File

@@ -25,20 +25,7 @@
*/
package org.alfresco.repo.content;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.channels.FileChannel;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import junit.framework.TestCase;
import org.alfresco.service.cmr.repository.ContentData;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
@@ -52,7 +39,19 @@ import org.springframework.extensions.config.ConfigService;
import org.springframework.extensions.config.ConfigSource;
import org.springframework.extensions.config.xml.XMLConfigService;
import junit.framework.TestCase;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.channels.FileChannel;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Map;
/**
* @see org.alfresco.repo.content.MimetypeMap
@@ -238,6 +237,17 @@ public class MimetypeMapTest extends TestCase
assertEquals(MimetypeMap.MIMETYPE_VIDEO_QUICKTIME, mimetypeService.guessMimetype("file.rm", reader));
}
/**
 * Checks that the file extension refines the guessed mimetype: content starting with a
 * PDF header is reported as Illustrator for a ".ai" name and as PDF for a ".pdf" name,
 * and content starting with a PostScript header is reported as EPS for a ".eps" name
 * and as PostScript for a ".ps" name.
 */
public void testTypeBasedOnDetectedTypeAndExtension() throws Exception
{
    // Content that starts with a PDF header.
    final ContentReader pdfContent = new DummyContentReader(MimetypeMap.MIMETYPE_PDF, "%PDF\r");

    InputStream aiStream = pdfContent.getContentInputStream();
    assertEquals(
            MimetypeMap.MIMETYPE_APPLICATION_ILLUSTRATOR,
            mimetypeService.guessMimetype("file.ai", aiStream));

    InputStream pdfStream = pdfContent.getContentInputStream();
    assertEquals(
            MimetypeMap.MIMETYPE_PDF,
            mimetypeService.guessMimetype("file.pdf", pdfStream));

    // Content that starts with a PostScript header.
    final ContentReader psContent = new DummyContentReader(MimetypeMap.MIMETYPE_APPLICATION_PS, "%!PS");

    InputStream epsStream = psContent.getContentInputStream();
    assertEquals(
            MimetypeMap.MIMETYPE_APPLICATION_EPS,
            mimetypeService.guessMimetype("file.eps", epsStream));

    InputStream psStream = psContent.getContentInputStream();
    assertEquals(
            MimetypeMap.MIMETYPE_APPLICATION_PS,
            mimetypeService.guessMimetype("file.ps", psStream));
}
public void testDuplicates() throws Exception
{
setConfigService(
@@ -288,17 +298,23 @@ public class MimetypeMapTest extends TestCase
public static class DummyContentReader implements ContentReader
{
private String mimetype = MimetypeMap.MIMETYPE_HTML;
private String mimetype;
private String content;
public DummyContentReader()
{
super();
this(MimetypeMap.MIMETYPE_HTML);
}
/**
 * Creates a reader with the given mimetype and the fixed placeholder
 * content <code>&lt;X&gt;@@/Y</code>.
 *
 * @param mimetype the mimetype to store on this reader
 */
public DummyContentReader(String mimetype)
{
this(mimetype, "<X>@@/Y");
}
/**
 * Creates a reader with the given mimetype and textual content.
 *
 * @param mimetype the mimetype to store on this reader
 * @param content  the text to store as this reader's content
 */
public DummyContentReader(String mimetype, String content)
{
this.mimetype = mimetype;
this.content = content;
}
@Override
@@ -340,7 +356,7 @@ public class MimetypeMapTest extends TestCase
@Override
public InputStream getContentInputStream() throws ContentIOException
{
return new ByteArrayInputStream("<X>@@/Y".getBytes(StandardCharsets.UTF_8));
return new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8));
}
@Override