Merged 5.1.N (5.1.2) to 5.2.N (5.2.1)

125892 adragoi: Merged 5.0.N (5.0.4) to 5.1.N (5.1.2)
      125842 rmunteanu: Merged V4.2-BUG-FIX (4.2.7) to 5.0.N (5.0.4) (PARTIAL MERGE)
         125700 adavis: Merged V4.2.5 (4.2.5.7) to V4.2-BUG-FIX (4.2.7)
            125698: Merged DEV to V4.2.5 (4.2.5.7)
               125677 arebegea: MNT-15219 : Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may cause OutOfMemory in Tika
                  - Should not have updated version.properties as the original commit needs to be merged forwards.
            125696: Merged DEV to V4.2.5 (4.2.5.7)
               125677 arebegea: MNT-15219 : Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may cause OutOfMemory in Tika
                  - Modified tika parser and tika core jars to allow some configuration parameters to be sent from Alfresco side using the metadata map parameter
                  - Excluded by default the parsing of drawings/shapes xmls because there was little valuable data that could be extracted from those xmls


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/BRANCHES/DEV/5.2.N/root@126004 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Alexandra Leahu
2016-04-29 11:36:11 +00:00
parent 72b275c451
commit fc20674988
13 changed files with 254 additions and 9 deletions

View File

@@ -78,6 +78,8 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
TikaConfig config = (TikaConfig)ctx.getBean("tikaConfig");
extracter = new TikaAutoMetadataExtracter(config);
extracter.setDictionaryService(dictionaryService);
MetadataExtracterConfig metadataExtracterConfig = (MetadataExtracterConfig)ctx.getBean("metadataExtracterConfig");
extracter.setMetadataExtracterConfig(metadataExtracterConfig);
extracter.register();
// Attach some extra mappings, using the Tika
@@ -171,7 +173,62 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
testFileSpecificMetadata(mimetype, properties);
}
}
/**
 * Test MNT-15219: Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may
 * cause OutOfMemory in Tika.
 * <p>
 * Note - doesn't use extractFromMimetype; it calls extractFromFile on a single quick file.
 *
 * @throws Exception if the quick file cannot be resolved, opened or processed
 */
public void testParsingOfShapesInXLSXFiles() throws Exception
{
    AutoDetectParser ap = new AutoDetectParser();

    String filename = "dmsu1332-reproduced.xlsx";
    URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename);
    // Fail with a readable message rather than a NullPointerException when the resource is missing
    assertNotNull("Quick test file not found on the classpath: quick/" + filename, url);
    // Convert through the URI so that URL-escaped characters (e.g. %20 for a space) are decoded
    File file = new File(url.toURI());

    // Cheat and ask Tika for the mime type!
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
    MediaType mt;
    TikaInputStream detectionStream = TikaInputStream.get(file);
    try
    {
        mt = ap.getDetector().detect(detectionStream, metadata);
    }
    finally
    {
        // The stream is only needed for detection; always release it
        detectionStream.close();
    }
    String mimetype = mt.toString();

    if (logger.isDebugEnabled())
    {
        logger.debug("Detected mimetype " + mimetype + " for quick test file " + filename);
    }

    // Have it processed
    // Note that if the patch/fix from MNT-15219 is not applied,
    // or if the default false value of the content.metadataExtracter.parseShapes property is overridden
    // then the next call will throw an OutOfMemory that is dealt with by the tika metadata extracter framework
    // and it will fail at the next assert because properties extracted will be empty
    Map<QName, Serializable> properties = extractFromFile(file, mimetype);

    // check we got something
    assertFalse("extractFromFile should return at least some properties, none found for " + mimetype + " - " + filename,
            properties.isEmpty());

    // The author must be present and must carry the expected value
    assertTrue("Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
            properties.containsKey(ContentModel.PROP_AUTHOR));
    assertEquals("Property " + ContentModel.PROP_AUTHOR + " incorrect for mimetype " + mimetype,
            "Udintsev, Anton (external - Project)",
            DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));

    // Ensure that we can also get things which are standard
    // Tika metadata properties, if we so choose to
    assertTrue("Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " not found for mimetype " + mimetype,
            properties.containsKey(TIKA_MIMETYPE_TEST_PROPERTY));
    assertEquals("Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
            mimetype,
            DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_MIMETYPE_TEST_PROPERTY)));
}
/**
 * Tells the caller to skip the author check, whatever the mimetype.
 *
 * @param mimetype the mimetype being tested; not consulted
 * @return always <code>true</code>
 */
@Override
protected boolean skipAuthorCheck(String mimetype)
{
    return true;
}