mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-10-15 15:02:20 +00:00
Merged 5.1.N (5.1.2) to 5.2.N (5.2.1)
125892 adragoi: Merged 5.0.N (5.0.4) to 5.1.N (5.1.2) 125842 rmunteanu: Merged V4.2-BUG-FIX (4.2.7) to 5.0.N (5.0.4) (PARTIAL MERGE) 125700 adavis: Merged V4.2.5 (4.2.5.7) to V4.2-BUG-FIX (4.2.7) 125698: Merged DEV to V4.2.5 (4.2.5.7) 125677 arebegea: MNT-15219 : Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may cause OutOfMemory in Tika - Should not have updated version.properties as the original commit needs to be merged forwards., 125696: Merged DEV to V4.2.5 (4.2.5.7) 125677 arebegea: MNT-15219 : Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may cause OutOfMemory in Tika - Modified tika parser and tika core jars to allow some configuration parameters to be sent from Alfresco side using the metadata map parameter - Excluded by default the parsing of drawings/shapes xmls because there was little valuable data that could be extracted from those xmls git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/BRANCHES/DEV/5.2.N/root@126004 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -78,6 +78,8 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
TikaConfig config = (TikaConfig)ctx.getBean("tikaConfig");
|
||||
extracter = new TikaAutoMetadataExtracter(config);
|
||||
extracter.setDictionaryService(dictionaryService);
|
||||
MetadataExtracterConfig metadataExtracterConfig = (MetadataExtracterConfig)ctx.getBean("metadataExtracterConfig");
|
||||
extracter.setMetadataExtracterConfig(metadataExtracterConfig);
|
||||
extracter.register();
|
||||
|
||||
// Attach some extra mappings, using the Tika
|
||||
@@ -171,7 +173,62 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
testFileSpecificMetadata(mimetype, properties);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Test MNT-15219 Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may
|
||||
* cause OutOfMemory in Tika Note - doesn't use extractFromMimetype
|
||||
*/
|
||||
public void testParsingOfShapesInXLSXFiles() throws Exception
|
||||
{
|
||||
AutoDetectParser ap = new AutoDetectParser();
|
||||
|
||||
String filename = "dmsu1332-reproduced.xlsx";
|
||||
URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename);
|
||||
File file = new File(url.getFile());
|
||||
|
||||
// Cheat and ask Tika for the mime type!
|
||||
Metadata metadata = new Metadata();
|
||||
metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
|
||||
MediaType mt = ap.getDetector().detect(TikaInputStream.get(file), metadata);
|
||||
String mimetype = mt.toString();
|
||||
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug("Detected mimetype " + mimetype + " for quick test file " + filename);
|
||||
}
|
||||
|
||||
// Have it processed
|
||||
// Note that if the patched/fix from MNT-15219 is not applied,
|
||||
// or if the default false value of the content.metadataExtracter.parseShapes property is overridden
|
||||
// then the next call will throw an OutOfMemory that is dealt with by the tika metadata extracter framework
|
||||
// and it will fail at the next assert because properties extracted will be empty
|
||||
Map<QName, Serializable> properties = extractFromFile(file, mimetype);
|
||||
|
||||
// check we got something
|
||||
assertFalse("extractFromMimetype should return at least some properties, none found for " + mimetype + " - " + filename,
|
||||
properties.isEmpty());
|
||||
|
||||
if (properties.containsKey(ContentModel.PROP_AUTHOR))
|
||||
{
|
||||
assertEquals("Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
|
||||
"Udintsev, Anton (external - Project)",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
|
||||
}
|
||||
else
|
||||
{
|
||||
fail("Expected one property out of " + ContentModel.PROP_CREATOR + " and " + ContentModel.PROP_AUTHOR + " but found neither of them for "
|
||||
+ mimetype);
|
||||
}
|
||||
|
||||
// Ensure that we can also get things which are standard
|
||||
// Tika metadata properties, if we so choose to
|
||||
assertTrue("Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " not found for mimetype " + mimetype,
|
||||
properties.containsKey(TIKA_MIMETYPE_TEST_PROPERTY));
|
||||
assertEquals("Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
|
||||
mimetype,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_MIMETYPE_TEST_PROPERTY)));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean skipAuthorCheck(String mimetype) { return true; }
|
||||
|
||||
|
Reference in New Issue
Block a user