From 59325bc38a9da8c2400544f0a539b69d68c710a5 Mon Sep 17 00:00:00 2001 From: Alan Davis Date: Thu, 13 Jan 2022 17:25:56 +0000 Subject: [PATCH] Repeat Bump dependency.tika.version from 2.1.0 to 2.2.1 (#516) * Repeat Bump dependency.tika.version from 2.1.0 to 2.2.1 Original PR https://github.com/Alfresco/alfresco-transform-core/pull/506 was merged to master where it failed. There had been no build of the PR before the merge, which is why this branch has been created. * Use the non-deprecated TikaCoreProperties.SUBJECT with Tika 2.2.1. The deprecated OfficeOpenXMLCore.SUBJECT value worked in 2.2.0 but not in 2.2.1. * With the upgrade of Tika from 2.2.0 to 2.2.1, the deprecated OfficeOpenXMLCore.SUBJECT metadata value became null and the replacement TikaCoreProperties.SUBJECT became multi-valued in a few of our test cases. For backward compatibility with very old versions of Alfresco, we have historically added a number of extra values, including "subject" and "description", back into the raw metadata before mapping them onto Alfresco properties. These values existed in the original version of Tika used by Alfresco, so it is possible there are custom mappings out there that use them. To complicate matters a little, our standard mappings for some types put the raw "subject" value into the cm:description property. What makes it interesting is that the extra "description" value is not used, but it holds the value that was originally in our expected metadata extract data. That is why the quick.*_metadata.json files have been modified. 
--- .../transformer/TikaMetadataExtractsIT.java | 24 +++++++++++++++++++ .../test/resources/quick.odf_metadata.json | 2 +- .../test/resources/quick.odg_metadata.json | 2 +- .../test/resources/quick.odt_metadata.json | 2 +- .../test/resources/quick.otg_metadata.json | 2 +- .../test/resources/quick.ott_metadata.json | 2 +- .../test/resources/quick.pdf_metadata.json | 2 +- .../test/resources/quick.sxw_metadata.json | 2 +- .../AbstractTikaMetadataExtractor.java | 18 +++++++------- .../AbstractMetadataExtractsIT.java | 6 ++--- .../alfresco/transformer/TestFileInfo.java | 6 +++++ pom.xml | 2 +- 12 files changed, 50 insertions(+), 20 deletions(-) diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaMetadataExtractsIT.java b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaMetadataExtractsIT.java index b5e35eb2..bcd376d7 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaMetadataExtractsIT.java +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaMetadataExtractsIT.java @@ -548,4 +548,28 @@ public class TikaMetadataExtractsIT extends AbstractMetadataExtractsIT ); } + + @ParameterizedTest + @MethodSource("tika2_2_1_upgradeFailures") + public void testTika_2_2_1_upgradeFailures(TestFileInfo testFileInfo) + { + super.testTransformation(testFileInfo); + } + + private static Stream tika2_2_1_upgradeFailures() + { + // When we upgraded to Tika 2.2.1 from 2.2.0: + // - the original OfficeOpenXMLCore.SUBJECT raw metadata value started being null. + // - the replacement TikaCoreProperties.SUBJECT raw metadata changed into a multi value + // The following test files were the ones that failed. 
+ return Stream.of( + testFile(MIMETYPE_OPENDOCUMENT_GRAPHICS_TEMPLATE, "otg", "quick.otg"), + testFile(MIMETYPE_OPENOFFICE1_WRITER, "sxw", "quick.sxw"), + testFile(MIMETYPE_OPENDOCUMENT_GRAPHICS, "odg", "quick.odg"), + testFile(MIMETYPE_OPENDOCUMENT_TEXT, "odt", "quick.odt"), + testFile(MIMETYPE_OPENDOCUMENT_TEXT_TEMPLATE, "ott", "quick.ott"), + testFile(MIMETYPE_OPENDOCUMENT_FORMULA, "odf", "quick.odf"), + testFile(MIMETYPE_PDF, "pdf", "quick.pdf") + ); + } } diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.odf_metadata.json b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.odf_metadata.json index 64a82c12..e185ac01 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.odf_metadata.json +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.odf_metadata.json @@ -1,5 +1,5 @@ { - "{http://www.alfresco.org/model/content/1.0}description" : "Gym class featuring a brown fox and lazy dog", + "{http://www.alfresco.org/model/content/1.0}description" : "Pangram, fox, dog, Gym class featuring a brown fox and lazy dog", "{http://www.alfresco.org/model/content/1.0}created" : 1138362922000, "{http://www.alfresco.org/model/content/1.0}author" : null, "{http://www.alfresco.org/model/content/1.0}title" : "The quick brown fox jumps over the lazy dog" diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.odg_metadata.json b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.odg_metadata.json index a542951b..c08f6a81 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.odg_metadata.json +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.odg_metadata.json @@ -1,5 +1,5 @@ { - "{http://www.alfresco.org/model/content/1.0}description" : "Gym class featuring a brown fox and lazy dog", + 
"{http://www.alfresco.org/model/content/1.0}description" : "Pangram, fox, dog, Gym class featuring a brown fox and lazy dog", "{http://www.alfresco.org/model/content/1.0}created" : 1138362371000, "{http://www.alfresco.org/model/content/1.0}author" : "Derek Hulley", "{http://www.alfresco.org/model/content/1.0}title" : "The quick brown fox jumps over the lazy dog" diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.odt_metadata.json b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.odt_metadata.json index 37545ead..18faa8b9 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.odt_metadata.json +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.odt_metadata.json @@ -1,5 +1,5 @@ { - "{http://www.alfresco.org/model/content/1.0}description" : "Gym class featuring a brown fox and lazy dog", + "{http://www.alfresco.org/model/content/1.0}description" : "Pangram, fox, dog, Gym class featuring a brown fox and lazy dog", "{http://www.alfresco.org/model/content/1.0}created" : 1126049640000, "{http://www.alfresco.org/model/content/1.0}author" : "Jesper Steen Møller", "{http://www.alfresco.org/model/content/1.0}title" : "The quick brown fox jumps over the lazy dog" diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.otg_metadata.json b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.otg_metadata.json index a542951b..c08f6a81 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.otg_metadata.json +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.otg_metadata.json @@ -1,5 +1,5 @@ { - "{http://www.alfresco.org/model/content/1.0}description" : "Gym class featuring a brown fox and lazy dog", + "{http://www.alfresco.org/model/content/1.0}description" : "Pangram, fox, dog, Gym class featuring a brown fox and 
lazy dog", "{http://www.alfresco.org/model/content/1.0}created" : 1138362371000, "{http://www.alfresco.org/model/content/1.0}author" : "Derek Hulley", "{http://www.alfresco.org/model/content/1.0}title" : "The quick brown fox jumps over the lazy dog" diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.ott_metadata.json b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.ott_metadata.json index 37545ead..18faa8b9 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.ott_metadata.json +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.ott_metadata.json @@ -1,5 +1,5 @@ { - "{http://www.alfresco.org/model/content/1.0}description" : "Gym class featuring a brown fox and lazy dog", + "{http://www.alfresco.org/model/content/1.0}description" : "Pangram, fox, dog, Gym class featuring a brown fox and lazy dog", "{http://www.alfresco.org/model/content/1.0}created" : 1126049640000, "{http://www.alfresco.org/model/content/1.0}author" : "Jesper Steen Møller", "{http://www.alfresco.org/model/content/1.0}title" : "The quick brown fox jumps over the lazy dog" diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.pdf_metadata.json b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.pdf_metadata.json index 0f46dcb3..8758c90f 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.pdf_metadata.json +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.pdf_metadata.json @@ -1,5 +1,5 @@ { - "{http://www.alfresco.org/model/content/1.0}description" : "Gym class featuring a brown fox and lazy dog", + "{http://www.alfresco.org/model/content/1.0}description" : "Pangram, fox, dog, Gym class featuring a brown fox and lazy dog", "{http://www.alfresco.org/model/content/1.0}created" : "2005-05-26T19:52:58Z", 
"{http://www.alfresco.org/model/content/1.0}author" : "Nevin Nollop", "{http://www.alfresco.org/model/content/1.0}title" : "The quick brown fox jumps over the lazy dog" diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.sxw_metadata.json b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.sxw_metadata.json index 37545ead..18faa8b9 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.sxw_metadata.json +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.sxw_metadata.json @@ -1,5 +1,5 @@ { - "{http://www.alfresco.org/model/content/1.0}description" : "Gym class featuring a brown fox and lazy dog", + "{http://www.alfresco.org/model/content/1.0}description" : "Pangram, fox, dog, Gym class featuring a brown fox and lazy dog", "{http://www.alfresco.org/model/content/1.0}created" : 1126049640000, "{http://www.alfresco.org/model/content/1.0}author" : "Jesper Steen Møller", "{http://www.alfresco.org/model/content/1.0}title" : "The quick brown fox jumps over the lazy dog" diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractTikaMetadataExtractor.java b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractTikaMetadataExtractor.java index c9e8ab60..e52e9394 100644 --- a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractTikaMetadataExtractor.java +++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractTikaMetadataExtractor.java @@ -2,7 +2,7 @@ * #%L * Alfresco Transform Core * %% - * Copyright (C) 2005 - 2021 Alfresco Software Limited + * Copyright (C) 2005 - 2022 Alfresco Software Limited * %% * This file is part of the Alfresco software. 
* - @@ -269,30 +269,30 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr // Get the subject and description, despite things not // being nearly as consistent as one might hope - String subject = getMetadataValue(metadata, OfficeOpenXMLCore.SUBJECT); + String subject = getMetadataValue(metadata, TikaCoreProperties.SUBJECT); String description = getMetadataValue(metadata, TikaCoreProperties.DESCRIPTION); - if(subject != null && description != null) + if (subject != null && description != null) { putRawValue(KEY_DESCRIPTION, description, rawProperties); putRawValue(KEY_SUBJECT, subject, rawProperties); } - else if(subject != null) + else if (subject != null) { putRawValue(KEY_DESCRIPTION, subject, rawProperties); putRawValue(KEY_SUBJECT, subject, rawProperties); } - else if(description != null) + else if (description != null) { putRawValue(KEY_DESCRIPTION, description, rawProperties); putRawValue(KEY_SUBJECT, description, rawProperties); } // Try for the dates two different ways too - if(metadata.get(TikaCoreProperties.CREATED) != null) + if (metadata.get(TikaCoreProperties.CREATED) != null) { putRawValue(KEY_CREATED, metadata.get(TikaCoreProperties.CREATED), rawProperties); } - else if(metadata.get(TikaCoreProperties.MODIFIED) != null) + else if (metadata.get(TikaCoreProperties.MODIFIED) != null) { putRawValue(KEY_CREATED, metadata.get(TikaCoreProperties.MODIFIED), rawProperties); } @@ -458,7 +458,7 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr public void characters(char[] ch, int start, int len) { - if(text != null) + if (text != null) { text.append(ch, start, len); } @@ -466,7 +466,7 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr public void endElement(String namespace, String localname, String qname) { - if(text != null && text.length() > 0) + if (text != null && text.length() > 0) { tags.put(qname, text.toString()); } diff --git 
a/alfresco-transformer-base/src/test/java/org/alfresco/transformer/AbstractMetadataExtractsIT.java b/alfresco-transformer-base/src/test/java/org/alfresco/transformer/AbstractMetadataExtractsIT.java index f6cd7069..97addbaa 100644 --- a/alfresco-transformer-base/src/test/java/org/alfresco/transformer/AbstractMetadataExtractsIT.java +++ b/alfresco-transformer-base/src/test/java/org/alfresco/transformer/AbstractMetadataExtractsIT.java @@ -2,7 +2,7 @@ * #%L * Alfresco Transform Core * %% - * Copyright (C) 2005 - 2021 Alfresco Software Limited + * Copyright (C) 2005 - 2022 Alfresco Software Limited * %% * This file is part of the Alfresco software. * - @@ -102,8 +102,8 @@ public abstract class AbstractMetadataExtractsIT jsonObjectMapper.writerWithDefaultPrettyPrinter().writeValue(actualMetadataFile, actualMetadata); Map expectedMetadata = readExpectedMetadata(metadataFilename, actualMetadataFile); - assertEquals(expectedMetadata, actualMetadata, - "The metadata did not match the expected value. It has been saved in "+actualMetadataFile.getAbsolutePath()); + assertEquals(expectedMetadata, actualMetadata, + sourceFile+": The metadata did not match the expected value. 
It has been saved in "+actualMetadataFile.getAbsolutePath()); actualMetadataFile.delete(); } catch (Exception e) diff --git a/alfresco-transformer-base/src/test/java/org/alfresco/transformer/TestFileInfo.java b/alfresco-transformer-base/src/test/java/org/alfresco/transformer/TestFileInfo.java index 331213a9..15b8fb70 100644 --- a/alfresco-transformer-base/src/test/java/org/alfresco/transformer/TestFileInfo.java +++ b/alfresco-transformer-base/src/test/java/org/alfresco/transformer/TestFileInfo.java @@ -76,4 +76,10 @@ public class TestFileInfo { return new TestFileInfo(mimeType, extension, path, false); } + + @Override + public String toString() + { + return path; + } } diff --git a/pom.xml b/pom.xml index 14deeb10..75d34617 100644 --- a/pom.xml +++ b/pom.xml @@ -27,7 +27,7 @@ ${dependency.jackson.version} 4.13.2 3.5.0 - 2.1.0 + 2.2.1 4.1.2 1.4