diff --git a/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/java/org/alfresco/transformer/AIOTransformRegistryTest.java b/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/java/org/alfresco/transformer/AIOTransformRegistryTest.java index 5988b4e7..230e2454 100644 --- a/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/java/org/alfresco/transformer/AIOTransformRegistryTest.java +++ b/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/java/org/alfresco/transformer/AIOTransformRegistryTest.java @@ -48,6 +48,7 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import java.util.stream.Stream; import static org.alfresco.transformer.util.RequestParamMap.PAGE_LIMIT; import static org.alfresco.transformer.util.RequestParamMap.TRANSFORM_NAME_PARAMETER; @@ -97,7 +98,7 @@ public class AIOTransformRegistryTest "Archive", "OutlookMsg", "PdfBox", "Office", "Poi", "OOXML", "TikaAuto", "TextMining"); List expectedTransformOptionNames = Arrays.asList("tikaOptions", "archiveOptions", "pdfboxOptions", - "textToPdfOptions", "stringOptions"); + "textToPdfOptions", "stringOptions", "metadataOptions"); TransformConfig miscConfig = loadConfig("misc_engine_config.json"); TransformConfig tikaConfig = loadConfig("tika_engine_config.json"); @@ -116,8 +117,11 @@ public class AIOTransformRegistryTest } // check correct number of options + long distinctOptionCount = Stream.concat( + miscConfig.getTransformOptions().keySet().stream(), + tikaConfig.getTransformOptions().keySet().stream()).distinct().count(); assertEquals("Number of expected transformers", - miscConfig.getTransformOptions().size() + tikaConfig.getTransformOptions().size(), + distinctOptionCount, aioTransformerRegistry.getTransformConfig().getTransformOptions().size()); Set actualOptionNames = aioTransformerRegistry.getTransformConfig().getTransformOptions().keySet(); @@ -125,7 +129,7 @@ public class 
AIOTransformRegistryTest // check all options are there for (String optionName : expectedTransformOptionNames) { - assertTrue("Expected transform option missing.", actualOptionNames.contains(optionName)); + assertTrue("Expected transform option missing:"+optionName, actualOptionNames.contains(optionName)); } } diff --git a/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/resources/misc_engine_config.json b/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/resources/misc_engine_config.json index 60f15114..37fbafe4 100644 --- a/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/resources/misc_engine_config.json +++ b/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/resources/misc_engine_config.json @@ -5,6 +5,9 @@ ], "stringOptions": [ {"value": {"name": "targetEncoding"}} + ], + "metadataOptions": [ + {"value": {"name": "extractMapping"}} ] }, "transformers": [ @@ -77,6 +80,7 @@ {"sourceMediaType": "application/xhtml+xml", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -85,6 +89,7 @@ {"sourceMediaType": "message/rfc822", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] } ] diff --git a/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/resources/tika_engine_config.json b/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/resources/tika_engine_config.json index 451cc2ab..39f4a8c1 100644 --- a/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/resources/tika_engine_config.json +++ b/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/resources/tika_engine_config.json @@ -10,6 +10,9 @@ "pdfboxOptions": [ {"value": {"name": "notExtractBookmarksText"}}, {"value": {"name": "targetEncoding"}} + ], + "metadataOptions": [ + {"value": {"name": "extractMapping"}} ] }, "transformers": [ @@ -520,6 +523,7 @@ {"sourceMediaType": "image/x-dwg", "targetMediaType": 
"alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -528,6 +532,7 @@ {"sourceMediaType": "application/vnd.ms-outlook", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -536,6 +541,7 @@ {"sourceMediaType": "audio/mpeg", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -556,6 +562,7 @@ {"sourceMediaType": "application/x-tika-ooxml-protected", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -601,6 +608,7 @@ {"sourceMediaType": "application/x-vnd.oasis.opendocument.graphics", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -610,6 +618,7 @@ {"sourceMediaType": "application/illustrator", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -645,6 +654,7 @@ {"sourceMediaType": "application/vnd.ms-excel.template.macroenabled.12", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -662,6 +672,7 @@ {"sourceMediaType": "video/mp4", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -968,6 +979,7 @@ {"sourceMediaType": "image/hfa", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] } ] diff --git a/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/java/org/alfresco/transformer/MiscControllerTest.java b/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/java/org/alfresco/transformer/MiscControllerTest.java index 4a1f1313..cdfc2721 100644 --- a/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/java/org/alfresco/transformer/MiscControllerTest.java +++ b/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/java/org/alfresco/transformer/MiscControllerTest.java @@ -2,7 +2,7 @@ * #%L * Alfresco 
Transform Core * %% - * Copyright (C) 2005 - 2019 Alfresco Software Limited + * Copyright (C) 2005 - 2020 Alfresco Software Limited * %% * This file is part of the Alfresco software. * - @@ -150,6 +150,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest MIMETYPE_TEXT_PLAIN, null, null, + null, readTestFile("eml")); assertTrue("Content from eml transform didn't contain expected value. ", result.getResponse().getContentAsString().contains(expected)); @@ -169,7 +170,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest MIMETYPE_TEXT_PLAIN, null, null, - readTestFile("spanish.eml")); + null, readTestFile("spanish.eml")); String contentResult = new String(result.getResponse().getContentAsByteArray(), UTF_8); assertTrue("Content from eml transform didn't contain expected value. ", @@ -191,6 +192,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest MIMETYPE_TEXT_PLAIN, null, null, + null, readTestFile("attachment.eml")); assertTrue("Content from eml transform didn't contain expected value. ", result.getResponse().getContentAsString().contains(expected)); @@ -211,6 +213,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest MIMETYPE_TEXT_PLAIN, null, null, + null, readTestFile("alternative.eml")); assertTrue("Content from eml transform didn't contain expected value. ", result.getResponse().getContentAsString().contains(expected)); @@ -230,11 +233,77 @@ public class MiscControllerTest extends AbstractTransformerControllerTest MIMETYPE_TEXT_PLAIN, null, null, + null, readTestFile("nested.alternative.eml")); assertTrue("Content from eml transform didn't contain expected value. 
", result.getResponse().getContentAsString().contains(expected)); } + /** + * Test extracting default metadata from a valid eml file + */ + @Test + public void testExtractMetadataRFC822() throws Exception + { + String expected = + "{\"{http://www.alfresco.org/model/content/1.0}addressee\":\"Nevin Nollop \"," + + "\"{http://www.alfresco.org/model/content/1.0}description\":\"The quick brown fox jumps over the lazy dog\"," + + "\"{http://www.alfresco.org/model/content/1.0}addressees\":\"Nevin Nollop \"," + + "\"{http://www.alfresco.org/model/imap/1.0}dateSent\":1086351802000," + + "\"{http://www.alfresco.org/model/imap/1.0}messageTo\":\"Nevin Nollop \"," + + "\"{http://www.alfresco.org/model/imap/1.0}messageId\":\"<20040604122322.GV1905@phoenix.home>\"," + + "\"{http://www.alfresco.org/model/content/1.0}title\":\"The quick brown fox jumps over the lazy dog\"," + + "\"{http://www.alfresco.org/model/imap/1.0}messageSubject\":\"The quick brown fox jumps over the lazy dog\"," + + "\"{http://www.alfresco.org/model/imap/1.0}messageCc\":\"Nevin Nollop \"," + + "\"{http://www.alfresco.org/model/content/1.0}sentdate\":1086351802000," + + "\"{http://www.alfresco.org/model/content/1.0}subjectline\":\"The quick brown fox jumps over the lazy dog\"," + + "\"{http://www.alfresco.org/model/imap/1.0}messageFrom\":\"Nevin Nollop \"," + + "\"{http://www.alfresco.org/model/content/1.0}originator\":\"Nevin Nollop \"}"; + MvcResult result = sendRequest("eml", + null, + MIMETYPE_RFC822, + "json", + "alfresco-metadata-extract", + null, + null, + null, + readTestFile("eml")); + String metadata = result.getResponse().getContentAsString(); + assertEquals("Metadata extract", expected, metadata); + } + + /** + * Test extracting metadata specified in an option from a valid eml file + */ + @Test + public void testExtractMetadataOptionRFC822() throws Exception + { + // 
{"messageSubject":["{http://www.alfresco.org/model/imap/1.0}messageSubject","{http://www.alfresco.org/model/content/1.0}subjectline","{http://www.alfresco.org/model/content/1.0}description","{http://www.alfresco.org/model/content/1.0}title"],"Thread-Index":["{http://www.alfresco.org/model/imap/1.0}threadIndex"],"messageTo":["{http://www.alfresco.org/model/imap/1.0}messageTo","{http://www.alfresco.org/model/content/1.0}addressee"],"messageSent":["{http://www.alfresco.org/model/content/1.0}sentdate","{http://www.alfresco.org/model/imap/1.0}dateSent"],"Message-ID":["{http://www.alfresco.org/model/imap/1.0}messageId"],"messageCc":["{http://www.alfresco.org/model/imap/1.0}messageCc","{http://www.alfresco.org/model/content/1.0}addressees"],"messageReceived":["{http://www.alfresco.org/model/imap/1.0}dateReceived"],"messageFrom":["{http://www.alfresco.org/model/imap/1.0}messageFrom","{http://www.alfresco.org/model/content/1.0}originator"]} + String extractMapping = + "{\"messageSubject\":[" + + "\"{http://www.alfresco.org/model/imap/1.0}messageSubject\"," + + "\"{http://www.alfresco.org/model/content/1.0}title\"]," + + "\"Thread-Index\":[" + + "\"{http://www.alfresco.org/model/imap/1.0}threadIndex\"]," + + "\"messageFrom\":[" + + "\"{http://www.alfresco.org/model/dod5015/1.0}dodProp1\"]}\n"; + String expected = + "{\"{http://www.alfresco.org/model/imap/1.0}messageSubject\":\"The quick brown fox jumps over the lazy dog\"," + + "\"{http://www.alfresco.org/model/dod5015/1.0}dodProp1\":\"Nevin Nollop \"," + + "\"{http://www.alfresco.org/model/content/1.0}title\":\"The quick brown fox jumps over the lazy dog\"}"; + MvcResult result = sendRequest("eml", + null, + MIMETYPE_RFC822, + "json", + "alfresco-metadata-extract", + null, + null, + extractMapping, + readTestFile("eml")); + String metadata = result.getResponse().getContentAsString(); + assertEquals("Option metadata extract", expected, metadata); + } + /** * Test transforming a valid eml with a html part containing html 
special characters to text */ @@ -249,6 +318,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest MIMETYPE_TEXT_PLAIN, null, null, + null, readTestFile("htmlChars.eml")); assertFalse(result.getResponse().getContentAsString().contains(expected)); } @@ -275,6 +345,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest MIMETYPE_TEXT_PLAIN, null, null, + null, expected.getBytes()); String contentResult = new String(result.getResponse().getContentAsByteArray(), @@ -304,6 +375,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest MIMETYPE_TEXT_PLAIN, "UTF-8", null, + null, content); String contentResult = new String(result.getResponse().getContentAsByteArray(), @@ -324,6 +396,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest MIMETYPE_TEXT_PLAIN, "UTF-8", null, + null, content); assertEquals("Returned content should be empty for an empty source file", 0, @@ -349,6 +422,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest MIMETYPE_PDF, null, "1", + null, expected.getBytes()); // Read back in the PDF and check it @@ -368,7 +442,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest public void testAppleIWorksPages() throws Exception { MvcResult result = sendRequest("numbers", null, MIMETYPE_IWORK_NUMBERS, - "jpeg", MIMETYPE_IMAGE_JPEG, null, null, readTestFile("pages")); + "jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("pages")); assertTrue("Expected image content but content is empty.", result.getResponse().getContentLengthLong() > 0L); } @@ -377,7 +451,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest public void testAppleIWorksNumbers() throws Exception { MvcResult result = sendRequest("numbers", null, MIMETYPE_IWORK_NUMBERS, - "jpeg", MIMETYPE_IMAGE_JPEG, null, null, readTestFile("numbers")); + "jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("numbers")); 
assertTrue("Expected image content but content is empty.", result.getResponse().getContentLengthLong() > 0L); } @@ -386,7 +460,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest public void testAppleIWorksKey() throws Exception { MvcResult result = sendRequest("key", null, MIMETYPE_IWORK_KEYNOTE, - "jpeg", MIMETYPE_IMAGE_JPEG, null, null, readTestFile("key")); + "jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("key")); assertTrue("Expected image content but content is empty.", result.getResponse().getContentLengthLong() > 0L); } @@ -396,7 +470,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest public void testOOXML() throws Exception { MvcResult result = sendRequest("docx", null, MIMETYPE_OPENXML_WORDPROCESSING, - "jpeg", MIMETYPE_IMAGE_JPEG, null, null, readTestFile("docx")); + "jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("docx")); assertTrue("Expected image content but content is empty.", result.getResponse().getContentLengthLong() > 0L); } @@ -408,6 +482,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest String targetMimetype, String targetEncoding, String pageLimit, + String extractMapping, byte[] content) throws Exception { final MockMultipartFile sourceFile = new MockMultipartFile("file", @@ -433,6 +508,10 @@ public class MiscControllerTest extends AbstractTransformerControllerTest { requestBuilder.param("pageLimit", pageLimit); } + if (extractMapping != null) + { + requestBuilder.param("extractMapping", extractMapping); + } return mockMvc.perform(requestBuilder) .andExpect(status().is(OK.value())) diff --git a/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/resources/misc_engine_config.json b/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/resources/misc_engine_config.json index 60f15114..37fbafe4 100644 --- a/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/resources/misc_engine_config.json +++ 
b/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/resources/misc_engine_config.json @@ -5,6 +5,9 @@ ], "stringOptions": [ {"value": {"name": "targetEncoding"}} + ], + "metadataOptions": [ + {"value": {"name": "extractMapping"}} ] }, "transformers": [ @@ -77,6 +80,7 @@ {"sourceMediaType": "application/xhtml+xml", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -85,6 +89,7 @@ {"sourceMediaType": "message/rfc822", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] } ] diff --git a/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/metadataExtractors/HtmlMetadataExtractor.java b/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/metadataExtractors/HtmlMetadataExtractor.java index 6d462835..694cbd74 100644 --- a/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/metadataExtractors/HtmlMetadataExtractor.java +++ b/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/metadataExtractors/HtmlMetadataExtractor.java @@ -78,8 +78,7 @@ public class HtmlMetadataExtractor extends AbstractMetadataExtractor implements public void extractMetadata(String sourceMimetype, String targetMimetype, Map transformOptions, File sourceFile, File targetFile) throws Exception { - Map metadata = extractMetadata(sourceMimetype, transformOptions, sourceFile); - mapMetadataAndWrite(targetFile, metadata); + extractMetadata(sourceMimetype, transformOptions, sourceFile, targetFile); } @Override diff --git a/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/metadataExtractors/RFC822MetadataExtractor.java b/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/metadataExtractors/RFC822MetadataExtractor.java index de7e61c2..5f646f4e 100644 --- 
a/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/metadataExtractors/RFC822MetadataExtractor.java +++ b/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/metadataExtractors/RFC822MetadataExtractor.java @@ -86,8 +86,7 @@ public class RFC822MetadataExtractor extends AbstractMetadataExtractor implement public void extractMetadata(String sourceMimetype, String targetMimetype, Map transformOptions, File sourceFile, File targetFile) throws Exception { - Map metadata = extractMetadata(sourceMimetype, transformOptions, sourceFile); - mapMetadataAndWrite(targetFile, metadata); + extractMetadata(sourceMimetype, transformOptions, sourceFile, targetFile); } @Override diff --git a/alfresco-transform-misc/alfresco-transform-misc/src/main/resources/misc_engine_config.json b/alfresco-transform-misc/alfresco-transform-misc/src/main/resources/misc_engine_config.json index 91f8d2b2..41700f84 100644 --- a/alfresco-transform-misc/alfresco-transform-misc/src/main/resources/misc_engine_config.json +++ b/alfresco-transform-misc/alfresco-transform-misc/src/main/resources/misc_engine_config.json @@ -5,6 +5,9 @@ ], "stringOptions": [ {"value": {"name": "targetEncoding"}} + ], + "metadataOptions": [ + {"value": {"name": "extractMapping"}} ] }, "transformers": [ @@ -77,6 +80,7 @@ {"sourceMediaType": "application/xhtml+xml", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -85,6 +89,7 @@ {"sourceMediaType": "message/rfc822", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] } ] diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/tika_engine_config.json b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/tika_engine_config.json index 451cc2ab..39f4a8c1 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/tika_engine_config.json +++ 
b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/tika_engine_config.json @@ -10,6 +10,9 @@ "pdfboxOptions": [ {"value": {"name": "notExtractBookmarksText"}}, {"value": {"name": "targetEncoding"}} + ], + "metadataOptions": [ + {"value": {"name": "extractMapping"}} ] }, "transformers": [ @@ -520,6 +523,7 @@ {"sourceMediaType": "image/x-dwg", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -528,6 +532,7 @@ {"sourceMediaType": "application/vnd.ms-outlook", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -536,6 +541,7 @@ {"sourceMediaType": "audio/mpeg", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -556,6 +562,7 @@ {"sourceMediaType": "application/x-tika-ooxml-protected", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -601,6 +608,7 @@ {"sourceMediaType": "application/x-vnd.oasis.opendocument.graphics", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -610,6 +618,7 @@ {"sourceMediaType": "application/illustrator", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -645,6 +654,7 @@ {"sourceMediaType": "application/vnd.ms-excel.template.macroenabled.12", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -662,6 +672,7 @@ {"sourceMediaType": "video/mp4", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -968,6 +979,7 @@ {"sourceMediaType": "image/hfa", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] } ] diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/executors/TikaJavaExecutor.java 
b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/executors/TikaJavaExecutor.java index 4a288f9d..2faf0979 100644 --- a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/executors/TikaJavaExecutor.java +++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/executors/TikaJavaExecutor.java @@ -168,8 +168,7 @@ public class TikaJavaExecutor implements JavaExecutor throws Exception { AbstractTikaMetadataExtractor metadataExtractor = this.metadataExtractor.get(transformName); - Map metadata = metadataExtractor.extractMetadata(sourceMimetype, transformOptions, sourceFile); - metadataExtractor.mapMetadataAndWrite(targetFile, metadata); + metadataExtractor.extractMetadata(sourceMimetype, transformOptions, sourceFile, targetFile); } /** diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/tika_engine_config.json b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/tika_engine_config.json index a5deb8f3..6206f6bc 100644 --- a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/tika_engine_config.json +++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/tika_engine_config.json @@ -10,6 +10,9 @@ "pdfboxOptions": [ {"value": {"name": "notExtractBookmarksText"}}, {"value": {"name": "targetEncoding"}} + ], + "metadataOptions": [ + {"value": {"name": "extractMapping"}} ] }, "transformers": [ @@ -520,6 +523,7 @@ {"sourceMediaType": "image/x-dwg", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -528,6 +532,7 @@ {"sourceMediaType": "application/vnd.ms-outlook", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -536,6 +541,7 @@ {"sourceMediaType": "audio/mpeg", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -556,6 +562,7 @@ 
{"sourceMediaType": "application/x-tika-ooxml-protected", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -601,6 +608,7 @@ {"sourceMediaType": "application/x-vnd.oasis.opendocument.graphics", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -610,6 +618,7 @@ {"sourceMediaType": "application/illustrator", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -645,6 +654,7 @@ {"sourceMediaType": "application/vnd.ms-excel.template.macroenabled.12", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -662,6 +672,7 @@ {"sourceMediaType": "video/mp4", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] }, { @@ -968,6 +979,7 @@ {"sourceMediaType": "image/hfa", "targetMediaType": "alfresco-metadata-extract"} ], "transformOptions": [ + "metadataOptions" ] } ] diff --git a/alfresco-transformer-base/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractMetadataExtractor.java b/alfresco-transformer-base/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractMetadataExtractor.java index f1c096ef..c4589303 100644 --- a/alfresco-transformer-base/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractMetadataExtractor.java +++ b/alfresco-transformer-base/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractMetadataExtractor.java @@ -63,7 +63,7 @@ import java.util.StringTokenizer; *
  • The T-Engine's Controller class will call a method in a class that extends {@link AbstractMetadataExtractor} * based on the source and target mediatypes in the normal way.
  • *
  • The method extracts ALL available metadata is extracted from the document and then calls - * {@link #mapMetadataAndWrite(File, Map)}.
  • + * {@link #mapMetadataAndWrite(File, Map, Map)}. *
  • Selected values from the available metadata are mapped into content repository property names and values, * depending on what is defined in a {@code "_metadata_extract.properties"} file.
  • *
  • The selected values are set back to the content repository as a JSON representation of a Map, where the values @@ -95,6 +95,7 @@ public abstract class AbstractMetadataExtractor private static final String EXTRACT = "extract"; private static final String EMBED = "embed"; private static final String METADATA = "metadata"; + private static final String EXTRACT_MAPPING = "extractMapping"; private static final String NAMESPACE_PROPERTY_PREFIX = "namespace.prefix."; private static final char NAMESPACE_PREFIX = ':'; @@ -110,17 +111,18 @@ public abstract class AbstractMetadataExtractor private static final ObjectMapper jsonObjectMapper = new ObjectMapper(); protected final Logger logger; - private Map> extractMapping; + private Map> defaultExtractMapping; + private ThreadLocal>> extractMapping = new ThreadLocal<>(); private Map> embedMapping; public AbstractMetadataExtractor(Logger logger) { this.logger = logger; - extractMapping = Collections.emptyMap(); + defaultExtractMapping = Collections.emptyMap(); embedMapping = Collections.emptyMap(); try { - extractMapping = buildExtractMapping(); + defaultExtractMapping = buildExtractMapping(); embedMapping = buildEmbedMapping(); } catch (Exception e) @@ -148,7 +150,7 @@ public abstract class AbstractMetadataExtractor try { - TypeReference> typeRef = new TypeReference>() {}; + TypeReference> typeRef = new TypeReference<>() {}; return jsonObjectMapper.readValue(metadataAsJson, typeRef); } catch (JsonProcessingException e) @@ -159,7 +161,7 @@ public abstract class AbstractMetadataExtractor protected Map> getExtractMapping() { - return Collections.unmodifiableMap(extractMapping); + return Collections.unmodifiableMap(extractMapping.get()); } public Map> getEmbedMapping() @@ -432,7 +434,60 @@ public abstract class AbstractMetadataExtractor return true; } + /** + * The {@code transformOptions} may contain a replacement set of mappings. These will be used in place of the + * default mappings read from file if supplied.
+ */ + public void extractMetadata(String sourceMimetype, Map transformOptions, File sourceFile, + File targetFile) throws Exception + { + Map> mapping = getExtractMappingFromOptions(transformOptions, defaultExtractMapping); + + // Use a ThreadLocal to avoid changing method signatures of methods that currently call getExtractMapping. + try + { + extractMapping.set(mapping); + Map metadata = extractMetadata(sourceMimetype, transformOptions, sourceFile); + mapMetadataAndWrite(targetFile, metadata, mapping); + + } + finally + { + extractMapping.set(null); + } + } + + private Map> getExtractMappingFromOptions(Map transformOptions, Map> defaultExtractMapping) + { + String extractMappingOption = transformOptions.get(EXTRACT_MAPPING); + if (extractMappingOption != null) + { + try + { + TypeReference>> typeRef = new TypeReference<>() {}; + return jsonObjectMapper.readValue(extractMappingOption, typeRef); + } + catch (JsonProcessingException e) + { + throw new IllegalArgumentException("Failed to read "+ EXTRACT_MAPPING +" from request", e); + } + } + return defaultExtractMapping; + } + + /** + * @deprecated use {@link #extractMetadata(String, Map, File, File)} rather than calling this method. + * By default call the overloaded method with the default {@code extractMapping}. + */ + @Deprecated public void mapMetadataAndWrite(File targetFile, Map metadata) throws IOException + { + mapMetadataAndWrite(targetFile, metadata, defaultExtractMapping); + } + + public void mapMetadataAndWrite(File targetFile, Map metadata, + Map> extractMapping) throws IOException { if (logger.isDebugEnabled()) { @@ -440,17 +495,19 @@ public abstract class AbstractMetadataExtractor metadata.forEach((k,v) -> logger.debug(" "+k+"="+v)); } - metadata = mapRawToSystem(metadata); + metadata = mapRawToSystem(metadata, extractMapping); writeMetadata(targetFile, metadata); } /** * Based on AbstractMappingMetadataExtracter#mapRawToSystem. 
* - * @param rawMetadata Metadata keyed by document properties - * @return Returns the metadata keyed by the system properties + * @param rawMetadata Metadata keyed by document properties + * @param extractMapping Mapping between document and system properties + * @return Returns the metadata keyed by the system properties */ - private Map mapRawToSystem(Map rawMetadata) + private Map mapRawToSystem(Map rawMetadata, + Map> extractMapping) { boolean debugEnabled = logger.isDebugEnabled(); if (debugEnabled) diff --git a/pom.xml b/pom.xml index a6265824..bb429e1d 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ org.springframework.boot spring-boot-starter-parent - 2.3.1.RELEASE + 2.3.5.RELEASE @@ -24,9 +24,9 @@ 3.0.1.1 ${project.version} 1.0.2.11 - 5.15.9 + 5.15.13 2.10.3 - 3.3.5 + 3.4.1 1.24.1 4.1.2 1.4