ATS-829 Release T-Engines 2.3.6 (#307)

ATS-829: Release T-Core (T-Engines) 2.3.6 [trigger release]

Linked to REPO-5219 Allow AGS AMP to specify metadata extract mapping

Added an extractMapping transform option to all metadata extractors, which overrides the default extract mapping.

Upgraded 3rd party libraries to get a green build:
* Upgrade cxf-rt-transports-http and woodstox-core to avoid issues
* Upgrade to org.springframework.boot:spring-boot-starter-parent:2.3.5.RELEASE to avoid problem in org.springframework:spring-web
* Upgrade to activemq 5.15.13 to avoid problem in activemq-broker 5.15.12
This commit is contained in:
Alan Davis 2020-11-19 18:35:22 +00:00 committed by GitHub
parent 3ef6a7a788
commit 00fbb6405a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 216 additions and 28 deletions

View File

@ -48,6 +48,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.alfresco.transformer.util.RequestParamMap.PAGE_LIMIT; import static org.alfresco.transformer.util.RequestParamMap.PAGE_LIMIT;
import static org.alfresco.transformer.util.RequestParamMap.TRANSFORM_NAME_PARAMETER; import static org.alfresco.transformer.util.RequestParamMap.TRANSFORM_NAME_PARAMETER;
@ -97,7 +98,7 @@ public class AIOTransformRegistryTest
"Archive", "OutlookMsg", "PdfBox", "Office", "Poi", "OOXML", "TikaAuto", "TextMining"); "Archive", "OutlookMsg", "PdfBox", "Office", "Poi", "OOXML", "TikaAuto", "TextMining");
List<String> expectedTransformOptionNames = Arrays.asList("tikaOptions", "archiveOptions", "pdfboxOptions", List<String> expectedTransformOptionNames = Arrays.asList("tikaOptions", "archiveOptions", "pdfboxOptions",
"textToPdfOptions", "stringOptions"); "textToPdfOptions", "stringOptions", "metadataOptions");
TransformConfig miscConfig = loadConfig("misc_engine_config.json"); TransformConfig miscConfig = loadConfig("misc_engine_config.json");
TransformConfig tikaConfig = loadConfig("tika_engine_config.json"); TransformConfig tikaConfig = loadConfig("tika_engine_config.json");
@ -116,8 +117,11 @@ public class AIOTransformRegistryTest
} }
// check correct number of options // check correct number of options
long distinctOptionCount = Stream.concat(
miscConfig.getTransformOptions().keySet().stream(),
tikaConfig.getTransformOptions().keySet().stream()).distinct().count();
assertEquals("Number of expected transformers", assertEquals("Number of expected transformers",
miscConfig.getTransformOptions().size() + tikaConfig.getTransformOptions().size(), distinctOptionCount,
aioTransformerRegistry.getTransformConfig().getTransformOptions().size()); aioTransformerRegistry.getTransformConfig().getTransformOptions().size());
Set<String> actualOptionNames = aioTransformerRegistry.getTransformConfig().getTransformOptions().keySet(); Set<String> actualOptionNames = aioTransformerRegistry.getTransformConfig().getTransformOptions().keySet();
@ -125,7 +129,7 @@ public class AIOTransformRegistryTest
// check all options are there // check all options are there
for (String optionName : expectedTransformOptionNames) for (String optionName : expectedTransformOptionNames)
{ {
assertTrue("Expected transform option missing.", actualOptionNames.contains(optionName)); assertTrue("Expected transform option missing:"+optionName, actualOptionNames.contains(optionName));
} }
} }

View File

@ -5,6 +5,9 @@
], ],
"stringOptions": [ "stringOptions": [
{"value": {"name": "targetEncoding"}} {"value": {"name": "targetEncoding"}}
],
"metadataOptions": [
{"value": {"name": "extractMapping"}}
] ]
}, },
"transformers": [ "transformers": [
@ -77,6 +80,7 @@
{"sourceMediaType": "application/xhtml+xml", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/xhtml+xml", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -85,6 +89,7 @@
{"sourceMediaType": "message/rfc822", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "message/rfc822", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
} }
] ]

View File

@ -10,6 +10,9 @@
"pdfboxOptions": [ "pdfboxOptions": [
{"value": {"name": "notExtractBookmarksText"}}, {"value": {"name": "notExtractBookmarksText"}},
{"value": {"name": "targetEncoding"}} {"value": {"name": "targetEncoding"}}
],
"metadataOptions": [
{"value": {"name": "extractMapping"}}
] ]
}, },
"transformers": [ "transformers": [
@ -520,6 +523,7 @@
{"sourceMediaType": "image/x-dwg", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "image/x-dwg", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -528,6 +532,7 @@
{"sourceMediaType": "application/vnd.ms-outlook", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/vnd.ms-outlook", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -536,6 +541,7 @@
{"sourceMediaType": "audio/mpeg", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "audio/mpeg", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -556,6 +562,7 @@
{"sourceMediaType": "application/x-tika-ooxml-protected", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/x-tika-ooxml-protected", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -601,6 +608,7 @@
{"sourceMediaType": "application/x-vnd.oasis.opendocument.graphics", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/x-vnd.oasis.opendocument.graphics", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -610,6 +618,7 @@
{"sourceMediaType": "application/illustrator", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/illustrator", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -645,6 +654,7 @@
{"sourceMediaType": "application/vnd.ms-excel.template.macroenabled.12", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/vnd.ms-excel.template.macroenabled.12", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -662,6 +672,7 @@
{"sourceMediaType": "video/mp4", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "video/mp4", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -968,6 +979,7 @@
{"sourceMediaType": "image/hfa", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "image/hfa", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
} }
] ]

View File

@ -2,7 +2,7 @@
* #%L * #%L
* Alfresco Transform Core * Alfresco Transform Core
* %% * %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited * Copyright (C) 2005 - 2020 Alfresco Software Limited
* %% * %%
* This file is part of the Alfresco software. * This file is part of the Alfresco software.
* - * -
@ -150,6 +150,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
MIMETYPE_TEXT_PLAIN, MIMETYPE_TEXT_PLAIN,
null, null,
null, null,
null,
readTestFile("eml")); readTestFile("eml"));
assertTrue("Content from eml transform didn't contain expected value. ", assertTrue("Content from eml transform didn't contain expected value. ",
result.getResponse().getContentAsString().contains(expected)); result.getResponse().getContentAsString().contains(expected));
@ -169,7 +170,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
MIMETYPE_TEXT_PLAIN, MIMETYPE_TEXT_PLAIN,
null, null,
null, null,
readTestFile("spanish.eml")); null, readTestFile("spanish.eml"));
String contentResult = new String(result.getResponse().getContentAsByteArray(), UTF_8); String contentResult = new String(result.getResponse().getContentAsByteArray(), UTF_8);
assertTrue("Content from eml transform didn't contain expected value. ", assertTrue("Content from eml transform didn't contain expected value. ",
@ -191,6 +192,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
MIMETYPE_TEXT_PLAIN, MIMETYPE_TEXT_PLAIN,
null, null,
null, null,
null,
readTestFile("attachment.eml")); readTestFile("attachment.eml"));
assertTrue("Content from eml transform didn't contain expected value. ", assertTrue("Content from eml transform didn't contain expected value. ",
result.getResponse().getContentAsString().contains(expected)); result.getResponse().getContentAsString().contains(expected));
@ -211,6 +213,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
MIMETYPE_TEXT_PLAIN, MIMETYPE_TEXT_PLAIN,
null, null,
null, null,
null,
readTestFile("alternative.eml")); readTestFile("alternative.eml"));
assertTrue("Content from eml transform didn't contain expected value. ", assertTrue("Content from eml transform didn't contain expected value. ",
result.getResponse().getContentAsString().contains(expected)); result.getResponse().getContentAsString().contains(expected));
@ -230,11 +233,77 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
MIMETYPE_TEXT_PLAIN, MIMETYPE_TEXT_PLAIN,
null, null,
null, null,
null,
readTestFile("nested.alternative.eml")); readTestFile("nested.alternative.eml"));
assertTrue("Content from eml transform didn't contain expected value. ", assertTrue("Content from eml transform didn't contain expected value. ",
result.getResponse().getContentAsString().contains(expected)); result.getResponse().getContentAsString().contains(expected));
} }
/**
 * Sends a valid eml file for metadata extraction without an extractMapping option and
 * checks that the response body is exactly the expected JSON.
 */
@Test
public void testExtractMetadataRFC822() throws Exception
{
    // Load the fixture up front so the request call below reads as a plain argument list.
    final byte[] emlContent = readTestFile("eml");

    // The comparison is an exact string match, so property order matters here.
    final String expectedJson =
        "{\"{http://www.alfresco.org/model/content/1.0}addressee\":\"Nevin Nollop <nevin.nollop@gmail.com>\"," +
        "\"{http://www.alfresco.org/model/content/1.0}description\":\"The quick brown fox jumps over the lazy dog\"," +
        "\"{http://www.alfresco.org/model/content/1.0}addressees\":\"Nevin Nollop <nevinn@alfresco.com>\"," +
        "\"{http://www.alfresco.org/model/imap/1.0}dateSent\":1086351802000," +
        "\"{http://www.alfresco.org/model/imap/1.0}messageTo\":\"Nevin Nollop <nevin.nollop@gmail.com>\"," +
        "\"{http://www.alfresco.org/model/imap/1.0}messageId\":\"<20040604122322.GV1905@phoenix.home>\"," +
        "\"{http://www.alfresco.org/model/content/1.0}title\":\"The quick brown fox jumps over the lazy dog\"," +
        "\"{http://www.alfresco.org/model/imap/1.0}messageSubject\":\"The quick brown fox jumps over the lazy dog\"," +
        "\"{http://www.alfresco.org/model/imap/1.0}messageCc\":\"Nevin Nollop <nevinn@alfresco.com>\"," +
        "\"{http://www.alfresco.org/model/content/1.0}sentdate\":1086351802000," +
        "\"{http://www.alfresco.org/model/content/1.0}subjectline\":\"The quick brown fox jumps over the lazy dog\"," +
        "\"{http://www.alfresco.org/model/imap/1.0}messageFrom\":\"Nevin Nollop <nevin.nollop@alfresco.com>\"," +
        "\"{http://www.alfresco.org/model/content/1.0}originator\":\"Nevin Nollop <nevin.nollop@alfresco.com>\"}";

    // No target encoding, page limit or extractMapping is supplied for this request.
    final MvcResult response = sendRequest(
        "eml", null, MIMETYPE_RFC822,
        "json", "alfresco-metadata-extract",
        null, null, null,
        emlContent);

    assertEquals("Metadata extract", expectedJson, response.getResponse().getContentAsString());
}
/**
 * Sends a valid eml file for metadata extraction together with an extractMapping option and
 * checks that the response body is exactly the expected JSON.
 */
@Test
public void testExtractMetadataOptionRFC822() throws Exception
{
    // Load the fixture up front so the request call below reads as a plain argument list.
    final byte[] emlContent = readTestFile("eml");

    // NOTE(review): the JSON below is presumably the extractor's full default mapping, kept for comparison - confirm.
    // {"messageSubject":["{http://www.alfresco.org/model/imap/1.0}messageSubject","{http://www.alfresco.org/model/content/1.0}subjectline","{http://www.alfresco.org/model/content/1.0}description","{http://www.alfresco.org/model/content/1.0}title"],"Thread-Index":["{http://www.alfresco.org/model/imap/1.0}threadIndex"],"messageTo":["{http://www.alfresco.org/model/imap/1.0}messageTo","{http://www.alfresco.org/model/content/1.0}addressee"],"messageSent":["{http://www.alfresco.org/model/content/1.0}sentdate","{http://www.alfresco.org/model/imap/1.0}dateSent"],"Message-ID":["{http://www.alfresco.org/model/imap/1.0}messageId"],"messageCc":["{http://www.alfresco.org/model/imap/1.0}messageCc","{http://www.alfresco.org/model/content/1.0}addressees"],"messageReceived":["{http://www.alfresco.org/model/imap/1.0}dateReceived"],"messageFrom":["{http://www.alfresco.org/model/imap/1.0}messageFrom","{http://www.alfresco.org/model/content/1.0}originator"]}

    // Mapping sent as the extractMapping request parameter.
    final String extractMapping =
        "{\"messageSubject\":[" +
            "\"{http://www.alfresco.org/model/imap/1.0}messageSubject\"," +
            "\"{http://www.alfresco.org/model/content/1.0}title\"]," +
        "\"Thread-Index\":[" +
            "\"{http://www.alfresco.org/model/imap/1.0}threadIndex\"]," +
        "\"messageFrom\":[" +
            "\"{http://www.alfresco.org/model/dod5015/1.0}dodProp1\"]}\n";

    // The comparison is an exact string match, so property order matters here.
    final String expectedJson =
        "{\"{http://www.alfresco.org/model/imap/1.0}messageSubject\":\"The quick brown fox jumps over the lazy dog\"," +
        "\"{http://www.alfresco.org/model/dod5015/1.0}dodProp1\":\"Nevin Nollop <nevin.nollop@alfresco.com>\"," +
        "\"{http://www.alfresco.org/model/content/1.0}title\":\"The quick brown fox jumps over the lazy dog\"}";

    // No target encoding or page limit; only the extractMapping option is supplied.
    final MvcResult response = sendRequest(
        "eml", null, MIMETYPE_RFC822,
        "json", "alfresco-metadata-extract",
        null, null, extractMapping,
        emlContent);

    assertEquals("Option metadata extract", expectedJson, response.getResponse().getContentAsString());
}
/** /**
* Test transforming a valid eml with a html part containing html special characters to text * Test transforming a valid eml with a html part containing html special characters to text
*/ */
@ -249,6 +318,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
MIMETYPE_TEXT_PLAIN, MIMETYPE_TEXT_PLAIN,
null, null,
null, null,
null,
readTestFile("htmlChars.eml")); readTestFile("htmlChars.eml"));
assertFalse(result.getResponse().getContentAsString().contains(expected)); assertFalse(result.getResponse().getContentAsString().contains(expected));
} }
@ -275,6 +345,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
MIMETYPE_TEXT_PLAIN, MIMETYPE_TEXT_PLAIN,
null, null,
null, null,
null,
expected.getBytes()); expected.getBytes());
String contentResult = new String(result.getResponse().getContentAsByteArray(), String contentResult = new String(result.getResponse().getContentAsByteArray(),
@ -304,6 +375,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
MIMETYPE_TEXT_PLAIN, MIMETYPE_TEXT_PLAIN,
"UTF-8", "UTF-8",
null, null,
null,
content); content);
String contentResult = new String(result.getResponse().getContentAsByteArray(), String contentResult = new String(result.getResponse().getContentAsByteArray(),
@ -324,6 +396,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
MIMETYPE_TEXT_PLAIN, MIMETYPE_TEXT_PLAIN,
"UTF-8", "UTF-8",
null, null,
null,
content); content);
assertEquals("Returned content should be empty for an empty source file", 0, assertEquals("Returned content should be empty for an empty source file", 0,
@ -349,6 +422,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
MIMETYPE_PDF, MIMETYPE_PDF,
null, null,
"1", "1",
null,
expected.getBytes()); expected.getBytes());
// Read back in the PDF and check it // Read back in the PDF and check it
@ -368,7 +442,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
public void testAppleIWorksPages() throws Exception public void testAppleIWorksPages() throws Exception
{ {
MvcResult result = sendRequest("numbers", null, MIMETYPE_IWORK_NUMBERS, MvcResult result = sendRequest("numbers", null, MIMETYPE_IWORK_NUMBERS,
"jpeg", MIMETYPE_IMAGE_JPEG, null, null, readTestFile("pages")); "jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("pages"));
assertTrue("Expected image content but content is empty.", assertTrue("Expected image content but content is empty.",
result.getResponse().getContentLengthLong() > 0L); result.getResponse().getContentLengthLong() > 0L);
} }
@ -377,7 +451,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
public void testAppleIWorksNumbers() throws Exception public void testAppleIWorksNumbers() throws Exception
{ {
MvcResult result = sendRequest("numbers", null, MIMETYPE_IWORK_NUMBERS, MvcResult result = sendRequest("numbers", null, MIMETYPE_IWORK_NUMBERS,
"jpeg", MIMETYPE_IMAGE_JPEG, null, null, readTestFile("numbers")); "jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("numbers"));
assertTrue("Expected image content but content is empty.", assertTrue("Expected image content but content is empty.",
result.getResponse().getContentLengthLong() > 0L); result.getResponse().getContentLengthLong() > 0L);
} }
@ -386,7 +460,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
public void testAppleIWorksKey() throws Exception public void testAppleIWorksKey() throws Exception
{ {
MvcResult result = sendRequest("key", null, MIMETYPE_IWORK_KEYNOTE, MvcResult result = sendRequest("key", null, MIMETYPE_IWORK_KEYNOTE,
"jpeg", MIMETYPE_IMAGE_JPEG, null, null, readTestFile("key")); "jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("key"));
assertTrue("Expected image content but content is empty.", assertTrue("Expected image content but content is empty.",
result.getResponse().getContentLengthLong() > 0L); result.getResponse().getContentLengthLong() > 0L);
} }
@ -396,7 +470,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
public void testOOXML() throws Exception public void testOOXML() throws Exception
{ {
MvcResult result = sendRequest("docx", null, MIMETYPE_OPENXML_WORDPROCESSING, MvcResult result = sendRequest("docx", null, MIMETYPE_OPENXML_WORDPROCESSING,
"jpeg", MIMETYPE_IMAGE_JPEG, null, null, readTestFile("docx")); "jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("docx"));
assertTrue("Expected image content but content is empty.", assertTrue("Expected image content but content is empty.",
result.getResponse().getContentLengthLong() > 0L); result.getResponse().getContentLengthLong() > 0L);
} }
@ -408,6 +482,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
String targetMimetype, String targetMimetype,
String targetEncoding, String targetEncoding,
String pageLimit, String pageLimit,
String extractMapping,
byte[] content) throws Exception byte[] content) throws Exception
{ {
final MockMultipartFile sourceFile = new MockMultipartFile("file", final MockMultipartFile sourceFile = new MockMultipartFile("file",
@ -433,6 +508,10 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
{ {
requestBuilder.param("pageLimit", pageLimit); requestBuilder.param("pageLimit", pageLimit);
} }
if (extractMapping != null)
{
requestBuilder.param("extractMapping", extractMapping);
}
return mockMvc.perform(requestBuilder) return mockMvc.perform(requestBuilder)
.andExpect(status().is(OK.value())) .andExpect(status().is(OK.value()))

View File

@ -5,6 +5,9 @@
], ],
"stringOptions": [ "stringOptions": [
{"value": {"name": "targetEncoding"}} {"value": {"name": "targetEncoding"}}
],
"metadataOptions": [
{"value": {"name": "extractMapping"}}
] ]
}, },
"transformers": [ "transformers": [
@ -77,6 +80,7 @@
{"sourceMediaType": "application/xhtml+xml", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/xhtml+xml", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -85,6 +89,7 @@
{"sourceMediaType": "message/rfc822", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "message/rfc822", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
} }
] ]

View File

@ -78,8 +78,7 @@ public class HtmlMetadataExtractor extends AbstractMetadataExtractor implements
public void extractMetadata(String sourceMimetype, String targetMimetype, Map<String, String> transformOptions, public void extractMetadata(String sourceMimetype, String targetMimetype, Map<String, String> transformOptions,
File sourceFile, File targetFile) throws Exception File sourceFile, File targetFile) throws Exception
{ {
Map<String, Serializable> metadata = extractMetadata(sourceMimetype, transformOptions, sourceFile); extractMetadata(sourceMimetype, transformOptions, sourceFile, targetFile);
mapMetadataAndWrite(targetFile, metadata);
} }
@Override @Override

View File

@ -86,8 +86,7 @@ public class RFC822MetadataExtractor extends AbstractMetadataExtractor implement
public void extractMetadata(String sourceMimetype, String targetMimetype, Map<String, String> transformOptions, public void extractMetadata(String sourceMimetype, String targetMimetype, Map<String, String> transformOptions,
File sourceFile, File targetFile) throws Exception File sourceFile, File targetFile) throws Exception
{ {
Map<String, Serializable> metadata = extractMetadata(sourceMimetype, transformOptions, sourceFile); extractMetadata(sourceMimetype, transformOptions, sourceFile, targetFile);
mapMetadataAndWrite(targetFile, metadata);
} }
@Override @Override

View File

@ -5,6 +5,9 @@
], ],
"stringOptions": [ "stringOptions": [
{"value": {"name": "targetEncoding"}} {"value": {"name": "targetEncoding"}}
],
"metadataOptions": [
{"value": {"name": "extractMapping"}}
] ]
}, },
"transformers": [ "transformers": [
@ -77,6 +80,7 @@
{"sourceMediaType": "application/xhtml+xml", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/xhtml+xml", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -85,6 +89,7 @@
{"sourceMediaType": "message/rfc822", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "message/rfc822", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
} }
] ]

View File

@ -10,6 +10,9 @@
"pdfboxOptions": [ "pdfboxOptions": [
{"value": {"name": "notExtractBookmarksText"}}, {"value": {"name": "notExtractBookmarksText"}},
{"value": {"name": "targetEncoding"}} {"value": {"name": "targetEncoding"}}
],
"metadataOptions": [
{"value": {"name": "extractMapping"}}
] ]
}, },
"transformers": [ "transformers": [
@ -520,6 +523,7 @@
{"sourceMediaType": "image/x-dwg", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "image/x-dwg", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -528,6 +532,7 @@
{"sourceMediaType": "application/vnd.ms-outlook", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/vnd.ms-outlook", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -536,6 +541,7 @@
{"sourceMediaType": "audio/mpeg", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "audio/mpeg", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -556,6 +562,7 @@
{"sourceMediaType": "application/x-tika-ooxml-protected", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/x-tika-ooxml-protected", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -601,6 +608,7 @@
{"sourceMediaType": "application/x-vnd.oasis.opendocument.graphics", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/x-vnd.oasis.opendocument.graphics", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -610,6 +618,7 @@
{"sourceMediaType": "application/illustrator", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/illustrator", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -645,6 +654,7 @@
{"sourceMediaType": "application/vnd.ms-excel.template.macroenabled.12", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/vnd.ms-excel.template.macroenabled.12", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -662,6 +672,7 @@
{"sourceMediaType": "video/mp4", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "video/mp4", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -968,6 +979,7 @@
{"sourceMediaType": "image/hfa", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "image/hfa", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
} }
] ]

View File

@ -168,8 +168,7 @@ public class TikaJavaExecutor implements JavaExecutor
throws Exception throws Exception
{ {
AbstractTikaMetadataExtractor metadataExtractor = this.metadataExtractor.get(transformName); AbstractTikaMetadataExtractor metadataExtractor = this.metadataExtractor.get(transformName);
Map<String, Serializable> metadata = metadataExtractor.extractMetadata(sourceMimetype, transformOptions, sourceFile); metadataExtractor.extractMetadata(sourceMimetype, transformOptions, sourceFile, targetFile);
metadataExtractor.mapMetadataAndWrite(targetFile, metadata);
} }
/** /**

View File

@ -10,6 +10,9 @@
"pdfboxOptions": [ "pdfboxOptions": [
{"value": {"name": "notExtractBookmarksText"}}, {"value": {"name": "notExtractBookmarksText"}},
{"value": {"name": "targetEncoding"}} {"value": {"name": "targetEncoding"}}
],
"metadataOptions": [
{"value": {"name": "extractMapping"}}
] ]
}, },
"transformers": [ "transformers": [
@ -520,6 +523,7 @@
{"sourceMediaType": "image/x-dwg", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "image/x-dwg", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -528,6 +532,7 @@
{"sourceMediaType": "application/vnd.ms-outlook", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/vnd.ms-outlook", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -536,6 +541,7 @@
{"sourceMediaType": "audio/mpeg", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "audio/mpeg", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -556,6 +562,7 @@
{"sourceMediaType": "application/x-tika-ooxml-protected", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/x-tika-ooxml-protected", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -601,6 +608,7 @@
{"sourceMediaType": "application/x-vnd.oasis.opendocument.graphics", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/x-vnd.oasis.opendocument.graphics", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -610,6 +618,7 @@
{"sourceMediaType": "application/illustrator", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/illustrator", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -645,6 +654,7 @@
{"sourceMediaType": "application/vnd.ms-excel.template.macroenabled.12", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "application/vnd.ms-excel.template.macroenabled.12", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -662,6 +672,7 @@
{"sourceMediaType": "video/mp4", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "video/mp4", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
}, },
{ {
@ -968,6 +979,7 @@
{"sourceMediaType": "image/hfa", "targetMediaType": "alfresco-metadata-extract"} {"sourceMediaType": "image/hfa", "targetMediaType": "alfresco-metadata-extract"}
], ],
"transformOptions": [ "transformOptions": [
"metadataOptions"
] ]
} }
] ]

View File

@ -63,7 +63,7 @@ import java.util.StringTokenizer;
* <li>The T-Engine's Controller class will call a method in a class that extends {@link AbstractMetadataExtractor} * <li>The T-Engine's Controller class will call a method in a class that extends {@link AbstractMetadataExtractor}
* based on the source and target mediatypes in the normal way.</li> * based on the source and target mediatypes in the normal way.</li>
* <li>The method extracts ALL available metadata is extracted from the document and then calls * <li>The method extracts ALL available metadata is extracted from the document and then calls
* {@link #mapMetadataAndWrite(File, Map)}.</li> * {@link #mapMetadataAndWrite(File, Map, Map)}.</li>
* <li>Selected values from the available metadata are mapped into content repository property names and values, * <li>Selected values from the available metadata are mapped into content repository property names and values,
* depending on what is defined in a {@code "<classname>_metadata_extract.properties"} file.</li> * depending on what is defined in a {@code "<classname>_metadata_extract.properties"} file.</li>
* <li>The selected values are set back to the content repository as a JSON representation of a Map, where the values * <li>The selected values are set back to the content repository as a JSON representation of a Map, where the values
@ -95,6 +95,7 @@ public abstract class AbstractMetadataExtractor
private static final String EXTRACT = "extract"; private static final String EXTRACT = "extract";
private static final String EMBED = "embed"; private static final String EMBED = "embed";
private static final String METADATA = "metadata"; private static final String METADATA = "metadata";
private static final String EXTRACT_MAPPING = "extractMapping";
private static final String NAMESPACE_PROPERTY_PREFIX = "namespace.prefix."; private static final String NAMESPACE_PROPERTY_PREFIX = "namespace.prefix.";
private static final char NAMESPACE_PREFIX = ':'; private static final char NAMESPACE_PREFIX = ':';
@ -110,17 +111,18 @@ public abstract class AbstractMetadataExtractor
private static final ObjectMapper jsonObjectMapper = new ObjectMapper(); private static final ObjectMapper jsonObjectMapper = new ObjectMapper();
protected final Logger logger; protected final Logger logger;
private Map<String, Set<String>> extractMapping; private Map<String, Set<String>> defaultExtractMapping;
private ThreadLocal<Map<String, Set<String>>> extractMapping = new ThreadLocal<>();
private Map<String, Set<String>> embedMapping; private Map<String, Set<String>> embedMapping;
public AbstractMetadataExtractor(Logger logger) public AbstractMetadataExtractor(Logger logger)
{ {
this.logger = logger; this.logger = logger;
extractMapping = Collections.emptyMap(); defaultExtractMapping = Collections.emptyMap();
embedMapping = Collections.emptyMap(); embedMapping = Collections.emptyMap();
try try
{ {
extractMapping = buildExtractMapping(); defaultExtractMapping = buildExtractMapping();
embedMapping = buildEmbedMapping(); embedMapping = buildEmbedMapping();
} }
catch (Exception e) catch (Exception e)
@ -148,7 +150,7 @@ public abstract class AbstractMetadataExtractor
try try
{ {
TypeReference<HashMap<String, String>> typeRef = new TypeReference<HashMap<String, String>>() {}; TypeReference<HashMap<String, String>> typeRef = new TypeReference<>() {};
return jsonObjectMapper.readValue(metadataAsJson, typeRef); return jsonObjectMapper.readValue(metadataAsJson, typeRef);
} }
catch (JsonProcessingException e) catch (JsonProcessingException e)
@ -159,7 +161,7 @@ public abstract class AbstractMetadataExtractor
protected Map<String, Set<String>> getExtractMapping() protected Map<String, Set<String>> getExtractMapping()
{ {
return Collections.unmodifiableMap(extractMapping); return Collections.unmodifiableMap(extractMapping.get());
} }
public Map<String, Set<String>> getEmbedMapping() public Map<String, Set<String>> getEmbedMapping()
@ -432,7 +434,60 @@ public abstract class AbstractMetadataExtractor
return true; return true;
} }
/**
 * Extracts the metadata from {@code sourceFile}, maps it and writes the result to {@code targetFile}.
 * <p>
 * The {@code transformOptions} may contain a replacement set of mappings. If supplied, these are used in place
 * of the default mappings read from file.
 *
 * @param sourceMimetype   mimetype of the source, passed on to the three argument {@code extractMetadata}
 * @param transformOptions options for this request, optionally including the {@code extractMapping} override
 * @param sourceFile       file to extract the metadata from
 * @param targetFile       file the mapped metadata is written to
 * @throws Exception if the extraction, mapping or write fails
 */
public void extractMetadata(String sourceMimetype, Map<String, String> transformOptions, File sourceFile,
                            File targetFile) throws Exception
{
    Map<String, Set<String>> mapping = getExtractMappingFromOptions(transformOptions, defaultExtractMapping);

    // Use a ThreadLocal to avoid changing method signatures of methods that currently call getExtractMapping.
    try
    {
        extractMapping.set(mapping);
        Map<String, Serializable> metadata = extractMetadata(sourceMimetype, transformOptions, sourceFile);
        mapMetadataAndWrite(targetFile, metadata, mapping);
    }
    finally
    {
        // remove() rather than set(null): set(null) leaves an entry behind in the thread's ThreadLocal map,
        // which lingers on threads that are reused. A later get() still returns null either way.
        extractMapping.remove();
    }
}
/**
 * Works out which extract mapping to use for a request.
 *
 * @param transformOptions      options supplied with the request; the {@code extractMapping} entry, if present,
 *                              is a JSON object of property name to a set of mapped names
 * @param defaultExtractMapping mapping to use when the request does not supply a usable one
 * @return the mapping parsed from the request, or {@code defaultExtractMapping} if the option is absent or
 *         is the JSON literal {@code null}
 * @throws IllegalArgumentException if the option is present but cannot be parsed as JSON of the expected shape
 */
private Map<String, Set<String>> getExtractMappingFromOptions(Map<String, String> transformOptions, Map<String,
        Set<String>> defaultExtractMapping)
{
    String extractMappingOption = transformOptions.get(EXTRACT_MAPPING);
    if (extractMappingOption == null)
    {
        return defaultExtractMapping;
    }

    try
    {
        TypeReference<HashMap<String, Set<String>>> typeRef = new TypeReference<>() {};
        Map<String, Set<String>> requestedMapping = jsonObjectMapper.readValue(extractMappingOption, typeRef);
        // A JSON literal null parses to a Java null; fall back rather than let a null mapping
        // surface later as a NullPointerException.
        return requestedMapping != null ? requestedMapping : defaultExtractMapping;
    }
    catch (JsonProcessingException e)
    {
        throw new IllegalArgumentException("Failed to read "+ EXTRACT_MAPPING +" from request", e);
    }
}
/**
 * Maps and writes the supplied metadata by calling the overloaded method with the default
 * {@code extractMapping} ({@code defaultExtractMapping}); a mapping supplied in the transform options
 * is not consulted here.
 *
 * @param targetFile passed through unchanged to the overloaded method
 * @param metadata   passed through unchanged to the overloaded method
 * @throws IOException declared because the overloaded method declares it
 * @deprecated use {@link #extractMetadata(String, Map, File, File)} rather than calling this method.
 */
@Deprecated
public void mapMetadataAndWrite(File targetFile, Map<String, Serializable> metadata) throws IOException public void mapMetadataAndWrite(File targetFile, Map<String, Serializable> metadata) throws IOException
{
mapMetadataAndWrite(targetFile, metadata, defaultExtractMapping);
}
public void mapMetadataAndWrite(File targetFile, Map<String, Serializable> metadata,
Map<String, Set<String>> extractMapping) throws IOException
{ {
if (logger.isDebugEnabled()) if (logger.isDebugEnabled())
{ {
@ -440,17 +495,19 @@ public abstract class AbstractMetadataExtractor
metadata.forEach((k,v) -> logger.debug(" "+k+"="+v)); metadata.forEach((k,v) -> logger.debug(" "+k+"="+v));
} }
metadata = mapRawToSystem(metadata); metadata = mapRawToSystem(metadata, extractMapping);
writeMetadata(targetFile, metadata); writeMetadata(targetFile, metadata);
} }
/** /**
* Based on AbstractMappingMetadataExtracter#mapRawToSystem. * Based on AbstractMappingMetadataExtracter#mapRawToSystem.
* *
* @param rawMetadata Metadata keyed by document properties * @param rawMetadata Metadata keyed by document properties
* @return Returns the metadata keyed by the system properties * @param extractMapping Mapping between document and system properties
* @return Returns the metadata keyed by the system properties
*/ */
private Map<String, Serializable> mapRawToSystem(Map<String, Serializable> rawMetadata) private Map<String, Serializable> mapRawToSystem(Map<String, Serializable> rawMetadata,
Map<String, Set<String>> extractMapping)
{ {
boolean debugEnabled = logger.isDebugEnabled(); boolean debugEnabled = logger.isDebugEnabled();
if (debugEnabled) if (debugEnabled)

View File

@ -5,7 +5,7 @@
<parent> <parent>
<groupId>org.springframework.boot</groupId> <groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId> <artifactId>spring-boot-starter-parent</artifactId>
<version>2.3.1.RELEASE</version> <version>2.3.5.RELEASE</version>
<relativePath /> <relativePath />
</parent> </parent>
@ -24,9 +24,9 @@
<dependency.alfresco-jodconverter-core.version>3.0.1.1</dependency.alfresco-jodconverter-core.version> <dependency.alfresco-jodconverter-core.version>3.0.1.1</dependency.alfresco-jodconverter-core.version>
<env.project_version>${project.version}</env.project_version> <env.project_version>${project.version}</env.project_version>
<dependency.alfresco-transform-model.version>1.0.2.11</dependency.alfresco-transform-model.version> <dependency.alfresco-transform-model.version>1.0.2.11</dependency.alfresco-transform-model.version>
<dependency.activemq.version>5.15.9</dependency.activemq.version> <dependency.activemq.version>5.15.13</dependency.activemq.version>
<dependency.jackson.version>2.10.3</dependency.jackson.version> <dependency.jackson.version>2.10.3</dependency.jackson.version>
<dependency.cxf.version>3.3.5</dependency.cxf.version> <dependency.cxf.version>3.4.1</dependency.cxf.version>
<dependency.tika.version>1.24.1</dependency.tika.version> <dependency.tika.version>1.24.1</dependency.tika.version>
<dependency.poi.version>4.1.2</dependency.poi.version> <dependency.poi.version>4.1.2</dependency.poi.version>
<dependency.ooxml-schemas.version>1.4</dependency.ooxml-schemas.version> <dependency.ooxml-schemas.version>1.4</dependency.ooxml-schemas.version>