diff --git a/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/test/java/org/alfresco/transformer/AIOControllerTikaTest.java b/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/test/java/org/alfresco/transformer/AIOControllerTikaTest.java index 85600986..739e458d 100644 --- a/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/test/java/org/alfresco/transformer/AIOControllerTikaTest.java +++ b/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/test/java/org/alfresco/transformer/AIOControllerTikaTest.java @@ -75,7 +75,7 @@ public class AIOControllerTikaTest extends TikaControllerTest // Ignore the test in super class as the way the AIO transformer provides config is fundamentally different. } - + @Test @Override public void testGetInfoFromConfigWithNoTransformOptions() @@ -83,4 +83,12 @@ public class AIOControllerTikaTest extends TikaControllerTest // Ignore the test in super class as the way the AIO transformer provides config is fundamentally different. } + + @Test + @Override + public void xlsxEmbedTest() + { + // Ignore the test in super class as the way the AIO transformer provides config is fundamentally different. + // It uses the real class path rather than the test one. + } } \ No newline at end of file diff --git a/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/resources/tika_engine_config.json b/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/resources/tika_engine_config.json index 39f4a8c1..b6d534b7 100644 --- a/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/resources/tika_engine_config.json +++ b/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/resources/tika_engine_config.json @@ -13,6 +13,10 @@ ], "metadataOptions": [ {"value": {"name": "extractMapping"}} + ], + "metadataEmbedOptions": [ + {"value": {"name": "metadata", "required": true}}, + {"value": {"name": "targetEncoding"}} ] }, "transformers": [ @@ -981,6 +985,15 @@ "transformOptions": [ "metadataOptions" ] + }, + { + "transformerName": "SamplePoiMetadataEmbedder", + "supportedSourceAndTargetList": [ + {"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"} + ], + "transformOptions": [ + "metadataEmbedOptions" + ] } ] } \ No newline at end of file diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaControllerTest.java b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaControllerTest.java index be636556..062cc6c8 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaControllerTest.java +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaControllerTest.java @@ -2,7 +2,7 @@ * #%L * Alfresco Transform Core * %% - * Copyright (C) 2005 - 2020 Alfresco Software Limited + * Copyright (C) 2005 - 2021 Alfresco Software Limited * %% * This file is part of the Alfresco software. * - @@ -32,6 +32,8 @@ import org.alfresco.transformer.executors.RuntimeExec; import org.alfresco.transformer.model.FileRefEntity; import org.alfresco.transformer.model.FileRefResponse; import org.alfresco.transformer.probes.ProbeTestTransform; +import org.apache.poi.ooxml.POIXMLProperties; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; @@ -51,6 +53,7 @@ import org.springframework.test.web.servlet.request.MockHttpServletRequestBuilde import org.springframework.test.web.servlet.request.MockMvcRequestBuilders; import javax.servlet.http.HttpServletRequest; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.util.HashMap; @@ -79,6 +82,7 @@ import static org.alfresco.transformer.executors.Tika.XML; import static org.alfresco.transformer.executors.Tika.XSLX; import static org.alfresco.transformer.executors.Tika.ZIP; import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_HTML; +import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_METADATA_EMBED; import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_OPENXML_PRESENTATION; import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET; import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING; @@ -539,6 +543,43 @@ public class TikaControllerTest extends AbstractTransformerControllerTest EXPECTED_TEXT_CONTENT_CONTAINS); } + @Test + public void xlsxEmbedTest() throws Exception + { + mockTransformCommand(XSLX, XSLX, MIMETYPE_OPENXML_SPREADSHEET, false); + + String metadata = + "{\"{http://www.alfresco.org/model/content/1.0}author\":\"author1\"," + + "\"{http://www.alfresco.org/model/content/1.0}title\":\"title1\"," + + "\"{http://www.alfresco.org/model/content/1.0}description\":[\"desc1\",\"desc2\"]," + + "\"{http://www.alfresco.org/model/content/1.0}created\":\"created1\"}"; + + MockHttpServletRequestBuilder requestBuilder = + super.mockMvcRequest("/transform", sourceFile, + "targetExtension", XSLX, + "metadata", metadata, + "targetMimetype", MIMETYPE_METADATA_EMBED, + "sourceMimetype", MIMETYPE_OPENXML_SPREADSHEET); + + MvcResult result = mockMvc.perform(requestBuilder) + .andExpect(status().is(OK.value())) + .andExpect(header().string("Content-Disposition", + "attachment; filename*= UTF-8''quick." + targetExtension)). + andReturn(); + + byte[] bytes = result.getResponse().getContentAsByteArray(); + ByteArrayInputStream inputStream = new ByteArrayInputStream(bytes); + XSSFWorkbook workbook = new XSSFWorkbook(inputStream); + POIXMLProperties props = workbook.getProperties(); + POIXMLProperties.CoreProperties coreProp = props.getCoreProperties(); + POIXMLProperties.CustomProperties custProp = props.getCustomProperties(); + + assertEquals("author1", coreProp.getCreator()); + assertEquals("title1", coreProp.getTitle()); + assertEquals("desc1, desc2", coreProp.getDescription()); // multi value + assertEquals("created1", custProp.getProperty("created").getLpwstr()); + } + @Test public void pdfToTxtExtractBookmarksTest() throws Exception { diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/tika_engine_config.json b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/tika_engine_config.json index 39f4a8c1..b6d534b7 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/tika_engine_config.json +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/tika_engine_config.json @@ -13,6 +13,10 @@ ], "metadataOptions": [ {"value": {"name": "extractMapping"}} + ], + "metadataEmbedOptions": [ + {"value": {"name": "metadata", "required": true}}, + {"value": {"name": "targetEncoding"}} ] }, "transformers": [ @@ -981,6 +985,15 @@ "transformOptions": [ "metadataOptions" ] + }, + { + "transformerName": "SamplePoiMetadataEmbedder", + "supportedSourceAndTargetList": [ + {"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"} + ], + "transformOptions": [ + "metadataEmbedOptions" + ] } ] } \ No newline at end of file diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/executors/TikaJavaExecutor.java b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/executors/TikaJavaExecutor.java index 2faf0979..ad517c4c 100644 --- a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/executors/TikaJavaExecutor.java +++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/executors/TikaJavaExecutor.java @@ -2,7 +2,7 @@ * #%L * Alfresco Transform Core * %% - * Copyright (C) 2005 - 2020 Alfresco Software Limited + * Copyright (C) 2005 - 2021 Alfresco Software Limited * %% * This file is part of the Alfresco software. * - @@ -44,7 +44,6 @@ import org.xml.sax.SAXException; import java.io.File; import java.io.IOException; -import java.io.Serializable; import java.util.ArrayList; import java.util.Map; import java.util.StringJoiner; @@ -81,6 +80,7 @@ public class TikaJavaExecutor implements JavaExecutor .build(); private final Map metadataEmbedder = ImmutableMap .builder() + .put("SamplePoiMetadataEmbedder", new PoiMetadataExtractor()) .build(); public TikaJavaExecutor() @@ -119,7 +119,7 @@ public class TikaJavaExecutor implements JavaExecutor } @Override - public void call(File sourceFile, File targetFile, String... args) throws Exception + public void call(File sourceFile, File targetFile, String... args) { args = buildArgs(sourceFile, targetFile, args); tika.transform(args); diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractTikaMetadataExtractor.java b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractTikaMetadataExtractor.java index 9d341aba..4a78ae00 100644 --- a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractTikaMetadataExtractor.java +++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractTikaMetadataExtractor.java @@ -2,7 +2,7 @@ * #%L * Alfresco Transform Core * %% - * Copyright (C) 2005 - 2020 Alfresco Software Limited + * Copyright (C) 2005 - 2021 Alfresco Software Limited * %% * This file is part of the Alfresco software. * - @@ -53,6 +53,7 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.Serializable; import java.util.Arrays; +import java.util.Collection; import java.util.HashMap; import java.util.LinkedHashSet; import java.util.Locale; @@ -316,9 +317,7 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr return; } - Metadata metadataToEmbed = new Metadata(); - Map metadataAsStrings = getMetadata(transformOptions); - metadataAsStrings.forEach((k,v)->metadataToEmbed.add(k, v)); + Metadata metadataToEmbed = getTikaMetadata(transformOptions); try (InputStream inputStream = new FileInputStream(sourceFile); OutputStream outputStream = new FileOutputStream(targetFile)) @@ -327,6 +326,46 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr } } + private Metadata getTikaMetadata(Map transformOptions) + { + Metadata metadataToEmbed = new Metadata(); + Map properties = getMetadata(transformOptions); + for (String metadataKey : properties.keySet()) + { + Serializable value = properties.get(metadataKey); + if (value == null) + { + continue; + } + if (value instanceof Collection) + { + for (Object singleValue : (Collection) value) + { + try + { + metadataToEmbed.add(metadataKey, (String)singleValue); + } + catch (ClassCastException e) + { + logger.info("Could not convert " + metadataKey + ": " + e.getMessage()); + } + } + } + else + { + try + { + metadataToEmbed.add(metadataKey, (String)value); + } + catch (ClassCastException e) + { + logger.info("Could not convert " + metadataKey + ": " + e.getMessage()); + } + } + } + return metadataToEmbed; + } + private Serializable getMetadataValues(Metadata metadata, String key) { // Use Set to prevent duplicates. diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/PoiMetadataExtractor.java b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/PoiMetadataExtractor.java index 56981c8e..71cdd7a8 100644 --- a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/PoiMetadataExtractor.java +++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/PoiMetadataExtractor.java @@ -2,7 +2,7 @@ * #%L * Alfresco Transform Core * %% - * Copyright (C) 2005 - 2020 Alfresco Software Limited + * Copyright (C) 2005 - 2021 Alfresco Software Limited * %% * This file is part of the Alfresco software. * - @@ -26,11 +26,24 @@ */ package org.alfresco.transformer.metadataExtractors; +import org.apache.poi.ooxml.POIXMLProperties; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.apache.tika.embedder.Embedder; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Collections; +import java.util.Set; +import java.util.StringJoiner; + /** * POI-based metadata extractor for Office 07 documents. See http://poi.apache.org/ for information on POI. * @@ -46,6 +59,34 @@ import org.slf4j.LoggerFactory; * * Uses Apache Tika * + * Also includes a sample POI metadata embedder to demonstrate it is possible to add custom T-Engines that will add + * metadata. This is not production code so no supported mimetypes exist in the {@code tika_engine_config.json}. + * Adding the following would make it available: + * + *
+ * {
+ *   "transformOptions": {
+ *     ...
+ *     "metadataEmbedOptions": [
+ *       {"value": {"name": "metadata", "required": true}}
+ *     ]
+ *   },
+ *   "transformers": [
+ *     ...
+ *     {
+ *       "transformerName": "SamplePoiMetadataEmbedder",
+ *       "supportedSourceAndTargetList": [
+ *         ...
+ *         {"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"}
+ *       ],
+ *       "transformOptions": [
+ *         "metadataEmbedOptions"
+ *       ]
+ *     }
+ *   ]
+ * }
+ * 
+ * @author Nick Burch * @author Neil McErlean * @author Dmitry Velichkevich @@ -65,4 +106,70 @@ public class PoiMetadataExtractor extends AbstractTikaMetadataExtractor { return new OOXMLParser(); } + + @Override + protected Embedder getEmbedder() + { + return new SamplePoiEmbedder(); + } + + private static class SamplePoiEmbedder implements Embedder + { + private static final Set SUPPORTED_EMBED_TYPES = + Collections.singleton(MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")); + + @Override + public Set getSupportedEmbedTypes(ParseContext parseContext) + { + return SUPPORTED_EMBED_TYPES; + } + + @Override + public void embed(Metadata metadata, InputStream inputStream, OutputStream outputStream, ParseContext parseContext) + throws IOException + { + XSSFWorkbook workbook = new XSSFWorkbook(inputStream); + POIXMLProperties props = workbook.getProperties(); + + POIXMLProperties.CoreProperties coreProp = props.getCoreProperties(); + POIXMLProperties.CustomProperties custProp = props.getCustomProperties(); + + for (String name : metadata.names()) + { + metadata.isMultiValued("description"); + String value = null; + if (metadata.isMultiValued(name)) + { + String[] values = metadata.getValues(name); + StringJoiner sj = new StringJoiner(", "); + for (String s : values) + { + sj.add(s); + } + value = sj.toString(); + } + else + { + value = metadata.get(name); + } + switch (name) + { + case "author": + coreProp.setCreator(value); + break; + case "title": + coreProp.setTitle(value); + break; + case "description": + coreProp.setDescription(value); + break; + // There are other core values but this is sample code, so we will assume it is a custom value. + default: + custProp.addProperty(name, value); + break; + } + } + workbook.write(outputStream); + } + } } diff --git a/alfresco-transformer-base/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractMetadataExtractor.java b/alfresco-transformer-base/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractMetadataExtractor.java index c4589303..0c9e63d3 100644 --- a/alfresco-transformer-base/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractMetadataExtractor.java +++ b/alfresco-transformer-base/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractMetadataExtractor.java @@ -2,7 +2,7 @@ * #%L * Alfresco Transform Core * %% - * Copyright (C) 2005-2020 Alfresco Software Limited + * Copyright (C) 2005-2021 Alfresco Software Limited * %% * This file is part of the Alfresco software. * - @@ -140,7 +140,7 @@ public abstract class AbstractMetadataExtractor // Default nothing, as embedding is not supported in most cases } - protected Map getMetadata(Map transformOptions) + protected Map getMetadata(Map transformOptions) { String metadataAsJson = transformOptions.get(METADATA); if (metadataAsJson == null) @@ -150,8 +150,10 @@ public abstract class AbstractMetadataExtractor try { - TypeReference> typeRef = new TypeReference<>() {}; - return jsonObjectMapper.readValue(metadataAsJson, typeRef); + TypeReference> typeRef = new TypeReference<>() {}; + HashMap systemProperties = jsonObjectMapper.readValue(metadataAsJson, typeRef); + Map rawProperties = mapSystemToRaw(systemProperties); + return rawProperties; } catch (JsonProcessingException e) { @@ -159,6 +161,36 @@ public abstract class AbstractMetadataExtractor } } + private Map mapSystemToRaw(Map systemMetadata) + { + Map metadataProperties = new HashMap<>(systemMetadata.size() * 2 + 1); + for (Map.Entry entry : systemMetadata.entrySet()) + { + String modelProperty = entry.getKey(); + // Check if there is a mapping for this + if (!embedMapping.containsKey(modelProperty)) + { + // No mapping - ignore + continue; + } + Serializable documentValue = entry.getValue(); + Set metadataKeys = embedMapping.get(modelProperty); + for (String metadataKey : metadataKeys) + { + metadataProperties.put(metadataKey, documentValue); + } + } + // Done + if (logger.isDebugEnabled()) + { + logger.debug( + "Converted system model values to metadata values: \n" + + " System Properties: " + systemMetadata + "\n" + + " Metadata Properties: " + metadataProperties); + } + return metadataProperties; + } + protected Map> getExtractMapping() { return Collections.unmodifiableMap(extractMapping.get());