diff --git a/engines/aio/src/test/resources/tika_engine_config.json b/engines/aio/src/test/resources/tika_engine_config.json index b6d534b7..a038950c 100644 --- a/engines/aio/src/test/resources/tika_engine_config.json +++ b/engines/aio/src/test/resources/tika_engine_config.json @@ -987,7 +987,7 @@ ] }, { - "transformerName": "SamplePoiMetadataEmbedder", + "transformerName": "PoiMetadataEmbedder", "supportedSourceAndTargetList": [ {"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"} ], diff --git a/engines/base/src/main/java/org/alfresco/transform/base/TransformHandler.java b/engines/base/src/main/java/org/alfresco/transform/base/TransformHandler.java index 104f2c4b..885db426 100644 --- a/engines/base/src/main/java/org/alfresco/transform/base/TransformHandler.java +++ b/engines/base/src/main/java/org/alfresco/transform/base/TransformHandler.java @@ -189,7 +189,7 @@ public class TransformHandler MultipartFile sourceMultipartFile, String sourceMimetype, String targetMimetype, Map requestParameters) { - return createResponseEntity(targetMimetype, os -> + return createResponseEntity(sourceMimetype, targetMimetype, os -> { new TransformProcess(this, sourceMimetype, targetMimetype, requestParameters, "e" + httpRequestCount.getAndIncrement()) @@ -216,7 +216,7 @@ public class TransformHandler @Override protected OutputStream getOutputStream() { - return transformManager.setOutputStream(os); + return os; } @Override @@ -538,10 +538,10 @@ public class TransformHandler return customTransformer; } - private ResponseEntity createResponseEntity(String targetMimetype, + private ResponseEntity createResponseEntity(String sourceMimetype, String targetMimetype, StreamingResponseBody body) { - String extension = ExtensionService.getExtensionForMimetype(targetMimetype); + String extension = ExtensionService.getExtensionForTargetMimetype(targetMimetype, sourceMimetype); HttpHeaders headers = new 
HttpHeaders(); headers.setContentDisposition( ContentDisposition.attachment() diff --git a/engines/base/src/main/java/org/alfresco/transform/base/TransformProcess.java b/engines/base/src/main/java/org/alfresco/transform/base/TransformProcess.java index 57736d39..9898d7cd 100644 --- a/engines/base/src/main/java/org/alfresco/transform/base/TransformProcess.java +++ b/engines/base/src/main/java/org/alfresco/transform/base/TransformProcess.java @@ -36,6 +36,7 @@ import org.springframework.web.multipart.MultipartFile; import javax.jms.Destination; import javax.servlet.http.HttpServletRequest; import java.io.File; +import java.io.OutputStream; import java.util.Map; import static org.springframework.http.HttpStatus.INTERNAL_SERVER_ERROR; @@ -70,7 +71,6 @@ abstract class TransformProcess extends TransformStreamHandler transformHandler.getProbeTransform().incrementTransformerCount(); } - @Override public void handleTransformRequest() { transformManager.setSourceMimetype(sourceMimetype); diff --git a/engines/base/src/main/java/org/alfresco/transform/base/metadataExtractors/AbstractMetadataExtractor.java b/engines/base/src/main/java/org/alfresco/transform/base/metadataExtractors/AbstractMetadataExtractor.java index 7d765f92..10571fb6 100644 --- a/engines/base/src/main/java/org/alfresco/transform/base/metadataExtractors/AbstractMetadataExtractor.java +++ b/engines/base/src/main/java/org/alfresco/transform/base/metadataExtractors/AbstractMetadataExtractor.java @@ -374,6 +374,8 @@ public abstract class AbstractMetadataExtractor implements CustomTransformer String className = this.getClass().getName(); String shortClassName = className.split("\\.")[className.split("\\.").length - 1]; shortClassName = shortClassName.replace('$', '-'); + // The embedder uses the reverse of the extractor's data. 
+ shortClassName = shortClassName.replace("Embedder", "Extractor"); return shortClassName + "_metadata_" + suffix + ".properties"; } diff --git a/engines/tika/src/main/java/org/alfresco/transform/tika/metadataExtractors/PoiMetadataEmbedder.java b/engines/tika/src/main/java/org/alfresco/transform/tika/metadataExtractors/PoiMetadataEmbedder.java new file mode 100644 index 00000000..8102547d --- /dev/null +++ b/engines/tika/src/main/java/org/alfresco/transform/tika/metadataExtractors/PoiMetadataEmbedder.java @@ -0,0 +1,165 @@ +/* + * #%L + * Alfresco Transform Core + * %% + * Copyright (C) 2005 - 2022 Alfresco Software Limited + * %% + * This file is part of the Alfresco software. + * - + * If the software was purchased under a paid Alfresco license, the terms of + * the paid license agreement will prevail. Otherwise, the software is + * provided under the following open source license terms: + * - + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * - + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * - + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see <http://www.gnu.org/licenses/>. 
+ * #L% + */ +package org.alfresco.transform.tika.metadataExtractors; + +import org.apache.poi.ooxml.POIXMLProperties; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.apache.tika.embedder.Embedder; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.stereotype.Component; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Collections; +import java.util.Set; +import java.util.StringJoiner; + +import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR; + +/** + * Sample POI metadata embedder to demonstrate it is possible to add custom T-Engines that will add + * metadata. This is not production code, so no supported mimetypes exist in the {@code tika_engine_config.json}. + * Adding the following would make it available: + * + *
+ * {
+ *   "transformOptions": {
+ *     ...
+ *     "metadataEmbedOptions": [
+ *       {"value": {"name": "metadata", "required": true}}
+ *     ]
+ *   },
+ *   "transformers": [
+ *     ...
+ *     {
+ *       "transformerName": "PoiMetadataEmbedder",
+ *       "supportedSourceAndTargetList": [
+ *         ...
+ *         {"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"}
+ *       ],
+ *       "transformOptions": [
+ *         "metadataEmbedOptions"
+ *       ]
+ *     }
+ *   ]
+ * }
+ * 
+
+ * @author Nick Burch
+ * @author Neil McErlean
+ * @author Dmitry Velichkevich
+ * @author adavis
+ */
+@Component
+public class PoiMetadataEmbedder extends AbstractTikaMetadataExtractor
+{
+    private static final Logger logger = LoggerFactory.getLogger(PoiMetadataEmbedder.class);
+
+    // NOTE(review): registered as Type.EXTRACTOR although this class is an embedder - confirm this is intended.
+    public PoiMetadataEmbedder()
+    {
+        super(EXTRACTOR, logger);
+    }
+
+    @Override
+    protected Parser getParser()
+    {
+        return new OOXMLParser();
+    }
+
+    @Override
+    protected Embedder getEmbedder()
+    {
+        return new SamplePoiEmbedder();
+    }
+
+    /**
+     * Sample Tika {@link Embedder} that uses POI to set the document properties of an xlsx workbook.
+     */
+    private static class SamplePoiEmbedder implements Embedder
+    {
+        private static final Set<MediaType> SUPPORTED_EMBED_TYPES =
+                Collections.singleton(MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
+
+        @Override
+        public Set<MediaType> getSupportedEmbedTypes(ParseContext parseContext)
+        {
+            return SUPPORTED_EMBED_TYPES;
+        }
+
+        /**
+         * Writes the source workbook to {@code outputStream} with the supplied metadata applied.
+         * The names {@code author}, {@code title} and {@code description} are set as core properties;
+         * every other name is added as a custom property.
+         */
+        @Override
+        public void embed(Metadata metadata, InputStream inputStream, OutputStream outputStream, ParseContext parseContext)
+                throws IOException
+        {
+            // try-with-resources closes the workbook (and the package POI opened for it) even when
+            // applying the metadata or writing the output fails.
+            try (XSSFWorkbook workbook = new XSSFWorkbook(inputStream))
+            {
+                POIXMLProperties props = workbook.getProperties();
+                POIXMLProperties.CoreProperties coreProp = props.getCoreProperties();
+                POIXMLProperties.CustomProperties custProp = props.getCustomProperties();
+
+                for (String name : metadata.names())
+                {
+                    // Multi-valued properties are flattened into a single comma separated value.
+                    String value = metadata.isMultiValued(name)
+                            ? String.join(", ", metadata.getValues(name))
+                            : metadata.get(name);
+                    switch (name)
+                    {
+                        case "author":
+                            coreProp.setCreator(value);
+                            break;
+                        case "title":
+                            coreProp.setTitle(value);
+                            break;
+                        case "description":
+                            coreProp.setDescription(value);
+                            break;
+                        // There are other core values but this is sample code, so we will assume it is a custom value.
+                        default:
+                            custProp.addProperty(name, value);
+                            break;
+                    }
+                }
+                workbook.write(outputStream);
+            }
+        }
+    }
+}
diff --git a/engines/tika/src/main/java/org/alfresco/transform/tika/metadataExtractors/PoiMetadataExtractor.java b/engines/tika/src/main/java/org/alfresco/transform/tika/metadataExtractors/PoiMetadataExtractor.java
index c110c3d5..9872ee4e 100644
--- a/engines/tika/src/main/java/org/alfresco/transform/tika/metadataExtractors/PoiMetadataExtractor.java
+++ b/engines/tika/src/main/java/org/alfresco/transform/tika/metadataExtractors/PoiMetadataExtractor.java
@@ -59,36 +59,6 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
  * created: -- cm:created
  * Any custom property: -- [not mapped]
  *
- *
- * Uses Apache Tika
- *
- * Also includes a sample POI metadata embedder to demonstrate it is possible to add custom T-Engines that will add
- * metadata. This is not production code so no supported mimetypes exist in the {@code tika_engine_config.json}.
- * Adding the following would make it available:
- *
- *
- * {
- *   "transformOptions": {
- *     ...
- *     "metadataEmbedOptions": [
- *       {"value": {"name": "metadata", "required": true}}
- *     ]
- *   },
- *   "transformers": [
- *     ...
- *     {
- *       "transformerName": "SamplePoiMetadataEmbedder",
- *       "supportedSourceAndTargetList": [
- *         ...
- *         {"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"}
- *       ],
- *       "transformOptions": [
- *         "metadataEmbedOptions"
- *       ]
- *     }
- *   ]
- * }
- * 
* @author Nick Burch * @author Neil McErlean @@ -110,70 +80,4 @@ public class PoiMetadataExtractor extends AbstractTikaMetadataExtractor { return new OOXMLParser(); } - - @Override - protected Embedder getEmbedder() - { - return new SamplePoiEmbedder(); - } - - private static class SamplePoiEmbedder implements Embedder - { - private static final Set SUPPORTED_EMBED_TYPES = - Collections.singleton(MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")); - - @Override - public Set getSupportedEmbedTypes(ParseContext parseContext) - { - return SUPPORTED_EMBED_TYPES; - } - - @Override - public void embed(Metadata metadata, InputStream inputStream, OutputStream outputStream, ParseContext parseContext) - throws IOException - { - XSSFWorkbook workbook = new XSSFWorkbook(inputStream); - POIXMLProperties props = workbook.getProperties(); - - POIXMLProperties.CoreProperties coreProp = props.getCoreProperties(); - POIXMLProperties.CustomProperties custProp = props.getCustomProperties(); - - for (String name : metadata.names()) - { - metadata.isMultiValued("description"); - String value = null; - if (metadata.isMultiValued(name)) - { - String[] values = metadata.getValues(name); - StringJoiner sj = new StringJoiner(", "); - for (String s : values) - { - sj.add(s); - } - value = sj.toString(); - } - else - { - value = metadata.get(name); - } - switch (name) - { - case "author": - coreProp.setCreator(value); - break; - case "title": - coreProp.setTitle(value); - break; - case "description": - coreProp.setDescription(value); - break; - // There are other core values but this is sample code, so we will assume it is a custom value. 
- default: - custProp.addProperty(name, value); - break; - } - } - workbook.write(outputStream); - } - } } diff --git a/engines/tika/src/main/java/org/alfresco/transform/tika/transformers/Tika.java b/engines/tika/src/main/java/org/alfresco/transform/tika/transformers/Tika.java index 2fa22ad6..ef2459f1 100644 --- a/engines/tika/src/main/java/org/alfresco/transform/tika/transformers/Tika.java +++ b/engines/tika/src/main/java/org/alfresco/transform/tika/transformers/Tika.java @@ -54,14 +54,12 @@ import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; -import java.io.BufferedInputStream; import java.io.BufferedWriter; -import java.io.FileInputStream; -import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; import java.io.Writer; import java.net.URL; import java.util.List; @@ -103,7 +101,7 @@ public class Tika public static final String PPTX = "pptx"; public static final String TXT = "txt"; public static final String XHTML = "xhtml"; - public static final String XSLX = "xslx"; + public static final String XLSX = "xlsx"; public static final String XML = "xml"; public static final String ZIP = "zip"; @@ -236,6 +234,10 @@ public class Tika parser.parse(inputStream, handler, metadata, context); } + catch (UnsupportedEncodingException e) + { + throw new IllegalStateException("Unsupported encoding "+e.getMessage(), e); + } catch (SAXException | TikaException | IOException e) { throw new IllegalStateException(e.getMessage(), e); diff --git a/engines/tika/src/test/java/org/alfresco/transform/tika/TikaTest.java b/engines/tika/src/test/java/org/alfresco/transform/tika/TikaTest.java index fe05c669..b31b441f 100644 --- 
a/engines/tika/src/test/java/org/alfresco/transform/tika/TikaTest.java +++ b/engines/tika/src/test/java/org/alfresco/transform/tika/TikaTest.java @@ -34,11 +34,9 @@ import org.alfresco.transform.client.model.TransformReply; import org.alfresco.transform.client.model.TransformRequest; import org.apache.poi.ooxml.POIXMLProperties; import org.apache.poi.xssf.usermodel.XSSFWorkbook; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.mockito.Mock; -import org.mockito.stubbing.Answer; import org.springframework.core.io.FileSystemResource; import org.springframework.core.io.Resource; import org.springframework.http.HttpHeaders; @@ -53,10 +51,8 @@ import javax.servlet.http.HttpServletRequest; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; -import java.util.Map; import java.util.UUID; -import static java.nio.file.Files.readAllBytes; import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML; import static org.alfresco.transform.common.Mimetype.MIMETYPE_METADATA_EMBED; import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENXML_PRESENTATION; @@ -91,14 +87,13 @@ import static org.alfresco.transform.tika.transformers.Tika.TIKA_AUTO; import static org.alfresco.transform.tika.transformers.Tika.TXT; import static org.alfresco.transform.tika.transformers.Tika.XHTML; import static org.alfresco.transform.tika.transformers.Tika.XML; -import static org.alfresco.transform.tika.transformers.Tika.XSLX; +import static org.alfresco.transform.tika.transformers.Tika.XLSX; import static org.alfresco.transform.tika.transformers.Tika.ZIP; import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.any; -import static 
org.mockito.ArgumentMatchers.anyLong; import static org.mockito.Mockito.when; import static org.springframework.http.HttpHeaders.ACCEPT; import static org.springframework.http.HttpHeaders.CONTENT_DISPOSITION; @@ -109,7 +104,8 @@ import static org.springframework.http.HttpStatus.OK; import static org.springframework.http.MediaType.APPLICATION_JSON_VALUE; import static org.springframework.http.MediaType.APPLICATION_PDF_VALUE; import static org.springframework.http.MediaType.TEXT_PLAIN_VALUE; -import static org.springframework.util.StringUtils.getFilenameExtension; +import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.asyncDispatch; +import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.request; /** * Test Tika. @@ -179,7 +175,11 @@ public class TikaTest extends AbstractBaseTest "targetExtension", this.targetExtension) : mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile, "targetExtension", this.targetExtension, INCLUDE_CONTENTS, includeContents.toString()); - MvcResult result = mockMvc.perform(requestBuilder) + MvcResult mvcResult = mockMvc.perform(requestBuilder) + .andExpect(request().asyncStarted()) + .andReturn(); + + MvcResult result = mockMvc.perform(asyncDispatch(mvcResult)) .andExpect(MockMvcResultMatchers.status().is(OK.value())) .andExpect(MockMvcResultMatchers.header().string("Content-Disposition", "attachment; filename*=UTF-8''transform." + this.targetExtension)). 
@@ -252,9 +252,17 @@ public class TikaTest extends AbstractBaseTest { mockTransformCommand(PDF, TXT, MIMETYPE_PDF, true); targetEncoding = "rubbish"; - mockMvc.perform( +// mockMvc.perform( +// mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile, "targetExtension", targetExtension)) +// .andExpect(MockMvcResultMatchers.status().is(INTERNAL_SERVER_ERROR.value())); + + MvcResult mvcResult = mockMvc.perform( mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile, "targetExtension", targetExtension)) - .andExpect(MockMvcResultMatchers.status().is(INTERNAL_SERVER_ERROR.value())); + .andExpect(request().asyncStarted()) + .andReturn(); + + mockMvc.perform(asyncDispatch(mvcResult)) + .andExpect(MockMvcResultMatchers.status().is(INTERNAL_SERVER_ERROR.value())); } // --- Archive --- @@ -381,7 +389,7 @@ public class TikaTest extends AbstractBaseTest @Test public void xslxToCsvPoiTest() throws Exception { - transform(POI, XSLX, CSV, MIMETYPE_OPENXML_SPREADSHEET, MIMETYPE_TEXT_CSV, null, + transform(POI, XLSX, CSV, MIMETYPE_OPENXML_SPREADSHEET, MIMETYPE_TEXT_CSV, null, EXPECTED_CSV_CONTENT_CONTAINS); } @@ -429,7 +437,7 @@ public class TikaTest extends AbstractBaseTest @Test public void xlsxEmbedTest() throws Exception { - mockTransformCommand(XSLX, XSLX, MIMETYPE_OPENXML_SPREADSHEET, false); + mockTransformCommand(XLSX, XLSX, MIMETYPE_OPENXML_SPREADSHEET, false); String metadata = "{\"{http://www.alfresco.org/model/content/1.0}author\":\"author1\"," + @@ -439,12 +447,16 @@ public class TikaTest extends AbstractBaseTest MockHttpServletRequestBuilder requestBuilder = super.mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile, - "targetExtension", XSLX, + "targetExtension", XLSX, "metadata", metadata, "targetMimetype", MIMETYPE_METADATA_EMBED, "sourceMimetype", MIMETYPE_OPENXML_SPREADSHEET); - MvcResult result = mockMvc.perform(requestBuilder) + MvcResult mvcResult = mockMvc.perform(requestBuilder) + .andExpect(request().asyncStarted()) + .andReturn(); + + MvcResult result = 
mockMvc.perform(asyncDispatch(mvcResult)) .andExpect(MockMvcResultMatchers.status().is(OK.value())) .andExpect(MockMvcResultMatchers.header().string("Content-Disposition", "attachment; filename*=UTF-8''transform." + targetExtension)). diff --git a/engines/tika/src/test/java/org/alfresco/transform/tika/TikaTransformationIT.java b/engines/tika/src/test/java/org/alfresco/transform/tika/TikaTransformationIT.java index dfbbc29d..10153ecb 100644 --- a/engines/tika/src/test/java/org/alfresco/transform/tika/TikaTransformationIT.java +++ b/engines/tika/src/test/java/org/alfresco/transform/tika/TikaTransformationIT.java @@ -152,7 +152,7 @@ public class TikaTransformationIT allTargets("quick.txt", "text/plain"), allTargets("quick.vsd", "application/vnd.visio"), allTargets("quick.xls", "application/vnd.ms-excel"), - allTargets("quick.xslx", + allTargets("quick.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), allTargets("quick.zip", "application/zip"), allTargets("quick.tar", "application/x-tar"), diff --git a/engines/tika/src/test/resources/quick.xslx b/engines/tika/src/test/resources/quick.xslx deleted file mode 100644 index 2e1f271e..00000000 Binary files a/engines/tika/src/test/resources/quick.xslx and /dev/null differ diff --git a/engines/tika/src/test/resources/tika_engine_config.json b/engines/tika/src/test/resources/tika_engine_config.json index b6d534b7..a038950c 100644 --- a/engines/tika/src/test/resources/tika_engine_config.json +++ b/engines/tika/src/test/resources/tika_engine_config.json @@ -987,7 +987,7 @@ ] }, { - "transformerName": "SamplePoiMetadataEmbedder", + "transformerName": "PoiMetadataEmbedder", "supportedSourceAndTargetList": [ {"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"} ], diff --git a/model/src/main/java/org/alfresco/transform/common/ExtensionService.java b/model/src/main/java/org/alfresco/transform/common/ExtensionService.java index 
23d03d06..8f4fc536 100644 --- a/model/src/main/java/org/alfresco/transform/common/ExtensionService.java +++ b/model/src/main/java/org/alfresco/transform/common/ExtensionService.java @@ -58,6 +58,7 @@ import static org.alfresco.transform.common.Mimetype.MIMETYPE_VISIO_2013; import static org.alfresco.transform.common.Mimetype.MIMETYPE_WORD; import static org.alfresco.transform.common.Mimetype.MIMETYPE_WORDPERFECT; import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN; +import static org.alfresco.transform.common.Mimetype.MIMETYPE_XHTML; import static org.alfresco.transform.common.TransformerDebug.MIMETYPE_METADATA_EMBED; import static org.alfresco.transform.common.TransformerDebug.MIMETYPE_METADATA_EXTRACT; @@ -116,7 +117,8 @@ public class ExtensionService Map.entry(MIMETYPE_DITA, "dita"), Map.entry(MIMETYPE_OPENXML_SPREADSHEET_TEMPLATE, "xltx"), Map.entry(MIMETYPE_IMAGE_SVG, "svg"), - Map.entry(MIMETYPE_TEXT_PLAIN, "txt") + Map.entry(MIMETYPE_TEXT_PLAIN, "txt"), + Map.entry(MIMETYPE_XHTML, "xhtml") ); public static String getExtensionForTargetMimetype(String targetMimetype, String sourceMimetype)