Save point: [skip ci]

* TikaTests
This commit is contained in:
alandavis
2022-07-25 17:20:13 +01:00
parent 17b294b8a0
commit e06989e544
12 changed files with 210 additions and 123 deletions

View File

@@ -987,7 +987,7 @@
]
},
{
"transformerName": "SamplePoiMetadataEmbedder",
"transformerName": "PoiMetadataEmbedder",
"supportedSourceAndTargetList": [
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"}
],

View File

@@ -189,7 +189,7 @@ public class TransformHandler
MultipartFile sourceMultipartFile, String sourceMimetype, String targetMimetype,
Map<String, String> requestParameters)
{
return createResponseEntity(targetMimetype, os ->
return createResponseEntity(sourceMimetype, targetMimetype, os ->
{
new TransformProcess(this, sourceMimetype, targetMimetype, requestParameters,
"e" + httpRequestCount.getAndIncrement())
@@ -216,7 +216,7 @@ public class TransformHandler
@Override
protected OutputStream getOutputStream()
{
return transformManager.setOutputStream(os);
return os;
}
@Override
@@ -538,10 +538,10 @@ public class TransformHandler
return customTransformer;
}
private ResponseEntity<StreamingResponseBody> createResponseEntity(String targetMimetype,
private ResponseEntity<StreamingResponseBody> createResponseEntity(String sourceMimetype, String targetMimetype,
StreamingResponseBody body)
{
String extension = ExtensionService.getExtensionForMimetype(targetMimetype);
String extension = ExtensionService.getExtensionForTargetMimetype(targetMimetype, sourceMimetype);
HttpHeaders headers = new HttpHeaders();
headers.setContentDisposition(
ContentDisposition.attachment()

View File

@@ -36,6 +36,7 @@ import org.springframework.web.multipart.MultipartFile;
import javax.jms.Destination;
import javax.servlet.http.HttpServletRequest;
import java.io.File;
import java.io.OutputStream;
import java.util.Map;
import static org.springframework.http.HttpStatus.INTERNAL_SERVER_ERROR;
@@ -70,7 +71,6 @@ abstract class TransformProcess extends TransformStreamHandler
transformHandler.getProbeTransform().incrementTransformerCount();
}
@Override
public void handleTransformRequest()
{
transformManager.setSourceMimetype(sourceMimetype);

View File

@@ -374,6 +374,8 @@ public abstract class AbstractMetadataExtractor implements CustomTransformer
String className = this.getClass().getName();
String shortClassName = className.split("\\.")[className.split("\\.").length - 1];
shortClassName = shortClassName.replace('$', '-');
// The embedder uses the reverse of the extractor's data.
shortClassName = shortClassName.replace("Embedder", "Extractor");
return shortClassName + "_metadata_" + suffix + ".properties";
}

View File

@@ -0,0 +1,165 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.metadataExtractors;
import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.tika.embedder.Embedder;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Collections;
import java.util.Set;
import java.util.StringJoiner;
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
/**
* Sample POI metadata embedder to demonstrate it is possible to add custom T-Engines that will add
* metadata. This is not production code, so no supported mimetypes exist in the {@code tika_engine_config.json}.
* Adding the following would make it available:
*
* <pre>
* {
* "transformOptions": {
* ...
* "metadataEmbedOptions": [
* {"value": {"name": "metadata", "required": true}}
* ]
* },
* "transformers": [
* ...
* {
* "transformerName": "PoiMetadataEmbedder",
* "supportedSourceAndTargetList": [
* ...
* {"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"}
* ],
* "transformOptions": [
* "metadataEmbedOptions"
* ]
* }
* ]
* }
* </pre>
* @author Nick Burch
* @author Neil McErlean
* @author Dmitry Velichkevich
* @author adavis
*/
@Component
public class PoiMetadataEmbedder extends AbstractTikaMetadataExtractor
{
    private static final Logger logger = LoggerFactory.getLogger(PoiMetadataEmbedder.class);

    /**
     * Creates the embedder, handing this class's logger to the base class.
     */
    public PoiMetadataEmbedder()
    {
        // NOTE(review): EXTRACTOR is passed although this class is an embedder - confirm this is
        // intentional and that the base class does not expect a dedicated embedder type here.
        super(EXTRACTOR, logger);
    }

    /**
     * @return a new {@link OOXMLParser} instance on every call.
     */
    @Override
    protected Parser getParser()
    {
        return new OOXMLParser();
    }

    /**
     * @return a new sample embedder that writes metadata into xlsx workbooks.
     */
    @Override
    protected Embedder getEmbedder()
    {
        return new SamplePoiEmbedder();
    }

    /**
     * Sample {@link Embedder} that copies Tika {@link Metadata} values into the core and custom
     * properties of an xlsx workbook using POI.
     */
    private static class SamplePoiEmbedder implements Embedder
    {
        private static final Set<MediaType> SUPPORTED_EMBED_TYPES =
                Collections.singleton(MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"));

        /**
         * @param parseContext not used.
         * @return the single supported media type (xlsx).
         */
        @Override
        public Set<MediaType> getSupportedEmbedTypes(ParseContext parseContext)
        {
            return SUPPORTED_EMBED_TYPES;
        }

        /**
         * Reads a workbook from {@code inputStream}, sets its properties from {@code metadata} and writes the
         * modified workbook to {@code outputStream}. The {@code author}, {@code title} and {@code description}
         * names are written as core properties; every other name is added as a custom property.
         *
         * @param metadata     names and values to embed. Multiple values for a name are joined with {@code ", "}.
         * @param inputStream  the source xlsx content.
         * @param outputStream receives the workbook with the embedded metadata. It is not closed by this method.
         * @param parseContext not used.
         * @throws IOException if the workbook cannot be read, written or closed.
         */
        @Override
        public void embed(Metadata metadata, InputStream inputStream, OutputStream outputStream, ParseContext parseContext)
                throws IOException
        {
            // try-with-resources so the workbook (and the package it holds) is released even if a
            // property update or the write fails.
            try (XSSFWorkbook workbook = new XSSFWorkbook(inputStream))
            {
                POIXMLProperties props = workbook.getProperties();
                POIXMLProperties.CoreProperties coreProp = props.getCoreProperties();
                POIXMLProperties.CustomProperties custProp = props.getCustomProperties();

                for (String name : metadata.names())
                {
                    // Workbook properties hold a single string, so multiple values are flattened.
                    String value = metadata.isMultiValued(name)
                            ? String.join(", ", metadata.getValues(name))
                            : metadata.get(name);

                    switch (name)
                    {
                        case "author":
                            coreProp.setCreator(value);
                            break;
                        case "title":
                            coreProp.setTitle(value);
                            break;
                        case "description":
                            coreProp.setDescription(value);
                            break;
                        // There are other core values but this is sample code, so we will assume it is a custom value.
                        default:
                            // NOTE(review): addProperty may reject a name that already exists in the
                            // source workbook - confirm against the POI documentation.
                            custProp.addProperty(name, value);
                            break;
                    }
                }

                workbook.write(outputStream);
            }
        }
    }
}

View File

@@ -59,36 +59,6 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
* <b>created:</b> -- cm:created
* <b>Any custom property:</b> -- [not mapped]
* </pre>
*
* Uses Apache Tika
*
* Also includes a sample POI metadata embedder to demonstrate it is possible to add custom T-Engines that will add
* metadata. This is not production code so no supported mimetypes exist in the {@code tika_engine_config.json}.
* Adding the following would make it available:
*
* <pre>
* {
* "transformOptions": {
* ...
* "metadataEmbedOptions": [
* {"value": {"name": "metadata", "required": true}}
* ]
* },
* "transformers": [
* ...
* {
* "transformerName": "SamplePoiMetadataEmbedder",
* "supportedSourceAndTargetList": [
* ...
* {"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"}
* ],
* "transformOptions": [
* "metadataEmbedOptions"
* ]
* }
* ]
* }
* </pre>
* @author Nick Burch
* @author Neil McErlean
@@ -110,70 +80,4 @@ public class PoiMetadataExtractor extends AbstractTikaMetadataExtractor
{
return new OOXMLParser();
}
@Override
protected Embedder getEmbedder()
{
return new SamplePoiEmbedder();
}
private static class SamplePoiEmbedder implements Embedder
{
private static final Set<MediaType> SUPPORTED_EMBED_TYPES =
Collections.singleton(MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
@Override
public Set<MediaType> getSupportedEmbedTypes(ParseContext parseContext)
{
return SUPPORTED_EMBED_TYPES;
}
@Override
public void embed(Metadata metadata, InputStream inputStream, OutputStream outputStream, ParseContext parseContext)
throws IOException
{
XSSFWorkbook workbook = new XSSFWorkbook(inputStream);
POIXMLProperties props = workbook.getProperties();
POIXMLProperties.CoreProperties coreProp = props.getCoreProperties();
POIXMLProperties.CustomProperties custProp = props.getCustomProperties();
for (String name : metadata.names())
{
metadata.isMultiValued("description");
String value = null;
if (metadata.isMultiValued(name))
{
String[] values = metadata.getValues(name);
StringJoiner sj = new StringJoiner(", ");
for (String s : values)
{
sj.add(s);
}
value = sj.toString();
}
else
{
value = metadata.get(name);
}
switch (name)
{
case "author":
coreProp.setCreator(value);
break;
case "title":
coreProp.setTitle(value);
break;
case "description":
coreProp.setDescription(value);
break;
// There are other core values but this is sample code, so we will assume it is a custom value.
default:
custProp.addProperty(name, value);
break;
}
}
workbook.write(outputStream);
}
}
}

View File

@@ -54,14 +54,12 @@ import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.URL;
import java.util.List;
@@ -103,7 +101,7 @@ public class Tika
public static final String PPTX = "pptx";
public static final String TXT = "txt";
public static final String XHTML = "xhtml";
public static final String XSLX = "xslx";
public static final String XLSX = "xlsx";
public static final String XML = "xml";
public static final String ZIP = "zip";
@@ -236,6 +234,10 @@ public class Tika
parser.parse(inputStream, handler, metadata, context);
}
catch (UnsupportedEncodingException e)
{
throw new IllegalStateException("Unsupported encoding "+e.getMessage(), e);
}
catch (SAXException | TikaException | IOException e)
{
throw new IllegalStateException(e.getMessage(), e);

View File

@@ -34,11 +34,9 @@ import org.alfresco.transform.client.model.TransformReply;
import org.alfresco.transform.client.model.TransformRequest;
import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mock;
import org.mockito.stubbing.Answer;
import org.springframework.core.io.FileSystemResource;
import org.springframework.core.io.Resource;
import org.springframework.http.HttpHeaders;
@@ -53,10 +51,8 @@ import javax.servlet.http.HttpServletRequest;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.Map;
import java.util.UUID;
import static java.nio.file.Files.readAllBytes;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_METADATA_EMBED;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENXML_PRESENTATION;
@@ -91,14 +87,13 @@ import static org.alfresco.transform.tika.transformers.Tika.TIKA_AUTO;
import static org.alfresco.transform.tika.transformers.Tika.TXT;
import static org.alfresco.transform.tika.transformers.Tika.XHTML;
import static org.alfresco.transform.tika.transformers.Tika.XML;
import static org.alfresco.transform.tika.transformers.Tika.XSLX;
import static org.alfresco.transform.tika.transformers.Tika.XLSX;
import static org.alfresco.transform.tika.transformers.Tika.ZIP;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyLong;
import static org.mockito.Mockito.when;
import static org.springframework.http.HttpHeaders.ACCEPT;
import static org.springframework.http.HttpHeaders.CONTENT_DISPOSITION;
@@ -109,7 +104,8 @@ import static org.springframework.http.HttpStatus.OK;
import static org.springframework.http.MediaType.APPLICATION_JSON_VALUE;
import static org.springframework.http.MediaType.APPLICATION_PDF_VALUE;
import static org.springframework.http.MediaType.TEXT_PLAIN_VALUE;
import static org.springframework.util.StringUtils.getFilenameExtension;
import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.asyncDispatch;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.request;
/**
* Test Tika.
@@ -179,7 +175,11 @@ public class TikaTest extends AbstractBaseTest
"targetExtension", this.targetExtension)
: mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile,
"targetExtension", this.targetExtension, INCLUDE_CONTENTS, includeContents.toString());
MvcResult result = mockMvc.perform(requestBuilder)
MvcResult mvcResult = mockMvc.perform(requestBuilder)
.andExpect(request().asyncStarted())
.andReturn();
MvcResult result = mockMvc.perform(asyncDispatch(mvcResult))
.andExpect(MockMvcResultMatchers.status().is(OK.value()))
.andExpect(MockMvcResultMatchers.header().string("Content-Disposition",
"attachment; filename*=UTF-8''transform." + this.targetExtension)).
@@ -252,8 +252,16 @@ public class TikaTest extends AbstractBaseTest
{
mockTransformCommand(PDF, TXT, MIMETYPE_PDF, true);
targetEncoding = "rubbish";
mockMvc.perform(
// mockMvc.perform(
// mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile, "targetExtension", targetExtension))
// .andExpect(MockMvcResultMatchers.status().is(INTERNAL_SERVER_ERROR.value()));
MvcResult mvcResult = mockMvc.perform(
mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile, "targetExtension", targetExtension))
.andExpect(request().asyncStarted())
.andReturn();
mockMvc.perform(asyncDispatch(mvcResult))
.andExpect(MockMvcResultMatchers.status().is(INTERNAL_SERVER_ERROR.value()));
}
@@ -381,7 +389,7 @@ public class TikaTest extends AbstractBaseTest
@Test
public void xslxToCsvPoiTest() throws Exception
{
transform(POI, XSLX, CSV, MIMETYPE_OPENXML_SPREADSHEET, MIMETYPE_TEXT_CSV, null,
transform(POI, XLSX, CSV, MIMETYPE_OPENXML_SPREADSHEET, MIMETYPE_TEXT_CSV, null,
EXPECTED_CSV_CONTENT_CONTAINS);
}
@@ -429,7 +437,7 @@ public class TikaTest extends AbstractBaseTest
@Test
public void xlsxEmbedTest() throws Exception
{
mockTransformCommand(XSLX, XSLX, MIMETYPE_OPENXML_SPREADSHEET, false);
mockTransformCommand(XLSX, XLSX, MIMETYPE_OPENXML_SPREADSHEET, false);
String metadata =
"{\"{http://www.alfresco.org/model/content/1.0}author\":\"author1\"," +
@@ -439,12 +447,16 @@ public class TikaTest extends AbstractBaseTest
MockHttpServletRequestBuilder requestBuilder =
super.mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile,
"targetExtension", XSLX,
"targetExtension", XLSX,
"metadata", metadata,
"targetMimetype", MIMETYPE_METADATA_EMBED,
"sourceMimetype", MIMETYPE_OPENXML_SPREADSHEET);
MvcResult result = mockMvc.perform(requestBuilder)
MvcResult mvcResult = mockMvc.perform(requestBuilder)
.andExpect(request().asyncStarted())
.andReturn();
MvcResult result = mockMvc.perform(asyncDispatch(mvcResult))
.andExpect(MockMvcResultMatchers.status().is(OK.value()))
.andExpect(MockMvcResultMatchers.header().string("Content-Disposition",
"attachment; filename*=UTF-8''transform." + targetExtension)).

View File

@@ -152,7 +152,7 @@ public class TikaTransformationIT
allTargets("quick.txt", "text/plain"),
allTargets("quick.vsd", "application/vnd.visio"),
allTargets("quick.xls", "application/vnd.ms-excel"),
allTargets("quick.xslx",
allTargets("quick.xlsx",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
allTargets("quick.zip", "application/zip"),
allTargets("quick.tar", "application/x-tar"),

View File

@@ -987,7 +987,7 @@
]
},
{
"transformerName": "SamplePoiMetadataEmbedder",
"transformerName": "PoiMetadataEmbedder",
"supportedSourceAndTargetList": [
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"}
],

View File

@@ -58,6 +58,7 @@ import static org.alfresco.transform.common.Mimetype.MIMETYPE_VISIO_2013;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_WORD;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_WORDPERFECT;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XHTML;
import static org.alfresco.transform.common.TransformerDebug.MIMETYPE_METADATA_EMBED;
import static org.alfresco.transform.common.TransformerDebug.MIMETYPE_METADATA_EXTRACT;
@@ -116,7 +117,8 @@ public class ExtensionService
Map.entry(MIMETYPE_DITA, "dita"),
Map.entry(MIMETYPE_OPENXML_SPREADSHEET_TEMPLATE, "xltx"),
Map.entry(MIMETYPE_IMAGE_SVG, "svg"),
Map.entry(MIMETYPE_TEXT_PLAIN, "txt")
Map.entry(MIMETYPE_TEXT_PLAIN, "txt"),
Map.entry(MIMETYPE_XHTML, "xhtml")
);
public static String getExtensionForTargetMimetype(String targetMimetype, String sourceMimetype)