REPO-5191 Bug: T-Engine should provide mapping rather than the repo. (#316)

Bug found while reviewing documents on how to create a custom metadata extractor. The original refactor had left the repo doing the mapping. It should have been passing the fully qualified repo properties to the T-Engine to do the mapping.

Linked to:
    Alfresco/alfresco-community-repo#227
    Alfresco/acs-packaging#1826
This commit is contained in:
Alan Davis 2021-01-06 22:25:40 +00:00 committed by GitHub
parent 6da39399db
commit 2fd11d5aed
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 267 additions and 14 deletions

View File

@ -83,4 +83,12 @@ public class AIOControllerTikaTest extends TikaControllerTest
// Ignore the test in super class as the way the AIO transformer provides config is fundamentally different.
}
/**
 * Deliberately empty override that disables the inherited {@code xlsxEmbedTest}
 * for the All-In-One (AIO) variant of the Tika controller tests.
 */
@Test
@Override
public void xlsxEmbedTest()
{
// Ignore the test in the super class: the way the AIO transformer provides its config is
// fundamentally different. It uses the real class path rather than the test one.
}
}

View File

@ -13,6 +13,10 @@
],
"metadataOptions": [
{"value": {"name": "extractMapping"}}
],
"metadataEmbedOptions": [
{"value": {"name": "metadata", "required": true}},
{"value": {"name": "targetEncoding"}}
]
},
"transformers": [
@ -981,6 +985,15 @@
"transformOptions": [
"metadataOptions"
]
},
{
"transformerName": "SamplePoiMetadataEmbedder",
"supportedSourceAndTargetList": [
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"}
],
"transformOptions": [
"metadataEmbedOptions"
]
}
]
}

View File

@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@ -32,6 +32,8 @@ import org.alfresco.transformer.executors.RuntimeExec;
import org.alfresco.transformer.model.FileRefEntity;
import org.alfresco.transformer.model.FileRefResponse;
import org.alfresco.transformer.probes.ProbeTestTransform;
import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
@ -51,6 +53,7 @@ import org.springframework.test.web.servlet.request.MockHttpServletRequestBuilde
import org.springframework.test.web.servlet.request.MockMvcRequestBuilders;
import javax.servlet.http.HttpServletRequest;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
@ -79,6 +82,7 @@ import static org.alfresco.transformer.executors.Tika.XML;
import static org.alfresco.transformer.executors.Tika.XSLX;
import static org.alfresco.transformer.executors.Tika.ZIP;
import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_HTML;
import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_METADATA_EMBED;
import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_OPENXML_PRESENTATION;
import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET;
import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING;
@ -539,6 +543,43 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
EXPECTED_TEXT_CONTENT_CONTAINS);
}
/**
 * Posts an XLSX source to {@code /transform} with the metadata-embed target mimetype and
 * checks that the returned workbook carries the embedded values.
 *
 * The {@code metadata} option is sent as JSON keyed by fully qualified repository property
 * names; the assertions expect author, title and description to arrive as core properties
 * (a multi-valued description joined with {@code ", "}) and {@code created} as a custom
 * property.
 */
@Test
public void xlsxEmbedTest() throws Exception
{
    mockTransformCommand(XSLX, XSLX, MIMETYPE_OPENXML_SPREADSHEET, false);

    String metadata =
        "{\"{http://www.alfresco.org/model/content/1.0}author\":\"author1\"," +
        "\"{http://www.alfresco.org/model/content/1.0}title\":\"title1\"," +
        "\"{http://www.alfresco.org/model/content/1.0}description\":[\"desc1\",\"desc2\"]," +
        "\"{http://www.alfresco.org/model/content/1.0}created\":\"created1\"}";

    MockHttpServletRequestBuilder requestBuilder =
        super.mockMvcRequest("/transform", sourceFile,
            "targetExtension", XSLX,
            "metadata", metadata,
            "targetMimetype", MIMETYPE_METADATA_EMBED,
            "sourceMimetype", MIMETYPE_OPENXML_SPREADSHEET);

    MvcResult result = mockMvc.perform(requestBuilder)
        .andExpect(status().is(OK.value()))
        .andExpect(header().string("Content-Disposition",
            "attachment; filename*= UTF-8''quick." + targetExtension))
        .andReturn();

    byte[] bytes = result.getResponse().getContentAsByteArray();

    // Close the workbook when done so the resources it holds are released
    // even if one of the assertions fails.
    try (XSSFWorkbook workbook = new XSSFWorkbook(new ByteArrayInputStream(bytes)))
    {
        POIXMLProperties props = workbook.getProperties();
        POIXMLProperties.CoreProperties coreProp = props.getCoreProperties();
        POIXMLProperties.CustomProperties custProp = props.getCustomProperties();

        assertEquals("author1", coreProp.getCreator());
        assertEquals("title1", coreProp.getTitle());
        assertEquals("desc1, desc2", coreProp.getDescription()); // multi value
        assertEquals("created1", custProp.getProperty("created").getLpwstr());
    }
}
@Test
public void pdfToTxtExtractBookmarksTest() throws Exception
{

View File

@ -13,6 +13,10 @@
],
"metadataOptions": [
{"value": {"name": "extractMapping"}}
],
"metadataEmbedOptions": [
{"value": {"name": "metadata", "required": true}},
{"value": {"name": "targetEncoding"}}
]
},
"transformers": [
@ -981,6 +985,15 @@
"transformOptions": [
"metadataOptions"
]
},
{
"transformerName": "SamplePoiMetadataEmbedder",
"supportedSourceAndTargetList": [
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"}
],
"transformOptions": [
"metadataEmbedOptions"
]
}
]
}

View File

@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@ -44,7 +44,6 @@ import org.xml.sax.SAXException;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Map;
import java.util.StringJoiner;
@ -81,6 +80,7 @@ public class TikaJavaExecutor implements JavaExecutor
.build();
private final Map<String, AbstractTikaMetadataExtractor> metadataEmbedder = ImmutableMap
.<String, AbstractTikaMetadataExtractor>builder()
.put("SamplePoiMetadataEmbedder", new PoiMetadataExtractor())
.build();
public TikaJavaExecutor()
@ -119,7 +119,7 @@ public class TikaJavaExecutor implements JavaExecutor
}
@Override
public void call(File sourceFile, File targetFile, String... args) throws Exception
public void call(File sourceFile, File targetFile, String... args)
{
args = buildArgs(sourceFile, targetFile, args);
tika.transform(args);

View File

@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@ -53,6 +53,7 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Locale;
@ -316,9 +317,7 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
return;
}
Metadata metadataToEmbed = new Metadata();
Map<String, String> metadataAsStrings = getMetadata(transformOptions);
metadataAsStrings.forEach((k,v)->metadataToEmbed.add(k, v));
Metadata metadataToEmbed = getTikaMetadata(transformOptions);
try (InputStream inputStream = new FileInputStream(sourceFile);
OutputStream outputStream = new FileOutputStream(targetFile))
@ -327,6 +326,46 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
}
}
/**
 * Builds the Tika {@link Metadata} to embed from the metadata supplied in the transform options.
 *
 * Each property returned by {@code getMetadata(transformOptions)} is copied across under its
 * key. A {@link Collection} value contributes one Tika value per element; any other value
 * contributes a single Tika value. Only {@link String} values can be embedded: a property
 * whose value is {@code null} is skipped, and so is any {@code null} collection element.
 * Values of any other type are skipped and reported at info level.
 *
 * @param transformOptions the transform options holding the metadata to embed
 * @return the metadata to pass to the embedder; never {@code null}
 */
private Metadata getTikaMetadata(Map<String, String> transformOptions)
{
    Metadata metadataToEmbed = new Metadata();
    Map<String, Serializable> properties = getMetadata(transformOptions);
    for (Map.Entry<String, Serializable> property : properties.entrySet())
    {
        String metadataKey = property.getKey();
        Serializable value = property.getValue();
        if (value == null)
        {
            continue;
        }

        if (value instanceof Collection<?>)
        {
            for (Object singleValue : (Collection<?>) value)
            {
                addStringValue(metadataToEmbed, metadataKey, singleValue);
            }
        }
        else
        {
            addStringValue(metadataToEmbed, metadataKey, value);
        }
    }
    return metadataToEmbed;
}

/**
 * Adds {@code value} to {@code metadataToEmbed} under {@code metadataKey} when it is a String.
 * A {@code null} value is ignored silently; any other non-String value is logged and ignored.
 */
private void addStringValue(Metadata metadataToEmbed, String metadataKey, Object value)
{
    if (value instanceof String)
    {
        metadataToEmbed.add(metadataKey, (String) value);
    }
    else if (value != null)
    {
        // An explicit type check replaces the previous catch of ClassCastException.
        logger.info("Could not convert " + metadataKey + ": " +
                value.getClass().getName() + " is not a String");
    }
}
private Serializable getMetadataValues(Metadata metadata, String key)
{
// Use Set to prevent duplicates.

View File

@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@ -26,11 +26,24 @@
*/
package org.alfresco.transformer.metadataExtractors;
import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.tika.embedder.Embedder;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Collections;
import java.util.Set;
import java.util.StringJoiner;
/**
* POI-based metadata extractor for Office 07 documents. See http://poi.apache.org/ for information on POI.
*
@ -46,6 +59,34 @@ import org.slf4j.LoggerFactory;
*
* Uses Apache Tika
*
* Also includes a sample POI metadata embedder to demonstrate it is possible to add custom T-Engines that will add
* metadata. This is not production code so no supported mimetypes exist in the {@code tika_engine_config.json}.
* Adding the following would make it available:
*
* <pre>
* {
* "transformOptions": {
* ...
* "metadataEmbedOptions": [
* {"value": {"name": "metadata", "required": true}}
* ]
* },
* "transformers": [
* ...
* {
* "transformerName": "SamplePoiMetadataEmbedder",
* "supportedSourceAndTargetList": [
* ...
* {"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"}
* ],
* "transformOptions": [
* "metadataEmbedOptions"
* ]
* }
* ]
* }
* </pre>
* @author Nick Burch
* @author Neil McErlean
* @author Dmitry Velichkevich
@ -65,4 +106,70 @@ public class PoiMetadataExtractor extends AbstractTikaMetadataExtractor
{
return new OOXMLParser();
}
/**
 * Supplies the embedder used to write metadata into a document.
 *
 * @return a new {@link SamplePoiEmbedder} instance on every call
 */
@Override
protected Embedder getEmbedder()
{
return new SamplePoiEmbedder();
}
/**
 * Sample {@link Embedder} that writes metadata into an XLSX workbook using POI.
 *
 * The metadata names {@code author}, {@code title} and {@code description} are written to
 * the workbook's core properties; every other name is added as a custom property.
 * Multi-valued metadata is flattened into a single string joined with {@code ", "}.
 */
private static class SamplePoiEmbedder implements Embedder
{
    private static final Set<MediaType> SUPPORTED_EMBED_TYPES =
            Collections.singleton(MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"));

    /**
     * @param parseContext not used
     * @return the single supported media type (XLSX spreadsheets)
     */
    @Override
    public Set<MediaType> getSupportedEmbedTypes(ParseContext parseContext)
    {
        return SUPPORTED_EMBED_TYPES;
    }

    /**
     * Reads the workbook from {@code inputStream}, applies {@code metadata} to its
     * properties and writes the updated workbook to {@code outputStream}.
     *
     * @param metadata     the values to embed, keyed by document property name
     * @param inputStream  the source XLSX document
     * @param outputStream receives the XLSX document with the metadata embedded
     * @param parseContext not used
     * @throws IOException if the workbook cannot be read, written or closed
     */
    @Override
    public void embed(Metadata metadata, InputStream inputStream, OutputStream outputStream, ParseContext parseContext)
            throws IOException
    {
        // try-with-resources releases the workbook even when reading or writing fails.
        try (XSSFWorkbook workbook = new XSSFWorkbook(inputStream))
        {
            POIXMLProperties props = workbook.getProperties();
            POIXMLProperties.CoreProperties coreProp = props.getCoreProperties();
            POIXMLProperties.CustomProperties custProp = props.getCustomProperties();

            for (String name : metadata.names())
            {
                String value = metadata.isMultiValued(name)
                        ? String.join(", ", metadata.getValues(name))
                        : metadata.get(name);

                switch (name)
                {
                    case "author":
                        coreProp.setCreator(value);
                        break;
                    case "title":
                        coreProp.setTitle(value);
                        break;
                    case "description":
                        coreProp.setDescription(value);
                        break;
                    // There are other core values but this is sample code, so we will assume it is a custom value.
                    default:
                        custProp.addProperty(name, value);
                        break;
                }
            }

            workbook.write(outputStream);
        }
    }
}
}

View File

@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005-2020 Alfresco Software Limited
* Copyright (C) 2005-2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@ -140,7 +140,7 @@ public abstract class AbstractMetadataExtractor
// Default nothing, as embedding is not supported in most cases
}
protected Map<String, String> getMetadata(Map<String, String> transformOptions)
protected Map<String, Serializable> getMetadata(Map<String, String> transformOptions)
{
String metadataAsJson = transformOptions.get(METADATA);
if (metadataAsJson == null)
@ -150,8 +150,10 @@ public abstract class AbstractMetadataExtractor
try
{
TypeReference<HashMap<String, String>> typeRef = new TypeReference<>() {};
return jsonObjectMapper.readValue(metadataAsJson, typeRef);
TypeReference<HashMap<String, Serializable>> typeRef = new TypeReference<>() {};
HashMap<String, Serializable> systemProperties = jsonObjectMapper.readValue(metadataAsJson, typeRef);
Map<String, Serializable> rawProperties = mapSystemToRaw(systemProperties);
return rawProperties;
}
catch (JsonProcessingException e)
{
@ -159,6 +161,36 @@ public abstract class AbstractMetadataExtractor
}
}
/**
 * Translates values keyed by system (model) property name into values keyed by the
 * document metadata keys configured in {@code embedMapping}.
 *
 * A system property with no entry in {@code embedMapping} is dropped. A system property
 * mapped to several metadata keys has its value stored under each of them.
 *
 * @param systemMetadata values keyed by system property name
 * @return a new map of the same values keyed by metadata key
 */
private Map<String, Serializable> mapSystemToRaw(Map<String, Serializable> systemMetadata)
{
    Map<String, Serializable> metadataProperties = new HashMap<>(systemMetadata.size() * 2 + 1);

    systemMetadata.forEach((modelProperty, documentValue) ->
    {
        // Only properties that have a mapping are carried over; the rest are ignored.
        if (embedMapping.containsKey(modelProperty))
        {
            for (String metadataKey : embedMapping.get(modelProperty))
            {
                metadataProperties.put(metadataKey, documentValue);
            }
        }
    });

    if (logger.isDebugEnabled())
    {
        logger.debug(
                "Converted system model values to metadata values: \n" +
                " System Properties: " + systemMetadata + "\n" +
                " Metadata Properties: " + metadataProperties);
    }
    return metadataProperties;
}
protected Map<String, Set<String>> getExtractMapping()
{
return Collections.unmodifiableMap(extractMapping.get());