mirror of
https://github.com/Alfresco/alfresco-transform-core.git
synced 2025-05-12 17:04:48 +00:00
REPO-5191 Bug: T-Engine should provide mapping rather than the repo. (#316)
Bug found while reviewing documents on how to create a custom metadata extractor. The original refactor had left the repo doing the mapping. It should have been passing the fully qualified repo properties to the T-Engine to do the mapping. Linked to: Alfresco/alfresco-community-repo#227 Alfresco/acs-packaging#1826
This commit is contained in:
parent
6da39399db
commit
2fd11d5aed
@ -75,7 +75,7 @@ public class AIOControllerTikaTest extends TikaControllerTest
|
||||
// Ignore the test in super class as the way the AIO transformer provides config is fundamentally different.
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void testGetInfoFromConfigWithNoTransformOptions()
|
||||
@ -83,4 +83,12 @@ public class AIOControllerTikaTest extends TikaControllerTest
|
||||
// Ignore the test in super class as the way the AIO transformer provides config is fundamentally different.
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void xlsxEmbedTest()
|
||||
{
|
||||
// Ignore the test in super class as the way the AIO transformer provides config is fundamentally different.
|
||||
// It uses the real class path rather than the test one.
|
||||
}
|
||||
}
|
@ -13,6 +13,10 @@
|
||||
],
|
||||
"metadataOptions": [
|
||||
{"value": {"name": "extractMapping"}}
|
||||
],
|
||||
"metadataEmbedOptions": [
|
||||
{"value": {"name": "metadata", "required": true}},
|
||||
{"value": {"name": "targetEncoding"}}
|
||||
]
|
||||
},
|
||||
"transformers": [
|
||||
@ -981,6 +985,15 @@
|
||||
"transformOptions": [
|
||||
"metadataOptions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"transformerName": "SamplePoiMetadataEmbedder",
|
||||
"supportedSourceAndTargetList": [
|
||||
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"}
|
||||
],
|
||||
"transformOptions": [
|
||||
"metadataEmbedOptions"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@ -32,6 +32,8 @@ import org.alfresco.transformer.executors.RuntimeExec;
|
||||
import org.alfresco.transformer.model.FileRefEntity;
|
||||
import org.alfresco.transformer.model.FileRefResponse;
|
||||
import org.alfresco.transformer.probes.ProbeTestTransform;
|
||||
import org.apache.poi.ooxml.POIXMLProperties;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
@ -51,6 +53,7 @@ import org.springframework.test.web.servlet.request.MockHttpServletRequestBuilde
|
||||
import org.springframework.test.web.servlet.request.MockMvcRequestBuilders;
|
||||
|
||||
import javax.servlet.http.HttpServletRequest;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
@ -79,6 +82,7 @@ import static org.alfresco.transformer.executors.Tika.XML;
|
||||
import static org.alfresco.transformer.executors.Tika.XSLX;
|
||||
import static org.alfresco.transformer.executors.Tika.ZIP;
|
||||
import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_HTML;
|
||||
import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_METADATA_EMBED;
|
||||
import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_OPENXML_PRESENTATION;
|
||||
import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET;
|
||||
import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING;
|
||||
@ -539,6 +543,43 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
|
||||
EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xlsxEmbedTest() throws Exception
|
||||
{
|
||||
mockTransformCommand(XSLX, XSLX, MIMETYPE_OPENXML_SPREADSHEET, false);
|
||||
|
||||
String metadata =
|
||||
"{\"{http://www.alfresco.org/model/content/1.0}author\":\"author1\"," +
|
||||
"\"{http://www.alfresco.org/model/content/1.0}title\":\"title1\"," +
|
||||
"\"{http://www.alfresco.org/model/content/1.0}description\":[\"desc1\",\"desc2\"]," +
|
||||
"\"{http://www.alfresco.org/model/content/1.0}created\":\"created1\"}";
|
||||
|
||||
MockHttpServletRequestBuilder requestBuilder =
|
||||
super.mockMvcRequest("/transform", sourceFile,
|
||||
"targetExtension", XSLX,
|
||||
"metadata", metadata,
|
||||
"targetMimetype", MIMETYPE_METADATA_EMBED,
|
||||
"sourceMimetype", MIMETYPE_OPENXML_SPREADSHEET);
|
||||
|
||||
MvcResult result = mockMvc.perform(requestBuilder)
|
||||
.andExpect(status().is(OK.value()))
|
||||
.andExpect(header().string("Content-Disposition",
|
||||
"attachment; filename*= UTF-8''quick." + targetExtension)).
|
||||
andReturn();
|
||||
|
||||
byte[] bytes = result.getResponse().getContentAsByteArray();
|
||||
ByteArrayInputStream inputStream = new ByteArrayInputStream(bytes);
|
||||
XSSFWorkbook workbook = new XSSFWorkbook(inputStream);
|
||||
POIXMLProperties props = workbook.getProperties();
|
||||
POIXMLProperties.CoreProperties coreProp = props.getCoreProperties();
|
||||
POIXMLProperties.CustomProperties custProp = props.getCustomProperties();
|
||||
|
||||
assertEquals("author1", coreProp.getCreator());
|
||||
assertEquals("title1", coreProp.getTitle());
|
||||
assertEquals("desc1, desc2", coreProp.getDescription()); // multi value
|
||||
assertEquals("created1", custProp.getProperty("created").getLpwstr());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void pdfToTxtExtractBookmarksTest() throws Exception
|
||||
{
|
||||
|
@ -13,6 +13,10 @@
|
||||
],
|
||||
"metadataOptions": [
|
||||
{"value": {"name": "extractMapping"}}
|
||||
],
|
||||
"metadataEmbedOptions": [
|
||||
{"value": {"name": "metadata", "required": true}},
|
||||
{"value": {"name": "targetEncoding"}}
|
||||
]
|
||||
},
|
||||
"transformers": [
|
||||
@ -981,6 +985,15 @@
|
||||
"transformOptions": [
|
||||
"metadataOptions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"transformerName": "SamplePoiMetadataEmbedder",
|
||||
"supportedSourceAndTargetList": [
|
||||
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"}
|
||||
],
|
||||
"transformOptions": [
|
||||
"metadataEmbedOptions"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@ -44,7 +44,6 @@ import org.xml.sax.SAXException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Map;
|
||||
import java.util.StringJoiner;
|
||||
@ -81,6 +80,7 @@ public class TikaJavaExecutor implements JavaExecutor
|
||||
.build();
|
||||
private final Map<String, AbstractTikaMetadataExtractor> metadataEmbedder = ImmutableMap
|
||||
.<String, AbstractTikaMetadataExtractor>builder()
|
||||
.put("SamplePoiMetadataEmbedder", new PoiMetadataExtractor())
|
||||
.build();
|
||||
|
||||
public TikaJavaExecutor()
|
||||
@ -119,7 +119,7 @@ public class TikaJavaExecutor implements JavaExecutor
|
||||
}
|
||||
|
||||
@Override
|
||||
public void call(File sourceFile, File targetFile, String... args) throws Exception
|
||||
public void call(File sourceFile, File targetFile, String... args)
|
||||
{
|
||||
args = buildArgs(sourceFile, targetFile, args);
|
||||
tika.transform(args);
|
||||
|
@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@ -53,6 +53,7 @@ import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Locale;
|
||||
@ -316,9 +317,7 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
|
||||
return;
|
||||
}
|
||||
|
||||
Metadata metadataToEmbed = new Metadata();
|
||||
Map<String, String> metadataAsStrings = getMetadata(transformOptions);
|
||||
metadataAsStrings.forEach((k,v)->metadataToEmbed.add(k, v));
|
||||
Metadata metadataToEmbed = getTikaMetadata(transformOptions);
|
||||
|
||||
try (InputStream inputStream = new FileInputStream(sourceFile);
|
||||
OutputStream outputStream = new FileOutputStream(targetFile))
|
||||
@ -327,6 +326,46 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
|
||||
}
|
||||
}
|
||||
|
||||
private Metadata getTikaMetadata(Map<String, String> transformOptions)
|
||||
{
|
||||
Metadata metadataToEmbed = new Metadata();
|
||||
Map<String, Serializable> properties = getMetadata(transformOptions);
|
||||
for (String metadataKey : properties.keySet())
|
||||
{
|
||||
Serializable value = properties.get(metadataKey);
|
||||
if (value == null)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
if (value instanceof Collection<?>)
|
||||
{
|
||||
for (Object singleValue : (Collection<?>) value)
|
||||
{
|
||||
try
|
||||
{
|
||||
metadataToEmbed.add(metadataKey, (String)singleValue);
|
||||
}
|
||||
catch (ClassCastException e)
|
||||
{
|
||||
logger.info("Could not convert " + metadataKey + ": " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
try
|
||||
{
|
||||
metadataToEmbed.add(metadataKey, (String)value);
|
||||
}
|
||||
catch (ClassCastException e)
|
||||
{
|
||||
logger.info("Could not convert " + metadataKey + ": " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
return metadataToEmbed;
|
||||
}
|
||||
|
||||
private Serializable getMetadataValues(Metadata metadata, String key)
|
||||
{
|
||||
// Use Set to prevent duplicates.
|
||||
|
@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@ -26,11 +26,24 @@
|
||||
*/
|
||||
package org.alfresco.transformer.metadataExtractors;
|
||||
|
||||
import org.apache.poi.ooxml.POIXMLProperties;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.apache.tika.embedder.Embedder;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
/**
|
||||
* POI-based metadata extractor for Office 07 documents. See http://poi.apache.org/ for information on POI.
|
||||
*
|
||||
@ -46,6 +59,34 @@ import org.slf4j.LoggerFactory;
|
||||
*
|
||||
* Uses Apache Tika
|
||||
*
|
||||
* Also includes a sample POI metadata embedder to demonstrate it is possible to add custom T-Engines that will add
|
||||
* metadata. This is not production code so no supported mimetypes exist in the {@code tika_engine_config.json}.
|
||||
* Adding the following would make it available:
|
||||
*
|
||||
* <pre>
|
||||
* {
|
||||
* "transformOptions": {
|
||||
* ...
|
||||
* "metadataEmbedOptions": [
|
||||
* {"value": {"name": "metadata", "required": true}}
|
||||
* ]
|
||||
* },
|
||||
* "transformers": [
|
||||
* ...
|
||||
* {
|
||||
* "transformerName": "SamplePoiMetadataEmbedder",
|
||||
* "supportedSourceAndTargetList": [
|
||||
* ...
|
||||
* {"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"}
|
||||
* ],
|
||||
* "transformOptions": [
|
||||
* "metadataEmbedOptions"
|
||||
* ]
|
||||
* }
|
||||
* ]
|
||||
* }
|
||||
* </pre>
|
||||
|
||||
* @author Nick Burch
|
||||
* @author Neil McErlean
|
||||
* @author Dmitry Velichkevich
|
||||
@ -65,4 +106,70 @@ public class PoiMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
return new OOXMLParser();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Embedder getEmbedder()
|
||||
{
|
||||
return new SamplePoiEmbedder();
|
||||
}
|
||||
|
||||
private static class SamplePoiEmbedder implements Embedder
|
||||
{
|
||||
private static final Set<MediaType> SUPPORTED_EMBED_TYPES =
|
||||
Collections.singleton(MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
|
||||
|
||||
@Override
|
||||
public Set<MediaType> getSupportedEmbedTypes(ParseContext parseContext)
|
||||
{
|
||||
return SUPPORTED_EMBED_TYPES;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void embed(Metadata metadata, InputStream inputStream, OutputStream outputStream, ParseContext parseContext)
|
||||
throws IOException
|
||||
{
|
||||
XSSFWorkbook workbook = new XSSFWorkbook(inputStream);
|
||||
POIXMLProperties props = workbook.getProperties();
|
||||
|
||||
POIXMLProperties.CoreProperties coreProp = props.getCoreProperties();
|
||||
POIXMLProperties.CustomProperties custProp = props.getCustomProperties();
|
||||
|
||||
for (String name : metadata.names())
|
||||
{
|
||||
metadata.isMultiValued("description");
|
||||
String value = null;
|
||||
if (metadata.isMultiValued(name))
|
||||
{
|
||||
String[] values = metadata.getValues(name);
|
||||
StringJoiner sj = new StringJoiner(", ");
|
||||
for (String s : values)
|
||||
{
|
||||
sj.add(s);
|
||||
}
|
||||
value = sj.toString();
|
||||
}
|
||||
else
|
||||
{
|
||||
value = metadata.get(name);
|
||||
}
|
||||
switch (name)
|
||||
{
|
||||
case "author":
|
||||
coreProp.setCreator(value);
|
||||
break;
|
||||
case "title":
|
||||
coreProp.setTitle(value);
|
||||
break;
|
||||
case "description":
|
||||
coreProp.setDescription(value);
|
||||
break;
|
||||
// There are other core values but this is sample code, so we will assume it is a custom value.
|
||||
default:
|
||||
custProp.addProperty(name, value);
|
||||
break;
|
||||
}
|
||||
}
|
||||
workbook.write(outputStream);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005-2020 Alfresco Software Limited
|
||||
* Copyright (C) 2005-2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@ -140,7 +140,7 @@ public abstract class AbstractMetadataExtractor
|
||||
// Default nothing, as embedding is not supported in most cases
|
||||
}
|
||||
|
||||
protected Map<String, String> getMetadata(Map<String, String> transformOptions)
|
||||
protected Map<String, Serializable> getMetadata(Map<String, String> transformOptions)
|
||||
{
|
||||
String metadataAsJson = transformOptions.get(METADATA);
|
||||
if (metadataAsJson == null)
|
||||
@ -150,8 +150,10 @@ public abstract class AbstractMetadataExtractor
|
||||
|
||||
try
|
||||
{
|
||||
TypeReference<HashMap<String, String>> typeRef = new TypeReference<>() {};
|
||||
return jsonObjectMapper.readValue(metadataAsJson, typeRef);
|
||||
TypeReference<HashMap<String, Serializable>> typeRef = new TypeReference<>() {};
|
||||
HashMap<String, Serializable> systemProperties = jsonObjectMapper.readValue(metadataAsJson, typeRef);
|
||||
Map<String, Serializable> rawProperties = mapSystemToRaw(systemProperties);
|
||||
return rawProperties;
|
||||
}
|
||||
catch (JsonProcessingException e)
|
||||
{
|
||||
@ -159,6 +161,36 @@ public abstract class AbstractMetadataExtractor
|
||||
}
|
||||
}
|
||||
|
||||
private Map<String, Serializable> mapSystemToRaw(Map<String, Serializable> systemMetadata)
|
||||
{
|
||||
Map<String, Serializable> metadataProperties = new HashMap<>(systemMetadata.size() * 2 + 1);
|
||||
for (Map.Entry<String, Serializable> entry : systemMetadata.entrySet())
|
||||
{
|
||||
String modelProperty = entry.getKey();
|
||||
// Check if there is a mapping for this
|
||||
if (!embedMapping.containsKey(modelProperty))
|
||||
{
|
||||
// No mapping - ignore
|
||||
continue;
|
||||
}
|
||||
Serializable documentValue = entry.getValue();
|
||||
Set<String> metadataKeys = embedMapping.get(modelProperty);
|
||||
for (String metadataKey : metadataKeys)
|
||||
{
|
||||
metadataProperties.put(metadataKey, documentValue);
|
||||
}
|
||||
}
|
||||
// Done
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug(
|
||||
"Converted system model values to metadata values: \n" +
|
||||
" System Properties: " + systemMetadata + "\n" +
|
||||
" Metadata Properties: " + metadataProperties);
|
||||
}
|
||||
return metadataProperties;
|
||||
}
|
||||
|
||||
protected Map<String, Set<String>> getExtractMapping()
|
||||
{
|
||||
return Collections.unmodifiableMap(extractMapping.get());
|
||||
|
Loading…
x
Reference in New Issue
Block a user