REPO-4334 Move metadata extraction into T-Engines (#247)

* Metadata extract code added to T-Engines
* Required a refactor of duplicate code to avoid 3x more duplication:
        - try catches used to return exit codes
        - calls to java libraries or commands to external processes
        - building of transform options in controllers, adaptors
* integration tests based on current extracts performed in the repo
* included extract code for libreoffice, and embed code even though not used out of the box any more. There may well be custom extracts using them that move to T-Engines
* removal of unused imports
* minor autoOrient / allowEnlargement bug fixes that were not included in Paddington on the T-Engine side.
This commit is contained in:
Alan Davis
2020-06-11 20:20:22 +01:00
committed by GitHub
parent ca394440bb
commit 06109dee75
158 changed files with 10288 additions and 1454 deletions

View File

@@ -26,10 +26,7 @@
*/
package org.alfresco.transformer;
import static org.alfresco.transformer.logging.StandardMessages.LICENCE;
import java.util.Arrays;
import io.micrometer.core.instrument.MeterRegistry;
import org.alfresco.transformer.executors.LibreOfficeJavaExecutor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -43,7 +40,9 @@ import org.springframework.boot.context.event.ApplicationReadyEvent;
import org.springframework.context.annotation.Bean;
import org.springframework.context.event.EventListener;
import io.micrometer.core.instrument.MeterRegistry;
import java.util.Arrays;
import static org.alfresco.transformer.logging.StandardMessages.LICENCE;
@SpringBootApplication
@EnableAutoConfiguration(exclude = {DataSourceAutoConfiguration.class})
@@ -60,6 +59,9 @@ public class Application
return registry -> registry.config().commonTags("containerName", containerName);
}
// To run the LibreOffice T-Engine from the command line on a Mac, you generally need to
// install LibreOffice and add: -Dtransform.core.libreoffice.path=/Applications/LibreOffice.app/Contents/
// to the startup command.
public static void main(String[] args)
{
SpringApplication.run(Application.class, args);

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -26,37 +26,21 @@
*/
package org.alfresco.transformer;
import static org.alfresco.transformer.fs.FileManager.createAttachment;
import static org.alfresco.transformer.fs.FileManager.createSourceFile;
import static org.alfresco.transformer.fs.FileManager.createTargetFile;
import static org.alfresco.transformer.fs.FileManager.createTargetFileName;
import static org.springframework.http.HttpStatus.OK;
import static org.springframework.http.MediaType.MULTIPART_FORM_DATA_VALUE;
import java.io.File;
import java.util.Map;
import javax.annotation.PostConstruct;
import javax.servlet.http.HttpServletRequest;
import org.alfresco.transformer.executors.LibreOfficeJavaExecutor;
import org.alfresco.transformer.logging.LogEntry;
import org.alfresco.transformer.probes.ProbeTestTransform;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import javax.annotation.PostConstruct;
import java.io.File;
import java.util.Collections;
import java.util.Map;
/**
* Controller for the Docker based LibreOffice transformer.
*
*
* Status Codes:
*
* 200 Success
@@ -112,44 +96,23 @@ public class LibreOfficeController extends AbstractTransformerController
@Override
protected void executeTransformCommand(File sourceFile, File targetFile)
{
transform(null, null, null, Collections.emptyMap(), sourceFile, targetFile);
javaExecutor.call(sourceFile, targetFile);
}
};
}
//todo: the "timeout" request parameter is ignored; the timeout is preset at JodConverter creation
@PostMapping(value = "/transform", consumes = MULTIPART_FORM_DATA_VALUE)
public ResponseEntity<Resource> transform(HttpServletRequest request,
@RequestParam("file") MultipartFile sourceMultipartFile,
@RequestParam("targetExtension") String targetExtension,
@RequestParam(value = "timeout", required = false) Long timeout,
@RequestParam(value = "testDelay", required = false) Long testDelay)
@Override
protected String getTransformerName(final File sourceFile, final String sourceMimetype,
final String targetMimetype, final Map<String, String> transformOptions)
{
String targetFilename = createTargetFileName(sourceMultipartFile.getOriginalFilename(),
targetExtension);
getProbeTestTransform().incrementTransformerCount();
File sourceFile = createSourceFile(request, sourceMultipartFile);
File targetFile = createTargetFile(request, targetFilename);
// Both files are deleted by TransformInterceptor.afterCompletion
javaExecutor.call(sourceFile, targetFile);
final ResponseEntity<Resource> body = createAttachment(targetFilename, targetFile);
LogEntry.setTargetSize(targetFile.length());
long time = LogEntry.setStatusCodeAndMessage(OK.value(), "Success");
time += LogEntry.addDelay(testDelay);
getProbeTestTransform().recordTransformTime(time);
return body;
return null; // does not matter what value is returned, as it is not used because there is only one.
}
@Override
public void processTransform(final File sourceFile, final File targetFile,
final String sourceMimetype, final String targetMimetype,
final Map<String, String> transformOptions, final Long timeout)
protected void transform(String transformName, String sourceMimetype, String targetMimetype,
Map<String, String> transformOptions, File sourceFile, File targetFile)
{
logger.debug("Processing request with: sourceFile '{}', targetFile '{}', transformOptions" +
" '{}', timeout {} ms", sourceFile, targetFile, transformOptions, timeout);
javaExecutor.call(sourceFile, targetFile);
javaExecutor.transform(sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile);
}
}

View File

@@ -0,0 +1 @@
#logging.level.org.alfresco.transformer.LibreOfficeController=debug

View File

@@ -6,6 +6,8 @@
<form method="POST" enctype="multipart/form-data" action="/transform">
<table>
<tr><td><div style="text-align:right">file *</div></td><td><input type="file" name="file" /></td></tr>
<tr><td><div style="text-align:right">sourceMimetype *</div></td><td><input type="text" name="sourceMimetype" value="" /></td></tr>
<tr><td><div style="text-align:right">targetMimetype</div></td><td><input type="text" name="targetMimetype" value="" /></td></tr>
<tr><td><div style="text-align:right">targetExtension *</div></td><td><input type="text" name="targetExtension" value="" /></td></tr>
<tr><td><div style="text-align:right">timeout</div></td><td><input type="text" name="timeout" value="" /></td></tr>
<tr><td><div style="text-align:right">testDelay</div></td><td><input type="text" name="testDelay" value="" /></td></tr>

View File

@@ -0,0 +1,211 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.executors;
import com.sun.star.beans.PropertyValue;
import com.sun.star.beans.UnknownPropertyException;
import com.sun.star.beans.XPropertySet;
import com.sun.star.document.XDocumentInfoSupplier;
import com.sun.star.frame.XComponentLoader;
import com.sun.star.io.IOException;
import com.sun.star.lang.IllegalArgumentException;
import com.sun.star.lang.WrappedTargetException;
import com.sun.star.lang.XComponent;
import com.sun.star.task.ErrorCodeIOException;
import com.sun.star.util.CloseVetoException;
import com.sun.star.util.XCloseable;
import com.sun.star.util.XRefreshable;
import org.artofsolving.jodconverter.office.OfficeContext;
import org.artofsolving.jodconverter.office.OfficeException;
import org.artofsolving.jodconverter.office.OfficeTask;
import java.io.File;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import static org.artofsolving.jodconverter.office.OfficeUtils.SERVICE_DESKTOP;
import static org.artofsolving.jodconverter.office.OfficeUtils.cast;
import static org.artofsolving.jodconverter.office.OfficeUtils.toUrl;
/**
* @deprecated The JodConverterMetadataExtracter has not been in use since 6.0.1
*
* Extracts values from Open Office documents into the following:
* <pre>
* <b>author:</b> -- cm:author
* <b>title:</b> -- cm:title
* <b>description:</b> -- cm:description
* </pre>
*
* @author Neil McErlean
* @author adavis
*/
@Deprecated
public class LibreOfficeExtractMetadataTask implements OfficeTask
{
/*
* These keys are used by Alfresco to map properties into a content model and do need to
* have lower-case initial letters.
*/
private static final String KEY_AUTHOR = "author";
private static final String KEY_TITLE = "title";
private static final String KEY_DESCRIPTION = "description";
private File inputFile;
private Map<String, Serializable> metadata = new HashMap<String, Serializable>();
public LibreOfficeExtractMetadataTask(File inputFile)
{
this.inputFile = inputFile;
}
public Map<String, Serializable> getMetadata()
{
return metadata;
}
public void execute(OfficeContext context)
{
XComponent document = null;
try
{
if (!inputFile.exists())
{
throw new OfficeException("input document not found");
}
XComponentLoader loader = cast(XComponentLoader.class, context
.getService(SERVICE_DESKTOP));
// Need to set the Hidden property to ensure that OOo GUI does not appear.
PropertyValue hiddenOOo = new PropertyValue();
hiddenOOo.Name = "Hidden";
hiddenOOo.Value = Boolean.TRUE;
PropertyValue readOnly = new PropertyValue();
readOnly.Name = "ReadOnly";
readOnly.Value = Boolean.TRUE;
try
{
// TODO The following call fails. Not debugged why as it appears this extractor is not used any more.
document = loader.loadComponentFromURL(toUrl(inputFile), "_blank", 0,
new PropertyValue[]{hiddenOOo, readOnly});
}
catch (IllegalArgumentException illegalArgumentException)
{
throw new OfficeException("could not load document: "
+ inputFile.getName(), illegalArgumentException);
}
catch (ErrorCodeIOException errorCodeIOException)
{
throw new OfficeException("could not load document: "
+ inputFile.getName() + "; errorCode: "
+ errorCodeIOException.ErrCode, errorCodeIOException);
}
catch (IOException ioException)
{
throw new OfficeException("could not load document: "
+ inputFile.getName(), ioException);
}
if (document == null)
{
throw new OfficeException("could not load document: "
+ inputFile.getName());
}
XRefreshable refreshable = cast(XRefreshable.class, document);
if (refreshable != null)
{
refreshable.refresh();
}
XDocumentInfoSupplier docInfoSupplier = cast(XDocumentInfoSupplier.class, document);
XPropertySet propSet = cast(XPropertySet.class, docInfoSupplier.getDocumentInfo());
// The strings below are property names as used by OOo. They need upper-case
// initial letters.
Object author = getPropertyValueIfAvailable(propSet, "Author");
Object description = getPropertyValueIfAvailable(propSet, "Subject");
Object title = getPropertyValueIfAvailable(propSet, "Title");
metadata = new HashMap<String, Serializable>(3);
metadata.put(KEY_AUTHOR, author == null ? null : author.toString());
metadata.put(KEY_DESCRIPTION, description == null ? null : description.toString());
metadata.put(KEY_TITLE, title == null ? null : title.toString());
}
catch (OfficeException officeException)
{
throw officeException;
}
catch (Exception exception)
{
throw new OfficeException("conversion failed", exception);
}
finally
{
if (document != null)
{
XCloseable closeable = cast(XCloseable.class, document);
if (closeable != null)
{
try
{
closeable.close(true);
}
catch (CloseVetoException closeVetoException)
{
// whoever raised the veto should close the document
}
}
else
{
document.dispose();
}
}
}
}
/**
* OOo throws exceptions if we ask for properties that aren't there, so we'll tread carefully.
*
* @param propSet
* @param propertyName property name as used by the OOo API.
* @throws UnknownPropertyException
* @throws WrappedTargetException
*/
private Object getPropertyValueIfAvailable(XPropertySet propSet, String propertyName)
throws UnknownPropertyException, WrappedTargetException
{
if (propSet.getPropertySetInfo().hasPropertyByName(propertyName))
{
return propSet.getPropertyValue(propertyName);
}
else
{
return null;
}
}
}

View File

@@ -26,12 +26,8 @@
*/
package org.alfresco.transformer.executors;
import static org.springframework.http.HttpStatus.BAD_REQUEST;
import static org.springframework.http.HttpStatus.INTERNAL_SERVER_ERROR;
import java.io.File;
import java.io.IOException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.sun.star.task.ErrorCodeIOException;
import org.alfresco.transform.exceptions.TransformException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
@@ -42,7 +38,13 @@ import org.artofsolving.jodconverter.office.OfficeManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.sun.star.task.ErrorCodeIOException;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
import static org.springframework.http.HttpStatus.BAD_REQUEST;
import static org.springframework.http.HttpStatus.INTERNAL_SERVER_ERROR;
/**
* JavaExecutor implementation for running LibreOffice transformations. It loads the
@@ -50,6 +52,8 @@ import com.sun.star.task.ErrorCodeIOException;
*/
public class LibreOfficeJavaExecutor implements JavaExecutor
{
// Transformer id returned by getTransformerId(); a constant, so it is declared final.
private static final String ID = "libreoffice";
private static final Logger logger = LoggerFactory.getLogger(LibreOfficeJavaExecutor.class);
private static final int JODCONVERTER_TRANSFORMATION_ERROR_CODE = 3088;
@@ -58,7 +62,9 @@ public class LibreOfficeJavaExecutor implements JavaExecutor
public static final String LICENCE = "This transformer uses LibreOffice from The Document Foundation. See the license at https://www.libreoffice.org/download/license/ or in /libreoffice.txt";
private JodConverter jodconverter;
private final JodConverter jodconverter;
private final ObjectMapper jsonObjectMapper = new ObjectMapper();
public LibreOfficeJavaExecutor(String path)
{
@@ -76,7 +82,7 @@ public class LibreOfficeJavaExecutor implements JavaExecutor
final JodConverterSharedInstance jodconverter = new JodConverterSharedInstance();
jodconverter.setOfficeHome(LIBREOFFICE_HOME); // jodconverter.officeHome
jodconverter.setOfficeHome(LIBREOFFICE_HOME); // jodconverter.officeHome
jodconverter.setMaxTasksPerProcess("200"); // jodconverter.maxTasksPerProcess
jodconverter.setTaskExecutionTimeout(timeout); // jodconverter.maxTaskExecutionTimeout
jodconverter.setTaskQueueTimeout(timeout); // jodconverter.taskQueueTimeout
@@ -89,6 +95,19 @@ public class LibreOfficeJavaExecutor implements JavaExecutor
return jodconverter;
}
/**
 * Identifies this transformer.
 *
 * @return the value of the {@code ID} constant of this class
 */
@Override
public String getTransformerId()
{
    return ID;
}
/**
 * Transforms {@code sourceFile} into {@code targetFile} by delegating to
 * {@code call(sourceFile, targetFile)}. The transform name, mimetypes and
 * transform options are accepted but not used by this method.
 *
 * @param transformName    not used here
 * @param sourceMimetype   not used here
 * @param targetMimetype   not used here
 * @param transformOptions not used here
 * @param sourceFile       the file to transform
 * @param targetFile       the file to write the result to
 */
@Override
public void transform(String transformName,
                      String sourceMimetype,
                      String targetMimetype,
                      Map<String, String> transformOptions,
                      File sourceFile,
                      File targetFile)
{
    call(sourceFile, targetFile);
}
@Override
public void call(File sourceFile, File targetFile, String... args)
{
@@ -147,7 +166,7 @@ public class LibreOfficeJavaExecutor implements JavaExecutor
PDPage pdfPage = new PDPage();
try (PDDocument pdfDoc = new PDDocument();
PDPageContentStream contentStream = new PDPageContentStream(pdfDoc, pdfPage))
PDPageContentStream ignore = new PDPageContentStream(pdfDoc, pdfPage))
{
// Even though, we want an empty PDF, some libs (e.g. PDFRenderer) object to PDFs
// that have literally nothing in them. So we'll put a content stream in it.
@@ -162,4 +181,48 @@ public class LibreOfficeJavaExecutor implements JavaExecutor
"Error creating empty PDF file", iox);
}
}
/**
 * Runs a {@link LibreOfficeExtractMetadataTask} for {@code sourceFile} on the JodConverter
 * office manager and hands the resulting metadata map to
 * {@code writeMetadataIntoTargetFile} for {@code targetFile}. When debug logging is enabled,
 * each extracted entry is logged as {@code key=value}.
 *
 * @param transformName    not used here
 * @param sourceMimetype   not used here
 * @param targetMimetype   not used here
 * @param transformOptions not used here
 * @param sourceFile       the document to extract metadata from
 * @param targetFile       the file that receives the extracted metadata
 * @throws TransformException with status BAD_REQUEST if the office task fails with an
 *                            OfficeException
 * @deprecated The JodConverterMetadataExtracter has not been in use since 6.0.1.
 * This code exists in case there are custom implementations, that need to be converted to T-Engines.
 * It is simply a copy and paste from the content repository and has received limited testing.
 */
@Override
public void extractMetadata(String transformName, String sourceMimetype, String targetMimetype,
                            Map<String, String> transformOptions,
                            File sourceFile, File targetFile)
{
    final OfficeManager manager = jodconverter.getOfficeManager();
    final LibreOfficeExtractMetadataTask task = new LibreOfficeExtractMetadataTask(sourceFile);

    try
    {
        manager.execute(task);
    }
    catch (OfficeException officeFailure)
    {
        throw new TransformException(BAD_REQUEST.value(),
            "LibreOffice metadata extract failed: \n" +
            " from file: " + sourceFile, officeFailure);
    }

    final Map<String, Serializable> extracted = task.getMetadata();
    if (logger.isDebugEnabled())
    {
        for (Map.Entry<String, Serializable> property : extracted.entrySet())
        {
            logger.debug(property.getKey() + "=" + property.getValue());
        }
    }

    writeMetadataIntoTargetFile(targetFile, extracted);
}
/**
 * Writes the given metadata map to {@code targetFile} using this executor's Jackson
 * object mapper.
 *
 * @param targetFile the file that receives the serialised metadata
 * @param results    the metadata to write
 * @throws TransformException with status INTERNAL_SERVER_ERROR if the write fails with an
 *                            IOException
 */
private void writeMetadataIntoTargetFile(File targetFile, Map<String, Serializable> results)
{
    try
    {
        jsonObjectMapper.writeValue(targetFile, results);
    }
    catch (IOException writeFailure)
    {
        throw new TransformException(
            INTERNAL_SERVER_ERROR.value(),
            "Failed to write metadata to targetFile",
            writeFailure);
    }
}
}