REPO-4334 Move metadata extraction into T-Engines (#247)

* Metadata extract code added to T-Engines
* Required a refactor of duplicate code to avoid 3x more duplication:
        - try catches used to return exit codes
        - calls to java libraries or commands to external processes
        - building of transform options in controllers, adaptors
* integration tests based on current extracts performed in the repo
* included extract code for libreoffice, and embed code even though not used out of the box any more. There may well be custom extracts using them that move to T-Engines
* removal of unused imports
* minor autoOrient / allowEnlargement bug fixes that were not included in Paddington on the T-Engine side.
This commit is contained in:
Alan Davis
2020-06-11 20:20:22 +01:00
committed by GitHub
parent ca394440bb
commit 06109dee75
158 changed files with 10288 additions and 1454 deletions

View File

@@ -0,0 +1,211 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.executors;
import com.sun.star.beans.PropertyValue;
import com.sun.star.beans.UnknownPropertyException;
import com.sun.star.beans.XPropertySet;
import com.sun.star.document.XDocumentInfoSupplier;
import com.sun.star.frame.XComponentLoader;
import com.sun.star.io.IOException;
import com.sun.star.lang.IllegalArgumentException;
import com.sun.star.lang.WrappedTargetException;
import com.sun.star.lang.XComponent;
import com.sun.star.task.ErrorCodeIOException;
import com.sun.star.util.CloseVetoException;
import com.sun.star.util.XCloseable;
import com.sun.star.util.XRefreshable;
import org.artofsolving.jodconverter.office.OfficeContext;
import org.artofsolving.jodconverter.office.OfficeException;
import org.artofsolving.jodconverter.office.OfficeTask;
import java.io.File;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import static org.artofsolving.jodconverter.office.OfficeUtils.SERVICE_DESKTOP;
import static org.artofsolving.jodconverter.office.OfficeUtils.cast;
import static org.artofsolving.jodconverter.office.OfficeUtils.toUrl;
/**
* @deprecated The JodConverterMetadataExtracter has not been in use since 6.0.1
*
* Extracts values from Open Office documents into the following:
* <pre>
* <b>author:</b> -- cm:author
* <b>title:</b> -- cm:title
* <b>description:</b> -- cm:description
* </pre>
*
* @author Neil McErlean
* @author adavis
*/
@Deprecated
public class LibreOfficeExtractMetadataTask implements OfficeTask
{
/*
* These keys are used by Alfresco to map properties into a content model and do need to
* have lower-case initial letters.
*/
private static final String KEY_AUTHOR = "author";
private static final String KEY_TITLE = "title";
private static final String KEY_DESCRIPTION = "description";
private File inputFile;
private Map<String, Serializable> metadata = new HashMap<String, Serializable>();
public LibreOfficeExtractMetadataTask(File inputFile)
{
this.inputFile = inputFile;
}
public Map<String, Serializable> getMetadata()
{
return metadata;
}
public void execute(OfficeContext context)
{
XComponent document = null;
try
{
if (!inputFile.exists())
{
throw new OfficeException("input document not found");
}
XComponentLoader loader = cast(XComponentLoader.class, context
.getService(SERVICE_DESKTOP));
// Need to set the Hidden property to ensure that OOo GUI does not appear.
PropertyValue hiddenOOo = new PropertyValue();
hiddenOOo.Name = "Hidden";
hiddenOOo.Value = Boolean.TRUE;
PropertyValue readOnly = new PropertyValue();
readOnly.Name = "ReadOnly";
readOnly.Value = Boolean.TRUE;
try
{
// TODO The following call fails. Not debugged why as it appears this extractor is not used any more.
document = loader.loadComponentFromURL(toUrl(inputFile), "_blank", 0,
new PropertyValue[]{hiddenOOo, readOnly});
}
catch (IllegalArgumentException illegalArgumentException)
{
throw new OfficeException("could not load document: "
+ inputFile.getName(), illegalArgumentException);
}
catch (ErrorCodeIOException errorCodeIOException)
{
throw new OfficeException("could not load document: "
+ inputFile.getName() + "; errorCode: "
+ errorCodeIOException.ErrCode, errorCodeIOException);
}
catch (IOException ioException)
{
throw new OfficeException("could not load document: "
+ inputFile.getName(), ioException);
}
if (document == null)
{
throw new OfficeException("could not load document: "
+ inputFile.getName());
}
XRefreshable refreshable = cast(XRefreshable.class, document);
if (refreshable != null)
{
refreshable.refresh();
}
XDocumentInfoSupplier docInfoSupplier = cast(XDocumentInfoSupplier.class, document);
XPropertySet propSet = cast(XPropertySet.class, docInfoSupplier.getDocumentInfo());
// The strings below are property names as used by OOo. They need upper-case
// initial letters.
Object author = getPropertyValueIfAvailable(propSet, "Author");
Object description = getPropertyValueIfAvailable(propSet, "Subject");
Object title = getPropertyValueIfAvailable(propSet, "Title");
metadata = new HashMap<String, Serializable>(3);
metadata.put(KEY_AUTHOR, author == null ? null : author.toString());
metadata.put(KEY_DESCRIPTION, description == null ? null : description.toString());
metadata.put(KEY_TITLE, title == null ? null : title.toString());
}
catch (OfficeException officeException)
{
throw officeException;
}
catch (Exception exception)
{
throw new OfficeException("conversion failed", exception);
}
finally
{
if (document != null)
{
XCloseable closeable = cast(XCloseable.class, document);
if (closeable != null)
{
try
{
closeable.close(true);
}
catch (CloseVetoException closeVetoException)
{
// whoever raised the veto should close the document
}
}
else
{
document.dispose();
}
}
}
}
/**
* OOo throws exceptions if we ask for properties that aren't there, so we'll tread carefully.
*
* @param propSet
* @param propertyName property name as used by the OOo API.
* @throws UnknownPropertyException
* @throws WrappedTargetException
*/
private Object getPropertyValueIfAvailable(XPropertySet propSet, String propertyName)
throws UnknownPropertyException, WrappedTargetException
{
if (propSet.getPropertySetInfo().hasPropertyByName(propertyName))
{
return propSet.getPropertyValue(propertyName);
}
else
{
return null;
}
}
}

View File

@@ -26,12 +26,8 @@
*/
package org.alfresco.transformer.executors;
import static org.springframework.http.HttpStatus.BAD_REQUEST;
import static org.springframework.http.HttpStatus.INTERNAL_SERVER_ERROR;
import java.io.File;
import java.io.IOException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.sun.star.task.ErrorCodeIOException;
import org.alfresco.transform.exceptions.TransformException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
@@ -42,7 +38,13 @@ import org.artofsolving.jodconverter.office.OfficeManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.sun.star.task.ErrorCodeIOException;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
import static org.springframework.http.HttpStatus.BAD_REQUEST;
import static org.springframework.http.HttpStatus.INTERNAL_SERVER_ERROR;
/**
* JavaExecutor implementation for running LibreOffice transformations. It loads the
@@ -50,6 +52,8 @@ import com.sun.star.task.ErrorCodeIOException;
*/
public class LibreOfficeJavaExecutor implements JavaExecutor
{
private static String ID = "libreoffice";
private static final Logger logger = LoggerFactory.getLogger(LibreOfficeJavaExecutor.class);
private static final int JODCONVERTER_TRANSFORMATION_ERROR_CODE = 3088;
@@ -58,7 +62,9 @@ public class LibreOfficeJavaExecutor implements JavaExecutor
public static final String LICENCE = "This transformer uses LibreOffice from The Document Foundation. See the license at https://www.libreoffice.org/download/license/ or in /libreoffice.txt";
private JodConverter jodconverter;
private final JodConverter jodconverter;
private final ObjectMapper jsonObjectMapper = new ObjectMapper();
public LibreOfficeJavaExecutor(String path)
{
@@ -76,7 +82,7 @@ public class LibreOfficeJavaExecutor implements JavaExecutor
final JodConverterSharedInstance jodconverter = new JodConverterSharedInstance();
jodconverter.setOfficeHome(LIBREOFFICE_HOME); // jodconverter.officeHome
jodconverter.setOfficeHome(LIBREOFFICE_HOME); // jodconverter.officeHome
jodconverter.setMaxTasksPerProcess("200"); // jodconverter.maxTasksPerProcess
jodconverter.setTaskExecutionTimeout(timeout); // jodconverter.maxTaskExecutionTimeout
jodconverter.setTaskQueueTimeout(timeout); // jodconverter.taskQueueTimeout
@@ -89,6 +95,19 @@ public class LibreOfficeJavaExecutor implements JavaExecutor
return jodconverter;
}
/**
 * @return the identifier of this transformer, taken from the class's {@code ID} field.
 */
@Override
public String getTransformerId()
{
return ID;
}
/**
 * Transforms {@code sourceFile} into {@code targetFile} by delegating to
 * {@code call(File, File, String...)} with no extra arguments.
 *
 * The transform name, mimetypes and transform options are accepted to satisfy the
 * overridden signature but are not read by this implementation.
 *
 * @param sourceFile the file to transform.
 * @param targetFile the file to write the result to.
 */
@Override
public void transform(String transformName, String sourceMimetype, String targetMimetype, Map<String, String> transformOptions,
File sourceFile, File targetFile)
{
// Only the two files are forwarded; the remaining parameters are unused here.
call(sourceFile, targetFile);
}
@Override
public void call(File sourceFile, File targetFile, String... args)
{
@@ -147,7 +166,7 @@ public class LibreOfficeJavaExecutor implements JavaExecutor
PDPage pdfPage = new PDPage();
try (PDDocument pdfDoc = new PDDocument();
PDPageContentStream contentStream = new PDPageContentStream(pdfDoc, pdfPage))
PDPageContentStream ignore = new PDPageContentStream(pdfDoc, pdfPage))
{
// Even though, we want an empty PDF, some libs (e.g. PDFRenderer) object to PDFs
// that have literally nothing in them. So we'll put a content stream in it.
@@ -162,4 +181,48 @@ public class LibreOfficeJavaExecutor implements JavaExecutor
"Error creating empty PDF file", iox);
}
}
/**
 * Runs a {@link LibreOfficeExtractMetadataTask} for {@code sourceFile} on the office manager
 * of this executor's JodConverter and hands the resulting metadata map to
 * {@code writeMetadataIntoTargetFile} for {@code targetFile}. When debug logging is enabled
 * each extracted entry is logged as {@code key=value}.
 *
 * The transform name, mimetypes and transform options are not read by this implementation.
 *
 * @param sourceFile the document to extract metadata from.
 * @param targetFile the file that receives the extracted metadata.
 * @throws TransformException with a BAD_REQUEST status if the office task fails with an
 *                            {@link OfficeException}.
 * @deprecated The JodConverterMetadataExtracter has not been in use since 6.0.1.
 * This code exists in case there are custom implementations, that need to be converted to T-Engines.
 * It is simply a copy and paste from the content repository and has received limited testing.
 */
@Override
public void extractMetadata(String transformName, String sourceMimetype, String targetMimetype,
                            Map<String, String> transformOptions,
                            File sourceFile, File targetFile)
{
    // Obtained outside the try block so that only failures of the task itself are wrapped.
    final OfficeManager manager = jodconverter.getOfficeManager();
    final LibreOfficeExtractMetadataTask task = new LibreOfficeExtractMetadataTask(sourceFile);

    try
    {
        manager.execute(task);
    }
    catch (OfficeException officeException)
    {
        final String message = "LibreOffice metadata extract failed: \n" +
                " from file: " + sourceFile;
        throw new TransformException(BAD_REQUEST.value(), message, officeException);
    }

    final Map<String, Serializable> extracted = task.getMetadata();
    if (logger.isDebugEnabled())
    {
        for (Map.Entry<String, Serializable> entry : extracted.entrySet())
        {
            logger.debug(entry.getKey()+"="+entry.getValue());
        }
    }

    writeMetadataIntoTargetFile(targetFile, extracted);
}
/**
 * Writes the metadata map to the target file using this executor's {@code jsonObjectMapper}.
 *
 * @param targetFile the file to write to.
 * @param results    the metadata to write.
 * @throws TransformException with an INTERNAL_SERVER_ERROR status if the write fails with an
 *                            {@link IOException}.
 */
private void writeMetadataIntoTargetFile(File targetFile, Map<String, Serializable> results)
{
    try
    {
        jsonObjectMapper.writeValue(targetFile, results);
    }
    catch (IOException ioException)
    {
        final int status = INTERNAL_SERVER_ERROR.value();
        throw new TransformException(status, "Failed to write metadata to targetFile", ioException);
    }
}
}