ATS-671: Split engines into fat & skinny modules (ATS-674) (#192)

Each transform engine project has been separated into 2 modules so that an executable and non-executable jar can be created. 
Modules have been renamed such that *docker* has been removed from the artifactIds and project names.

Co-authored-by: Erik Knizat <erik.knizat@alfresco.com>
Co-authored-by: David Edwards <david.edwards@alfresco.com>
This commit is contained in:
eknizat
2020-03-27 13:45:15 +00:00
committed by GitHub
parent 46b2e6df5b
commit 3bed6930bf
215 changed files with 539 additions and 157 deletions

View File

@@ -0,0 +1,9 @@
### Licenses
* htmlparser http://htmlparser.sourceforge.net/license.html
* commons-compress http://jakarta.apache.org/commons/
* pdfbox-tools http://pdfbox.apache.org/
* poi-ooxml http://poi.apache.org/
* commons-compress, PDFBox and poi-ooxml are from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0 or the
[Apache 2.0.txt](https://github.com/Alfresco/acs-community-packaging/blob/master/distribution/src/main/resources/licenses/3rd-party/Apache%202.0.txt)
file placed in the root directory of the docker image.

View File

@@ -0,0 +1,91 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<artifactId>alfresco-transform-misc</artifactId>
<name>Alfresco Miscellaneous Transformers</name>
<packaging>jar</packaging>
<parent>
<artifactId>alfresco-transform-core</artifactId>
<groupId>org.alfresco</groupId>
<version>2.2.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<dependencies>
    <dependency>
        <groupId>org.alfresco</groupId>
        <artifactId>alfresco-transformer-base</artifactId>
        <version>${project.version}</version>
    </dependency>
    <dependency>
        <groupId>org.dom4j</groupId>
        <artifactId>dom4j</artifactId>
    </dependency>
    <!-- HtmlParserContentTransformer -->
    <dependency>
        <groupId>org.htmlparser</groupId>
        <artifactId>htmlparser</artifactId>
        <version>2.1</version>
    </dependency>
    <!-- AppleIWorksContentTransformer -->
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-compress</artifactId>
    </dependency>
    <!-- TextToPdfContentTransformer -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox-tools</artifactId>
        <version>2.0.19</version>
    </dependency>
    <!-- OOXMLThumbnailContentTransformer -->
    <!-- poi-ooxml must be declared only once per POM: Maven warns on duplicate
         groupId:artifactId entries and silently lets the last one win.
         The version comes from the dependency.poi.version property
         (assumed to be defined in the parent POM - confirm). -->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>${dependency.poi.version}</version>
    </dependency>
    <!-- EMLTransformer -->
    <dependency>
        <groupId>com.sun.mail</groupId>
        <artifactId>javax.mail</artifactId>
        <version>1.6.2</version>
    </dependency>
    <!-- Test -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.13</version>
        <scope>test</scope>
    </dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>license-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-failsafe-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,116 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.transformers;
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_IMAGE_JPEG;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.List;
import java.util.Map;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.ImmutableList;
/**
 * Produces thumbnail/preview images from Apple iWorks documents by pulling the preview
 * that iWorks itself embeds in the (zip based) document.
 * <p>
 * Only iWorks 2013/14 files are supported. Support for iWorks 2008/9 has been dropped because both
 * generations cannot be supported at once: the newer format does not contain a PDF. If this
 * transformer claimed PDF support, Share would wrongly assume every iWorks file can be converted
 * to PDF and a preview would only ever be produced for the older format. Both formats share the
 * same mimetype.
 * </p>
 *
 * <p>
 * This code is based on a class of the same name originally implemented in alfresco-repository.
 * </p>
 *
 * @author Neil Mc Erlean
 * @author eknizat
 * @since 4.0
 */
public class AppleIWorksContentTransformer implements SelectableTransformer
{
    private static final Logger logger = LoggerFactory.getLogger(
        AppleIWorksContentTransformer.class);

    // The zip entry names Apple uses for embedded previews have changed between iWorks releases.

    /** Entry holding the embedded PDF preview (iWorks 2008/9). */
    private static final List<String> PDF_PATHS = ImmutableList.of(
        "QuickLook/Preview.pdf");

    /**
     * Entries holding an embedded JPEG preview:
     * {@code QuickLook/Thumbnail.jpg} (iWorks 2008/9) and {@code preview.jpg} (iWorks 2013/14, 720 x 552).
     * The best quality image is used; the smaller alternatives are
     * {@code preview-web.jpg} (225 x 173) and {@code preview-micro.jpg} (53 x 41).
     */
    private static final List<String> JPG_PATHS = ImmutableList.of(
        "QuickLook/Thumbnail.jpg",
        "preview.jpg");

    /**
     * Copies the embedded preview matching the requested target mimetype into {@code targetFile}.
     *
     * @param sourceFile     the iWorks document (a zip archive)
     * @param targetFile     the file that receives the extracted preview; replaced if it exists
     * @param sourceMimetype mimetype of the source, used for logging and error messages
     * @param targetMimetype requested mimetype; JPEG selects the JPEG entries, anything else the PDF entry
     * @param parameters     not used by this transformer
     * @throws RuntimeException if the archive cannot be read or does not contain a suitable preview
     */
    @Override
    public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
        final String targetMimetype, final Map<String, String> parameters)
    {
        logger.debug("Performing IWorks to jpeg transform with sourceMimetype={} targetMimetype={}",
            sourceMimetype, targetMimetype);

        // An iWorks file is a zip (or package) file. Anything else ends up as a ZipException,
        // which is an IOException and is therefore handled by the catch clause below.
        try (ZipArchiveInputStream archive = new ZipArchiveInputStream(
            new BufferedInputStream(new FileInputStream(sourceFile))))
        {
            final List<String> candidates =
                MIMETYPE_IMAGE_JPEG.equals(targetMimetype) ? JPG_PATHS : PDF_PATHS;

            if (!copyFirstMatchingEntry(archive, candidates, targetFile))
            {
                throw new RuntimeException(
                    "The source " + sourceMimetype + " file did not contain a " + targetMimetype + " preview");
            }
        }
        catch (IOException e)
        {
            throw new RuntimeException(
                "Unable to transform " + sourceMimetype + " file. It should have been a zip format file.",
                e);
        }
    }

    /**
     * Walks the archive entries in order and copies the first one whose name is listed in
     * {@code candidates} to {@code targetFile}.
     *
     * @return {@code true} if an entry was copied, {@code false} if no entry matched
     */
    private static boolean copyFirstMatchingEntry(final ZipArchiveInputStream archive,
        final List<String> candidates, final File targetFile) throws IOException
    {
        for (ZipArchiveEntry current = archive.getNextZipEntry();
             current != null;
             current = archive.getNextZipEntry())
        {
            if (candidates.contains(current.getName()))
            {
                // The stream is positioned on the current entry, so this copies just that entry.
                Files.copy(archive, targetFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
                return true;
            }
        }
        return false;
    }
}

View File

@@ -0,0 +1,233 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.transformers;
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_HTML;
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_MULTIPART_ALTERNATIVE;
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_TEXT_PLAIN;
import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Map;
import java.util.Properties;
import javax.mail.MessagingException;
import javax.mail.Multipart;
import javax.mail.Part;
import javax.mail.Session;
import javax.mail.internet.MimeMessage;
import org.alfresco.transformer.fs.FileManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Uses javax.mail.MimeMessage to generate plain text versions of RFC822 email
* messages. Searches for all text content parts, and returns them. Any
* attachments are ignored. TIKA Note - could be replaced with the Tika email
* parser. Would require a recursing parser to be specified, but not the full
* Auto one (we don't want attachments), just one containing text and html
* related parsers.
*
* <p>
* This code is based on a class of the same name originally implemented in alfresco-repository.
* </p>
*/
public class EMLTransformer implements SelectableTransformer
{
private static final Logger logger = LoggerFactory.getLogger(EMLTransformer.class);
private static final String CHARSET = "charset";
private static final String DEFAULT_ENCODING = "UTF-8";
@Override
public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
final String targetMimetype, final Map<String, String> parameters) throws Exception
{
logger.debug("Performing RFC822 to text transform.");
// Use try with resource
try (InputStream contentInputStream = new BufferedInputStream(
new FileInputStream(sourceFile));
Writer bufferedFileWriter = new BufferedWriter(new FileWriter(targetFile)))
{
MimeMessage mimeMessage = new MimeMessage(Session.getDefaultInstance(new Properties()),
contentInputStream);
final StringBuilder sb = new StringBuilder();
Object content = mimeMessage.getContent();
if (content instanceof Multipart)
{
processMultiPart((Multipart) content, sb);
}
else
{
sb.append(content.toString());
}
bufferedFileWriter.write(sb.toString());
}
}
/**
* Find "text" parts of message recursively and appends it to sb StringBuilder
*
* @param multipart Multipart to process
* @param sb StringBuilder
* @throws MessagingException
* @throws IOException
*/
private void processMultiPart(Multipart multipart, StringBuilder sb) throws MessagingException,
IOException
{
boolean isAlternativeMultipart = multipart.getContentType().contains(
MIMETYPE_MULTIPART_ALTERNATIVE);
if (isAlternativeMultipart)
{
processAlternativeMultipart(multipart, sb);
}
else
{
for (int i = 0, n = multipart.getCount(); i < n; i++)
{
Part part = multipart.getBodyPart(i);
if (part.getContent() instanceof Multipart)
{
processMultiPart((Multipart) part.getContent(), sb);
}
else
{
processPart(part, sb);
}
}
}
}
/**
* Finds the suitable part from an multipart/alternative and appends it's text content to StringBuilder sb
*
* @param multipart
* @param sb
* @throws IOException
* @throws MessagingException
*/
private void processAlternativeMultipart(Multipart multipart, StringBuilder sb) throws
IOException, MessagingException
{
Part partToUse = null;
for (int i = 0, n = multipart.getCount(); i < n; i++)
{
Part part = multipart.getBodyPart(i);
if (part.getContentType().contains(MIMETYPE_TEXT_PLAIN))
{
partToUse = part;
break;
}
else if (part.getContentType().contains(MIMETYPE_HTML))
{
partToUse = part;
}
else if (part.getContentType().contains(MIMETYPE_MULTIPART_ALTERNATIVE))
{
if (part.getContent() instanceof Multipart)
{
processAlternativeMultipart((Multipart) part.getContent(), sb);
}
}
}
if (partToUse != null)
{
processPart(partToUse, sb);
}
}
/**
* Finds text on a given mail part. Accepted parts types are text/html and text/plain.
* Attachments are ignored
*
* @param part
* @param sb
* @throws IOException
* @throws MessagingException
*/
private void processPart(Part part, StringBuilder sb) throws IOException, MessagingException
{
boolean isAttachment = Part.ATTACHMENT.equalsIgnoreCase(part.getDisposition());
if (isAttachment)
{
return;
}
if (part.getContentType().contains(MIMETYPE_TEXT_PLAIN))
{
sb.append(part.getContent().toString());
}
else if (part.getContentType().contains(MIMETYPE_HTML))
{
String mailPartContent = part.getContent().toString();
//create a temporary html file with same mail part content and encoding
File tempHtmlFile = FileManager.TempFileProvider.createTempFile("EMLTransformer_",
".html");
String encoding = getMailPartContentEncoding(part);
try (OutputStreamWriter osWriter = new OutputStreamWriter(
new FileOutputStream(tempHtmlFile), encoding))
{
osWriter.write(mailPartContent);
}
//transform html file's content to plain text
HtmlParserContentTransformer.EncodingAwareStringBean extractor = new HtmlParserContentTransformer.EncodingAwareStringBean();
extractor.setCollapse(false);
extractor.setLinks(false);
extractor.setReplaceNonBreakingSpaces(false);
extractor.setURL(tempHtmlFile, encoding);
sb.append(extractor.getStrings());
tempHtmlFile.delete();
}
}
private String getMailPartContentEncoding(Part part) throws MessagingException
{
String encoding = DEFAULT_ENCODING;
String contentType = part.getContentType();
int startIndex = contentType.indexOf(CHARSET);
if (startIndex > 0)
{
encoding = contentType.substring(startIndex + CHARSET.length() + 1)
.replaceAll("\"", "");
}
return encoding;
}
}

View File

@@ -0,0 +1,190 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.transformers;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.Map;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.util.ParserException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Content transformer that extracts plain text from HTML by delegating to the
 * HTML Parser library.
 *
 * <p>
 * This code is based on a class of the same name originally implemented in alfresco-repository.
 * </p>
 *
 * <p>
 * Since HTML Parser was updated from v1.6 to v2.1, META tags
 * defining an encoding for the content via http-equiv=Content-Type
 * will ONLY be respected if the encoding of the content item
 * itself is set to ISO-8859-1.
 * </p>
 *
 * <p>
 * Tika Note - could be converted to use the Tika HTML parser,
 * but we'd potentially need a custom text handler to replicate
 * the current settings around links and non-breaking spaces.
 * </p>
 *
 * @author Derek Hulley
 * @author eknizat
 * @see <a href="http://htmlparser.sourceforge.net/">http://htmlparser.sourceforge.net</a>
 * @see org.htmlparser.beans.StringBean
 * @see <a href="http://sourceforge.net/tracker/?func=detail&aid=1644504&group_id=24399&atid=381401">HTML Parser</a>
 */
public class HtmlParserContentTransformer implements SelectableTransformer
{
    private static final Logger logger = LoggerFactory.getLogger(
        HtmlParserContentTransformer.class);

    /**
     * Extracts the text of the HTML {@code sourceFile} and writes it to {@code targetFile}.
     *
     * @param sourceFile     the HTML document
     * @param targetFile     the file that receives the extracted text
     * @param sourceMimetype not used by this transformer
     * @param targetMimetype not used by this transformer
     * @param parameters     transform options; {@code sourceEncoding} is read from it
     * @throws IllegalArgumentException if the source encoding is not a valid or supported charset
     * @throws Exception                if the target file cannot be written
     */
    @Override
    public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
        final String targetMimetype, final Map<String, String> parameters) throws Exception
    {
        final String sourceEncoding = parameters.get(SOURCE_ENCODING);
        checkEncodingParameter(sourceEncoding, SOURCE_ENCODING);

        if (logger.isDebugEnabled())
        {
            logger.debug("Performing HTML to text transform with sourceEncoding=" + sourceEncoding);
        }

        // The text is extracted completely before the target file is opened.
        final String extracted = extractText(sourceFile, sourceEncoding);

        try (Writer out = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(targetFile))))
        {
            out.write(extracted);
        }
    }

    /**
     * Runs the HTML Parser string extractor over {@code htmlFile} with whitespace collapsing,
     * link output and non-breaking-space replacement all switched off.
     */
    private static String extractText(final File htmlFile, final String encoding)
    {
        final EncodingAwareStringBean bean = new EncodingAwareStringBean();
        bean.setCollapse(false);
        bean.setLinks(false);
        bean.setReplaceNonBreakingSpaces(false);
        bean.setURL(htmlFile, encoding);
        return bean.getStrings();
    }

    /**
     * Rejects an encoding name that is syntactically illegal or unknown to the JVM.
     * A {@code null} encoding is accepted.
     *
     * @throws IllegalArgumentException if the encoding is invalid or unsupported
     */
    private void checkEncodingParameter(String encoding, String parameterName)
    {
        if (encoding == null)
        {
            return;
        }

        final boolean supported;
        try
        {
            supported = Charset.isSupported(encoding);
        }
        catch (IllegalCharsetNameException e)
        {
            throw new IllegalArgumentException(
                parameterName + "=" + encoding + " is not a valid encoding.");
        }

        if (!supported)
        {
            throw new IllegalArgumentException(
                parameterName + "=" + encoding + " is not supported by the JVM.");
        }
    }

    /**
     * <p>
     * This code is based on a class of the same name, originally implemented in alfresco-repository.
     * </p>
     *
     * A {@link StringBean} variant that lets the caller dictate the encoding used by the
     * underlying HTML Parser. StringBean offers no easy hook for this, so part of its
     * URL handling is duplicated here. This makes it possible to handle HTML files whose
     * encoding is specified against the content property (rather than in the
     * HTML Head Meta) correctly, see ALF-10466 for details.
     */
    public static class EncodingAwareStringBean extends StringBean
    {
        private static final long serialVersionUID = -9033414360428669553L;

        /**
         * Sets the File to extract strings from, and the encoding
         * it's in (if known to Alfresco). Nothing happens when the file is the one
         * already set on this bean.
         *
         * @param file     The File that text should be fetched from.
         * @param encoding The encoding of the input, or {@code null} to leave the parser's encoding alone
         */
        public void setURL(File file, String encoding)
        {
            final String oldUrl = getURL();
            final String filePath = file.getAbsolutePath();
            if (filePath.equals(oldUrl))
            {
                // Same source as before: keep the current state.
                return;
            }

            try
            {
                // Remember the current connection so the property change can report old and new.
                final URLConnection oldConnection = getConnection();

                if (mParser != null)
                {
                    mParser.setURL(filePath);
                }
                else
                {
                    mParser = new Parser(filePath);
                }

                if (encoding != null)
                {
                    mParser.setEncoding(encoding);
                }

                mPropertySupport.firePropertyChange(StringBean.PROP_URL_PROPERTY, oldUrl,
                    getURL());
                mPropertySupport.firePropertyChange(StringBean.PROP_CONNECTION_PROPERTY,
                    oldConnection, mParser.getConnection());
                setStrings();
            }
            catch (ParserException parseFailure)
            {
                updateStrings(parseFailure.toString());
            }
        }

        /**
         * @return the encoding currently used by the underlying parser
         */
        public String getEncoding()
        {
            return mParser.getEncoding();
        }
    }
}

View File

@@ -0,0 +1,130 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.transformers;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.Map;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Extracts out Thumbnail JPEGs from OOXML files for thumbnailing and previewing.
* This transformer will only work for OOXML files where thumbnailing was enabled,
* which isn't on by default on Windows, but is more common on Mac.
*
* <p>
* This code is based on a class of the same name originally implemented in alfresco-repository.
* </p>
*
* @author Nick Burch
* @author eknizat
*/
public class OOXMLThumbnailContentTransformer implements SelectableTransformer
{
private static final Logger logger = LoggerFactory.getLogger(
OOXMLThumbnailContentTransformer.class);
@Override
public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
final String targetMimetype, final Map<String, String> parameters) throws Exception
{
if (logger.isDebugEnabled())
{
logger.debug("Performing OOXML to jpeg transform with sourceMimetype=" + sourceMimetype
+ " targetMimetype=" + targetMimetype);
}
try (OPCPackage pkg = OPCPackage.open(sourceFile.getPath()))
{
// Does it have a thumbnail?
PackageRelationshipCollection rels = pkg.getRelationshipsByType(
PackageRelationshipTypes.THUMBNAIL);
if (rels.size() > 0)
{
// Get the thumbnail part
PackageRelationship tRel = rels.getRelationship(0);
PackagePart tPart = pkg.getPart(tRel);
// Write it to the target
InputStream tStream = tPart.getInputStream();
Files.copy(tStream, targetFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
tStream.close();
}
else
{
logger.debug("No thumbnail present in file.");
throw new Exception(
"No thumbnail present in file, unable to generate " + targetMimetype);
}
}
catch (IOException e)
{
throw new RuntimeException("Unable to transform file.", e);
}
}
/*
// TODO Add this back to engine_config.json when the transformer is fixed for java 11
{
"transformerName": "ooxmlThumbnail",
"supportedSourceAndTargetList": [
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.ms-word.document.macroenabled.12", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.template", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.ms-word.template.macroenabled.12", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.ms-powerpoint.presentation.macroenabled.12", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.presentationml.slideshow", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.ms-powerpoint.slideshow.macroenabled.12", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.presentationml.template", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.ms-powerpoint.template.macroenabled.12", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.ms-powerpoint.addin.macroenabled.12", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.presentationml.slide", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.ms-powerpoint.slide.macroenabled.12", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.template", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.ms-excel.sheet.macroenabled.12", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.ms-excel.template.macroenabled.12", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.ms-excel.addin.macroenabled.12", "targetMediaType": "image/jpeg"},
{"sourceMediaType": "application/vnd.ms-excel.sheet.binary.macroenabled.12", "targetMediaType": "image/jpeg"}
],
"transformOptions": [
]
}
*/
}

View File

@@ -0,0 +1,52 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.transformers;
import java.io.File;
import java.util.Map;
/**
* Implemented by transformers used by {@link SelectingTransformer}.
* <p>
* Each implementation reads a source file and writes the transformed result to a target file.
* </p>
*
* @author eknizat
*/
public interface SelectableTransformer
{
// Key under which implementations look up the encoding of the source in the parameters map.
String SOURCE_ENCODING = "sourceEncoding";
// Key under which implementations look up the encoding requested for the target in the parameters map.
String TARGET_ENCODING = "targetEncoding";
/**
* Implementation of the actual transformation.
*
* @param sourceFile the file to transform from
* @param targetFile the file to write the result of the transformation to
* @param sourceMimetype the mimetype of the source file
* @param targetMimetype the mimetype requested for the target file
* @param parameters transform options keyed by name, e.g. {@link #SOURCE_ENCODING}
* @throws Exception if the transformation fails
*/
void transform(File sourceFile, File targetFile, String sourceMimetype,
String targetMimetype, Map<String, String> parameters) throws Exception;
}

View File

@@ -0,0 +1,123 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.transformers;
import static org.springframework.http.HttpStatus.BAD_REQUEST;
import static org.springframework.http.HttpStatus.INTERNAL_SERVER_ERROR;
import java.io.File;
import java.util.Map;
import java.util.StringJoiner;
import org.alfresco.transform.exceptions.TransformException;
import org.alfresco.transformer.logging.LogEntry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.ImmutableMap;
/**
 * The SelectingTransformer selects a registered {@link SelectableTransformer}
 * and delegates the transformation to its implementation.
 *
 * @author eknizat
 */
public class SelectingTransformer
{
    private static final Logger logger = LoggerFactory.getLogger(SelectingTransformer.class);

    // Registered transformers, keyed by transformer name.
    // NOTE(review): the key "ooXmlThumbnail" differs in case from the "ooxmlThumbnail" name used in
    // the commented-out engine config of OOXMLThumbnailContentTransformer - confirm which is intended.
    private final Map<String, SelectableTransformer> transformers = ImmutableMap
        .<String, SelectableTransformer>builder()
        .put("appleIWorks", new AppleIWorksContentTransformer())
        .put("html", new HtmlParserContentTransformer())
        .put("string", new StringExtractingContentTransformer())
        .put("textToPdf", new TextToPdfContentTransformer())
        .put("rfc822", new EMLTransformer())
        .put("ooXmlThumbnail", new OOXMLThumbnailContentTransformer())
        .build();

    /**
     * Performs a transform using the transformer registered under the given name.
     *
     * @param transform      the name of the transformer
     * @param sourceFile     File to transform from
     * @param targetFile     File to transform to
     * @param sourceMimetype Mimetype of the source file
     * @param targetMimetype Mimetype requested for the target file
     * @param parameters     transform options; {@code null} is treated as no options
     * @throws TransformException with a bad request status if the transformer rejects its arguments
     *                            with an {@link IllegalArgumentException}; with an internal server
     *                            error status if no transformer is registered under the given name,
     *                            the transformer fails, or no (non-empty) target file was produced
     */
    public void transform(String transform, File sourceFile, File targetFile, String sourceMimetype,
        String targetMimetype, Map<String, String> parameters) throws TransformException
    {
        // Transformers and logOptions dereference the map, so normalise a missing one up front.
        final Map<String, String> options =
            parameters == null ? ImmutableMap.<String, String>of() : parameters;

        try
        {
            logOptions(sourceFile, targetFile, options);

            final SelectableTransformer transformer = transformers.get(transform);
            if (transformer == null)
            {
                // Report the actual problem instead of failing later with a bare NullPointerException.
                throw new IllegalStateException(
                    "No transformer is registered with the name: " + transform);
            }
            transformer.transform(sourceFile, targetFile, sourceMimetype, targetMimetype,
                options);
        }
        catch (IllegalArgumentException e)
        {
            // Only the message is carried by the TransformException, so keep the stack trace in the log.
            logger.debug("Transform {} rejected its arguments", transform, e);
            throw new TransformException(BAD_REQUEST.value(), getMessage(e));
        }
        catch (Exception e)
        {
            logger.debug("Transform {} failed", transform, e);
            throw new TransformException(INTERNAL_SERVER_ERROR.value(), getMessage(e));
        }

        if (!targetFile.exists())
        {
            throw new TransformException(INTERNAL_SERVER_ERROR.value(),
                "Transformer failed to create an output file. Target file does not exist.");
        }
        if (sourceFile.length() > 0 && targetFile.length() == 0)
        {
            throw new TransformException(INTERNAL_SERVER_ERROR.value(),
                "Transformer failed to create an output file. Target file is empty but source file was not empty.");
        }
    }

    /**
     * Returns the message of the exception, or its simple class name when it has no message.
     */
    private static String getMessage(Exception e)
    {
        return e.getMessage() == null || e.getMessage().isEmpty() ? e.getClass().getSimpleName() : e.getMessage();
    }

    /**
     * Records the transform options and the source/target file extensions on the current
     * {@link LogEntry}, as a space separated list of {@code --name=value} items followed by the
     * two extensions.
     */
    private static void logOptions(File sourceFile, File targetFile, Map<String, String> parameters)
    {
        StringJoiner sj = new StringJoiner(" ");
        parameters.forEach((k, v) -> sj.add(
            "--" + k + "=" + v)); // keeping the existing style used in other T-Engines
        sj.add(getExtension(sourceFile));
        sj.add(getExtension(targetFile));
        LogEntry.setOptions(sj.toString());
    }

    /**
     * Returns the text after the last '.' of the file name, or "???" if the name contains no '.'.
     */
    private static String getExtension(File file)
    {
        final String name = file.getName();
        int i = name.lastIndexOf('.');
        return i == -1 ? "???" : name.substring(i + 1);
    }
}

View File

@@ -0,0 +1,155 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.transformers;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Converts any textual format to plain text.
 * <p>
 * The transformation is sensitive to the source and target string encodings: the source
 * file is decoded with the source encoding and the characters are re-encoded into the
 * target file with the target encoding.
 * </p>
 * <p>
 * This code is based on a class of the same name originally implemented in alfresco-repository.
 * </p>
 *
 * @author Derek Hulley
 * @author eknizat
 */
public class StringExtractingContentTransformer implements SelectableTransformer
{
    private static final Log logger = LogFactory.getLog(StringExtractingContentTransformer.class);

    /** Number of characters copied per read/write cycle. */
    private static final int COPY_BUFFER_SIZE = 8192;

    /**
     * Copies the characters of the source file to the target file, decoding with the
     * encoding given by the {@code SOURCE_ENCODING} parameter and encoding with the one
     * given by the {@code TARGET_ENCODING} parameter. When a parameter is absent the JVM
     * default charset is used for that side.
     * <p>
     * Both streams are closed when the method returns. A failure while flushing or closing
     * the target is propagated to the caller rather than only being logged, because it
     * means the target file is incomplete.
     *
     * @param sourceFile     the file to read
     * @param targetFile     the file to write
     * @param sourceMimetype not used by this transformer
     * @param targetMimetype not used by this transformer
     * @param parameters     transform options; only the two encoding parameters are read
     * @throws IllegalArgumentException if an encoding name is malformed or unsupported
     * @throws Exception                if reading, writing or closing a file fails
     */
    @Override
    public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
        final String targetMimetype, final Map<String, String> parameters) throws Exception
    {
        // SOURCE_ENCODING / TARGET_ENCODING are not declared in this class
        // (presumably inherited from SelectableTransformer).
        String sourceEncoding = parameters.get(SOURCE_ENCODING);
        String targetEncoding = parameters.get(TARGET_ENCODING);

        if (logger.isDebugEnabled())
        {
            logger.debug("Performing text to text transform with sourceEncoding=" + sourceEncoding
                         + " targetEncoding=" + targetEncoding);
        }

        // Validate both encodings before any file is opened or truncated.
        final Charset sourceCharset = resolveCharset(sourceEncoding, SOURCE_ENCODING);
        final Charset targetCharset = resolveCharset(targetEncoding, TARGET_ENCODING);

        try (Reader charReader = new BufferedReader(
                 new InputStreamReader(new FileInputStream(sourceFile), sourceCharset));
             Writer charWriter = new BufferedWriter(
                 new OutputStreamWriter(new FileOutputStream(targetFile), targetCharset)))
        {
            // copy from the one to the other
            final char[] buffer = new char[COPY_BUFFER_SIZE];
            int readCount;
            while ((readCount = charReader.read(buffer)) > -1)
            {
                charWriter.write(buffer, 0, readCount);
            }
        }
    }

    /**
     * Maps an optional encoding name to a charset.
     *
     * @param encoding      the encoding name, or {@code null} to select the JVM default charset
     * @param parameterName the name of the transform parameter, used in error messages
     * @return the charset to use
     * @throws IllegalArgumentException if the name is malformed or unsupported
     */
    private Charset resolveCharset(String encoding, String parameterName)
    {
        if (encoding == null)
        {
            return Charset.defaultCharset();
        }
        checkEncodingParameter(encoding, parameterName);
        return Charset.forName(encoding);
    }

    /**
     * Verifies that an encoding name is well formed and supported by the JVM.
     *
     * @param encoding      the encoding name to check
     * @param parameterName the name of the transform parameter, used in error messages
     * @throws IllegalArgumentException if the name is malformed or unsupported
     */
    private void checkEncodingParameter(String encoding, String parameterName)
    {
        try
        {
            if (!Charset.isSupported(encoding))
            {
                throw new IllegalArgumentException(
                    parameterName + "=" + encoding + " is not supported by the JVM.");
            }
        }
        catch (IllegalCharsetNameException e)
        {
            // Keep the original exception as the cause so the offending name is traceable.
            throw new IllegalArgumentException(
                parameterName + "=" + encoding + " is not a valid encoding.", e);
        }
    }
}

View File

@@ -0,0 +1,323 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.transformers;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.tools.TextToPDF;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Converts plain text to PDF, optionally limiting the number of pages produced.
 * <p>
 * This code is based on a class of the same name originally implemented in alfresco-repository.
 * </p>
 *
 * Makes use of the <a href="http://www.pdfbox.org/">PDFBox</a> library's <code>TextToPDF</code> utility.
 *
 * @author Derek Hulley
 * @author eknizat
 */
public class TextToPdfContentTransformer implements SelectableTransformer
{
    private static final Logger logger = LoggerFactory.getLogger(TextToPdfContentTransformer.class);

    /** Name of the transform parameter holding the maximum number of pages to generate. */
    public static final String PAGE_LIMIT = "pageLimit";

    private final PagedTextToPDF transformer;

    public TextToPdfContentTransformer()
    {
        transformer = new PagedTextToPDF();
    }

    /**
     * Selects the font used for the generated PDF by its base font name.
     *
     * @param fontName the base font name, looked up among the fonts registered by
     *                 {@code PagedTextToPDF}
     * @throws RuntimeException if the font cannot be set
     */
    public void setStandardFont(String fontName)
    {
        try
        {
            transformer.setFont(PagedTextToPDF.getStandardFont(fontName));
        }
        catch (Throwable e)
        {
            throw new RuntimeException(
                "Unable to set Standard Font for PDF generation: " + fontName, e);
        }
    }

    /**
     * Sets the font size used for the generated PDF.
     *
     * @param fontSize the font size
     * @throws RuntimeException if the size cannot be set; the underlying failure is the cause
     */
    public void setFontSize(int fontSize)
    {
        try
        {
            transformer.setFontSize(fontSize);
        }
        catch (Throwable e)
        {
            throw new RuntimeException(
                "Unable to set Font Size for PDF generation: " + fontSize, e);
        }
    }

    /**
     * Renders the text of the source file into a PDF written to the target file.
     *
     * @param sourceFile     the text file to read
     * @param targetFile     the PDF file to write
     * @param sourceMimetype not used by this transformer
     * @param targetMimetype not used by this transformer
     * @param parameters     transform options; the source encoding and {@link #PAGE_LIMIT}
     *                       are read
     * @throws IllegalArgumentException if the page limit is present but not an integer
     * @throws Exception                if reading the text or writing the PDF fails
     */
    @Override
    public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
        final String targetMimetype, final Map<String, String> parameters) throws Exception
    {
        // SOURCE_ENCODING is not declared in this class (presumably inherited from SelectableTransformer).
        String sourceEncoding = parameters.get(SOURCE_ENCODING);
        String stringPageLimit = parameters.get(PAGE_LIMIT);

        // A non-positive limit means "no limit".
        int pageLimit = -1;
        if (stringPageLimit != null)
        {
            pageLimit = parseInt(stringPageLimit, PAGE_LIMIT);
        }

        PDDocument pdf = null;
        try (InputStream is = new FileInputStream(sourceFile);
             Reader ir = new BufferedReader(buildReader(is, sourceEncoding));
             OutputStream os = new BufferedOutputStream(new FileOutputStream(targetFile)))
        {
            pdf = transformer.createPDFFromText(ir, pageLimit);
            pdf.save(os);
        }
        finally
        {
            if (pdf != null)
            {
                // Best effort: a failure to release the document must not hide the outcome of the transform.
                try
                {
                    pdf.close();
                }
                catch (Throwable e)
                {
                    logger.warn("Failed to close the generated PDF document", e);
                }
            }
        }
    }

    /**
     * Creates a reader over the stream using the requested encoding, falling back on the
     * system default encoding when none is given or the JVM cannot resolve the name.
     *
     * @param is       the stream to decode
     * @param encoding the encoding name, may be {@code null}
     * @return a reader over {@code is}
     */
    protected InputStreamReader buildReader(InputStream is, String encoding)
    {
        // If they gave an encoding, try to use it
        if (encoding != null)
        {
            Charset charset = null;
            try
            {
                charset = Charset.forName(encoding);
            }
            catch (Exception e)
            {
                logger.warn("JVM doesn't understand encoding '{}' when transforming text to pdf",
                    encoding);
            }
            if (charset != null)
            {
                logger.debug("Processing plain text in encoding {}", charset.displayName());
                return new InputStreamReader(is, charset);
            }
        }

        // Fall back on the system default
        logger.debug("Processing plain text using system default encoding");
        return new InputStreamReader(is);
    }

    /**
     * Variant of PDFBox's {@code TextToPDF} that stops adding pages once a page limit is reached.
     */
    private static class PagedTextToPDF extends TextToPDF
    {
        // REPO-1066: duplicating the following lines from org.apache.pdfbox.tools.TextToPDF because they made them private
        // before the upgrade to pdfbox 2.0.8, in pdfbox 1.8, this piece of code was public in org.apache.pdfbox.pdmodel.font.PDType1Font
        private static final Map<String, PDType1Font> STANDARD_14 = new HashMap<>();

        static
        {
            STANDARD_14.put(PDType1Font.TIMES_ROMAN.getBaseFont(), PDType1Font.TIMES_ROMAN);
            STANDARD_14.put(PDType1Font.TIMES_BOLD.getBaseFont(), PDType1Font.TIMES_BOLD);
            STANDARD_14.put(PDType1Font.TIMES_ITALIC.getBaseFont(), PDType1Font.TIMES_ITALIC);
            STANDARD_14.put(PDType1Font.TIMES_BOLD_ITALIC.getBaseFont(),
                PDType1Font.TIMES_BOLD_ITALIC);
            STANDARD_14.put(PDType1Font.HELVETICA.getBaseFont(), PDType1Font.HELVETICA);
            STANDARD_14.put(PDType1Font.HELVETICA_BOLD.getBaseFont(), PDType1Font.HELVETICA_BOLD);
            STANDARD_14.put(PDType1Font.HELVETICA_OBLIQUE.getBaseFont(),
                PDType1Font.HELVETICA_OBLIQUE);
            STANDARD_14.put(PDType1Font.HELVETICA_BOLD_OBLIQUE.getBaseFont(),
                PDType1Font.HELVETICA_BOLD_OBLIQUE);
            STANDARD_14.put(PDType1Font.COURIER.getBaseFont(), PDType1Font.COURIER);
            STANDARD_14.put(PDType1Font.COURIER_BOLD.getBaseFont(), PDType1Font.COURIER_BOLD);
            STANDARD_14.put(PDType1Font.COURIER_OBLIQUE.getBaseFont(), PDType1Font.COURIER_OBLIQUE);
            STANDARD_14.put(PDType1Font.COURIER_BOLD_OBLIQUE.getBaseFont(),
                PDType1Font.COURIER_BOLD_OBLIQUE);
            STANDARD_14.put(PDType1Font.SYMBOL.getBaseFont(), PDType1Font.SYMBOL);
            STANDARD_14.put(PDType1Font.ZAPF_DINGBATS.getBaseFont(), PDType1Font.ZAPF_DINGBATS);
        }

        /**
         * Looks up one of the registered fonts by its base font name.
         *
         * @param name the base font name
         * @return the font, or {@code null} when no font is registered under that name
         */
        static PDType1Font getStandardFont(String name)
        {
            return STANDARD_14.get(name);
        }
        //duplicating until here

        /**
         * Lays the text out into a new PDF document, word-wrapping each input line to the
         * page width and starting a new page whenever the current one is full.
         * <p>
         * The following code is based on the code in TextToPDF with the addition of
         * checks for page limits. Once {@code pageLimit} pages exist, the remaining text
         * is dropped without an error.
         * <p>
         * The calling code must close the PDDocument once finished with it. If this method
         * fails, the partially built document is closed before the exception is rethrown.
         *
         * @param text      the text to render
         * @param pageLimit the maximum number of pages; values of zero or less disable the limit
         * @return the generated document
         * @throws IOException if reading the text or writing page content fails
         */
        public PDDocument createPDFFromText(Reader text, int pageLimit)
            throws IOException
        {
            PDDocument doc = null;
            int pageCount = 0;
            try
            {
                final int margin = 40;
                float height = getFont().getFontDescriptor().getFontBoundingBox().getHeight() / 1000;

                //calculate font height and increase by 5 percent.
                height = height * getFontSize() * 1.05f;
                doc = new PDDocument();
                BufferedReader data = new BufferedReader(text);
                String nextLine;
                PDPage page = new PDPage();
                PDPageContentStream contentStream = null;
                // Starting below the margin forces a page to be created for the first line drawn.
                float y = -1;
                float maxStringLength = page.getMediaBox().getWidth() - 2 * margin;

                // There is a special case of creating a PDF document from an empty string.
                boolean textIsEmpty = true;

                outer:
                while ((nextLine = data.readLine()) != null)
                {
                    // The input text is nonEmpty. New pages will be created and added
                    // to the PDF document as they are needed, depending on the length of
                    // the text.
                    textIsEmpty = false;

                    String[] lineWords = nextLine.trim().split(" ");
                    int lineIndex = 0;
                    while (lineIndex < lineWords.length)
                    {
                        // Accumulate words until the next one would make the line too wide.
                        final StringBuilder nextLineToDraw = new StringBuilder();
                        float lengthIfUsingNextWord = 0;
                        do
                        {
                            nextLineToDraw.append(lineWords[lineIndex]);
                            nextLineToDraw.append(" ");
                            lineIndex++;
                            if (lineIndex < lineWords.length)
                            {
                                String lineWithNextWord = nextLineToDraw.toString() + lineWords[lineIndex];
                                lengthIfUsingNextWord =
                                    (getFont().getStringWidth(
                                        lineWithNextWord) / 1000) * getFontSize();
                            }
                        }
                        while (lineIndex < lineWords.length &&
                               lengthIfUsingNextWord < maxStringLength);

                        if (y < margin)
                        {
                            if (pageLimit > 0)
                            {
                                if (pageCount >= pageLimit)
                                {
                                    // Page limit reached: stop, keeping the pages produced so far.
                                    break outer;
                                }
                                pageCount++;
                            }

                            // We have crossed the end-of-page boundary and need to extend the
                            // document by another page.
                            page = new PDPage();
                            doc.addPage(page);
                            if (contentStream != null)
                            {
                                contentStream.endText();
                                contentStream.close();
                            }
                            contentStream = new PDPageContentStream(doc, page);
                            contentStream.setFont(getFont(), getFontSize());
                            contentStream.beginText();
                            y = page.getMediaBox().getHeight() - margin + height;
                            contentStream.newLineAtOffset(margin, y);
                        }

                        if (contentStream == null)
                        {
                            throw new IOException("Error:Expected non-null content stream.");
                        }
                        contentStream.newLineAtOffset(0, -height);
                        y -= height;
                        contentStream.showText(nextLineToDraw.toString());
                    }
                }

                // If the input text was the empty string, then the above while loop will have short-circuited
                // and we will not have added any PDPages to the document.
                // So in order to make the resultant PDF document readable by Adobe Reader etc, we'll add an empty page.
                if (textIsEmpty)
                {
                    doc.addPage(page);
                }

                if (contentStream != null)
                {
                    contentStream.endText();
                    contentStream.close();
                }
            }
            catch (IOException | RuntimeException e)
            {
                // Release the partially built document on any failure, not just I/O errors,
                // and never let a close failure replace the original exception.
                if (doc != null)
                {
                    try
                    {
                        doc.close();
                    }
                    catch (IOException closeFailure)
                    {
                        e.addSuppressed(closeFailure);
                    }
                }
                throw e;
            }
            return doc;
        }
    }

    /**
     * Parses a transform parameter as an integer.
     *
     * @param s         the text to parse
     * @param paramName the parameter name, used in the error message
     * @return the parsed value
     * @throws IllegalArgumentException if {@code s} is not a valid integer
     */
    private int parseInt(String s, String paramName)
    {
        try
        {
            return Integer.parseInt(s);
        }
        catch (NumberFormatException e)
        {
            throw new IllegalArgumentException(paramName + " parameter must be an integer.", e);
        }
    }
}

View File

@@ -0,0 +1,162 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.transformers;
import static org.alfresco.transformer.transformers.StringExtractingContentTransformer.SOURCE_ENCODING;
import static org.junit.Assert.assertEquals;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.nio.file.Files;
import java.util.HashMap;
import java.util.Map;
import org.junit.Test;
public class HtmlParserContentTransformerTest
{
    private static final String SOURCE_MIMETYPE = "text/html";
    private static final String TARGET_MIMETYPE = "text/plain";

    HtmlParserContentTransformer transformer = new HtmlParserContentTransformer();

    /**
     * Checks that we correctly handle text in different encodings,
     * no matter if the encoding is specified on the Content Property
     * or in a meta tag within the HTML itself. (ALF-10466)
     *
     * On Windows, org.htmlparser.beans.StringBean.carriageReturn() appends a new system dependent new line
     * so we must be careful when checking the returned text
     */
    @Test
    public void testEncodingHandling() throws Exception
    {
        final String NEWLINE = System.getProperty("line.separator");
        final String TITLE = "Testing!";
        final String TEXT_P1 = "This is some text in English";
        final String TEXT_P2 = "This is more text in English";
        final String TEXT_P3 = "C'est en Fran\u00e7ais et Espa\u00f1ol";

        String partA = "<html><head><title>" + TITLE + "</title></head>" + NEWLINE;
        String partB = "<body><p>" + TEXT_P1 + "</p>" + NEWLINE +
                       "<p>" + TEXT_P2 + "</p>" + NEWLINE +
                       "<p>" + TEXT_P3 + "</p>" + NEWLINE;
        String partC = "</body></html>";

        final String expected = TITLE + NEWLINE + TEXT_P1 + NEWLINE + TEXT_P2 + NEWLINE + TEXT_P3 + NEWLINE;
        final String plainHtml = partA + partB + partC;

        // Content set to ISO 8859-1
        transformAndAssert(plainHtml, "ISO-8859-1", "ISO-8859-1", expected);

        // Content set to UTF-8
        transformAndAssert(plainHtml, "UTF-8", "UTF-8", expected);

        // Content set to UTF-16
        transformAndAssert(plainHtml, "UTF-16", "UTF-16", expected);

        // Note - since HTML Parser 2.0 META tags specifying the
        // document encoding will ONLY be respected if the original
        // content type was set to ISO-8859-1.
        //
        // This means there is now only one test which we can perform
        // to ensure that this now-limited overriding of the encoding
        // takes effect.

        // Content set to ISO 8859-1, meta set to UTF-8
        String htmlWithMeta = partA +
                              "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">" +
                              partB + partC;
        transformAndAssert(htmlWithMeta, "UTF-8", "ISO-8859-1", expected);

        // Note - we can't test UTF-16 with only a meta encoding,
        // because without that the parser won't know about the
        // 2 byte format so won't be able to identify the meta tag
    }

    /**
     * Writes the HTML to a temporary source file, transforms it and compares the UTF-8
     * content of the target file with the expected text. Both temporary files are removed
     * afterwards, whether or not the check passed.
     *
     * @param html             the HTML to transform
     * @param fileEncoding     the encoding used to write the source file
     * @param declaredEncoding the value passed as the source encoding parameter
     * @param expected         the expected plain text
     */
    private void transformAndAssert(String html, String fileEncoding, String declaredEncoding,
        String expected) throws Exception
    {
        File source = null;
        File target = null;
        try
        {
            source = File.createTempFile("AlfrescoTestSource_", ".html");
            writeToFile(source, html, fileEncoding);
            target = File.createTempFile("AlfrescoTestTarget_", ".txt");

            Map<String, String> options = new HashMap<>();
            options.put(SOURCE_ENCODING, declaredEncoding);

            transformer.transform(source, target, SOURCE_MIMETYPE, TARGET_MIMETYPE, options);

            assertEquals(expected, readFromFile(target, "UTF-8"));
        }
        finally
        {
            if (source != null && source.exists())
            {
                source.delete();
            }
            if (target != null && target.exists())
            {
                target.delete();
            }
        }
    }

    /** Writes the content to the file using the given encoding. */
    private void writeToFile(File file, String content, String encoding) throws Exception
    {
        try (OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(file), encoding))
        {
            out.append(content);
        }
    }

    /** Reads the whole file and decodes it with the given encoding. */
    private String readFromFile(File file, final String encoding) throws Exception
    {
        final byte[] bytes = Files.readAllBytes(file.toPath());
        return new String(bytes, encoding);
    }
}

View File

@@ -0,0 +1,148 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.transformers;
import static org.alfresco.transformer.transformers.TextToPdfContentTransformer.PAGE_LIMIT;
import static org.junit.Assert.assertEquals;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.util.HashMap;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.junit.Before;
import org.junit.Test;
/**
 * Checks that text is rendered into a PDF whose extracted text matches the input, with and
 * without a page limit.
 */
public class TextToPdfContentTransformerTest
{
    TextToPdfContentTransformer transformer = new TextToPdfContentTransformer();

    @Before
    public void setUp()
    {
        transformer.setStandardFont("Times-Roman");
        transformer.setFontSize(20);
    }

    @Test
    public void testUnlimitedPages() throws Exception
    {
        transformTextAndCheckPageLength(-1);
    }

    @Test
    public void testLimitedTo1Page() throws Exception
    {
        transformTextAndCheckPageLength(1);
    }

    @Test
    public void testLimitedTo2Pages() throws Exception
    {
        transformTextAndCheckPageLength(2);
    }

    @Test
    public void testLimitedTo50Pages() throws Exception
    {
        transformTextAndCheckPageLength(50);
    }

    /**
     * Generates more lines than fit in {@code pageLimit} pages and checks that only the
     * lines up to the limit come back out of the PDF.
     *
     * @param pageLimit the page limit to request; a non-positive value never matches the
     *                  cutoff below, so the whole text is expected back
     */
    private void transformTextAndCheckPageLength(int pageLimit) throws Exception
    {
        // NOTE(review): 32 is presumably the number of lines per page with the font set in setUp() - confirm.
        int pageLength = 32;
        int lines = (pageLength + 10) * ((pageLimit > 0) ? pageLimit : 1);
        StringBuilder sb = new StringBuilder();
        String checkText = null;
        int cutoff = pageLimit * pageLength;
        for (int i = 1; i <= lines; i++)
        {
            sb.append(i);
            sb.append(" I must not talk in class or feed my homework to my cat.\n");
            if (i == cutoff)
            {
                // Snapshot of the text expected to survive the page limit.
                checkText = sb.toString();
            }
        }
        sb.append("\nBart\n");
        String text = sb.toString();
        checkText = (checkText == null) ? clean(text) : clean(checkText);

        transformTextAndCheck(text, "UTF-8", checkText, String.valueOf(pageLimit));
    }

    /**
     * Transforms the text to PDF, extracts the text of the PDF again and compares it with
     * the expected text. The temporary files and the loaded document are released even
     * when the transform or the comparison fails.
     *
     * @param text      the text to transform
     * @param encoding  the encoding used to write the source file
     * @param checkText the expected text, already passed through {@link #clean(String)}
     * @param pageLimit the page limit parameter value
     */
    private void transformTextAndCheck(String text, String encoding, String checkText,
        String pageLimit) throws Exception
    {
        File sourceFile = null;
        File targetFile = null;
        try
        {
            // Get a reader for the text
            sourceFile = File.createTempFile("AlfrescoTestSource_", ".txt");
            writeToFile(sourceFile, text, encoding);

            // And a temp writer
            targetFile = File.createTempFile("AlfrescoTestTarget_", ".pdf");

            // Transform to PDF
            Map<String, String> parameters = new HashMap<>();
            parameters.put(PAGE_LIMIT, pageLimit);
            transformer.transform(sourceFile, targetFile, "text/plain", "application/pdf", parameters);

            // Read back in the PDF and check it
            StringWriter textWriter = new StringWriter();
            try (PDDocument doc = PDDocument.load(targetFile))
            {
                PDFTextStripper textStripper = new PDFTextStripper();
                textStripper.writeText(doc, textWriter);
            }

            String roundTrip = clean(textWriter.toString());

            assertEquals(
                "Incorrect text in PDF when starting from text in " + encoding,
                checkText, roundTrip
            );
        }
        finally
        {
            if (sourceFile != null)
            {
                sourceFile.delete();
            }
            if (targetFile != null)
            {
                targetFile.delete();
            }
        }
    }

    /**
     * Removes line breaks, and any whitespace directly preceding them, so that text can be
     * compared regardless of how it was wrapped.
     */
    private String clean(String text)
    {
        text = text.replaceAll("\\s+\\r", "");
        text = text.replaceAll("\\s+\\n", "");
        text = text.replaceAll("\\r", "");
        text = text.replaceAll("\\n", "");
        return text;
    }

    /** Writes the content to the file using the given encoding. */
    private void writeToFile(File file, String content, String encoding) throws Exception
    {
        try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
        {
            ow.append(content);
        }
    }
}