mirror of
https://github.com/Alfresco/alfresco-transform-core.git
synced 2025-08-07 17:48:35 +00:00
ATS-671: Split engines into fat & skinny modules (ATS-674) (#192)
Each transform engine project has been separated into 2 modules so that an executable and non-executable jar can be created. Modules have been renamed such that *docker* has been removed from the artifactIds and project names. Co-authored-by: Erik Knizat <erik.knizat@alfresco.com> Co-authored-by: David Edwards <david.edwards@alfresco.com>
This commit is contained in:
@@ -0,0 +1,9 @@
|
||||
### Licenses
|
||||
|
||||
* htmlparser http://htmlparser.sourceforge.net/license.html
|
||||
* commons-compress http://jakarta.apache.org/commons/
|
||||
* pdfbox-tools http://pdfbox.apache.org/
|
||||
* poi-ooxml http://poi.apache.org/
|
||||
* commons-compress, PDFBox and poi-ooxml are from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0 or the
|
||||
[Apache 2.0.txt](https://github.com/Alfresco/acs-community-packaging/blob/master/distribution/src/main/resources/licenses/3rd-party/Apache%202.0.txt)
|
||||
file placed in the root directory of the docker image.
|
91
alfresco-transform-misc/alfresco-transform-misc/pom.xml
Normal file
91
alfresco-transform-misc/alfresco-transform-misc/pom.xml
Normal file
@@ -0,0 +1,91 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

    <modelVersion>4.0.0</modelVersion>
    <artifactId>alfresco-transform-misc</artifactId>
    <name>Alfresco Miscellaneous Transformers</name>
    <packaging>jar</packaging>

    <parent>
        <artifactId>alfresco-transform-core</artifactId>
        <groupId>org.alfresco</groupId>
        <version>2.2.0-SNAPSHOT</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>

    <dependencies>
        <dependency>
            <groupId>org.alfresco</groupId>
            <artifactId>alfresco-transformer-base</artifactId>
            <version>${project.version}</version>
        </dependency>

        <dependency>
            <groupId>org.dom4j</groupId>
            <artifactId>dom4j</artifactId>
        </dependency>

        <!-- HtmlParserContentTransformer -->
        <dependency>
            <groupId>org.htmlparser</groupId>
            <artifactId>htmlparser</artifactId>
            <version>2.1</version>
        </dependency>

        <!-- AppleIWorksContentTransformer -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-compress</artifactId>
        </dependency>

        <!-- TextToPdfContentTransformer -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox-tools</artifactId>
            <version>2.0.19</version>
        </dependency>

        <!-- OOXMLThumbnailContentTransformer -->
        <!-- poi-ooxml is declared once only; its version is taken from the dependency.poi.version property. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>${dependency.poi.version}</version>
        </dependency>

        <!-- EMLTransformer -->
        <dependency>
            <groupId>com.sun.mail</groupId>
            <artifactId>javax.mail</artifactId>
            <version>1.6.2</version>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.13</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>license-maven-plugin</artifactId>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-failsafe-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
|
@@ -0,0 +1,116 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_IMAGE_JPEG;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
|
||||
/**
|
||||
* Converts Apple iWorks files to JPEGs for thumbnailing and previewing.
|
||||
* The transformer will only work for iWorks 2013/14 files. Support for iWorks 2008/9 has been dropped as we cannot
|
||||
* support both, because the newer format does not contain a PDF. If we say this transformer supports PDF, Share will
|
||||
* assume incorrectly that we can convert to PDF and we would only get a preview for the older format and never the
|
||||
* newer one. Both formats have the same mimetype.
|
||||
*
|
||||
* <p>
|
||||
* This code is based on a class of the same name originally implemented in alfresco-repository.
|
||||
* </p>
|
||||
*
|
||||
* @author Neil Mc Erlean
|
||||
* @author eknizat
|
||||
* @since 4.0
|
||||
*/
|
||||
public class AppleIWorksContentTransformer implements SelectableTransformer
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(
|
||||
AppleIWorksContentTransformer.class);
|
||||
|
||||
// Apple's zip entry names for previews in iWorks have changed over time.
|
||||
private static final List<String> PDF_PATHS = ImmutableList.of(
|
||||
"QuickLook/Preview.pdf"); // iWorks 2008/9
|
||||
private static final List<String> JPG_PATHS = ImmutableList.of(
|
||||
"QuickLook/Thumbnail.jpg", // iWorks 2008/9
|
||||
"preview.jpg"); // iWorks 2013/14 (720 x 552) We use the best quality image. Others are:
|
||||
// (225 x 173) preview-web.jpg
|
||||
// (53 x 41) preview-micro.jpg
|
||||
|
||||
@Override
|
||||
public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
|
||||
final String targetMimetype, final Map<String, String> parameters)
|
||||
{
|
||||
logger.debug("Performing IWorks to jpeg transform with sourceMimetype={} targetMimetype={}",
|
||||
sourceMimetype, targetMimetype);
|
||||
|
||||
// iWorks files are zip (or package) files.
|
||||
// If it's not a zip file, the resultant ZipException will be caught as an IOException below.
|
||||
try (ZipArchiveInputStream iWorksZip = new ZipArchiveInputStream(
|
||||
new BufferedInputStream(new FileInputStream(sourceFile))))
|
||||
{
|
||||
// Look through the zip file entries for the preview/thumbnail.
|
||||
List<String> paths = MIMETYPE_IMAGE_JPEG.equals(targetMimetype) ? JPG_PATHS : PDF_PATHS;
|
||||
ZipArchiveEntry entry;
|
||||
boolean found = false;
|
||||
while ((entry = iWorksZip.getNextZipEntry()) != null)
|
||||
{
|
||||
String name = entry.getName();
|
||||
if (paths.contains(name))
|
||||
{
|
||||
Files.copy(iWorksZip, targetFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found)
|
||||
{
|
||||
throw new RuntimeException(
|
||||
"The source " + sourceMimetype + " file did not contain a " + targetMimetype + " preview");
|
||||
}
|
||||
}
|
||||
catch (IOException e)
|
||||
{
|
||||
throw new RuntimeException(
|
||||
"Unable to transform " + sourceMimetype + " file. It should have been a zip format file.",
|
||||
e);
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,233 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_HTML;
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_MULTIPART_ALTERNATIVE;
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Writer;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
|
||||
import javax.mail.MessagingException;
|
||||
import javax.mail.Multipart;
|
||||
import javax.mail.Part;
|
||||
import javax.mail.Session;
|
||||
import javax.mail.internet.MimeMessage;
|
||||
|
||||
import org.alfresco.transformer.fs.FileManager;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Uses javax.mail.MimeMessage to generate plain text versions of RFC822 email
|
||||
* messages. Searches for all text content parts, and returns them. Any
|
||||
* attachments are ignored. TIKA Note - could be replaced with the Tika email
|
||||
* parser. Would require a recursing parser to be specified, but not the full
|
||||
* Auto one (we don't want attachments), just one containing text and html
|
||||
* related parsers.
|
||||
*
|
||||
* <p>
|
||||
* This code is based on a class of the same name originally implemented in alfresco-repository.
|
||||
* </p>
|
||||
*/
|
||||
public class EMLTransformer implements SelectableTransformer
|
||||
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(EMLTransformer.class);
|
||||
|
||||
private static final String CHARSET = "charset";
|
||||
private static final String DEFAULT_ENCODING = "UTF-8";
|
||||
|
||||
@Override
|
||||
public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
|
||||
final String targetMimetype, final Map<String, String> parameters) throws Exception
|
||||
{
|
||||
logger.debug("Performing RFC822 to text transform.");
|
||||
// Use try with resource
|
||||
try (InputStream contentInputStream = new BufferedInputStream(
|
||||
new FileInputStream(sourceFile));
|
||||
Writer bufferedFileWriter = new BufferedWriter(new FileWriter(targetFile)))
|
||||
{
|
||||
MimeMessage mimeMessage = new MimeMessage(Session.getDefaultInstance(new Properties()),
|
||||
contentInputStream);
|
||||
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
Object content = mimeMessage.getContent();
|
||||
if (content instanceof Multipart)
|
||||
{
|
||||
processMultiPart((Multipart) content, sb);
|
||||
}
|
||||
else
|
||||
{
|
||||
sb.append(content.toString());
|
||||
}
|
||||
bufferedFileWriter.write(sb.toString());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Find "text" parts of message recursively and appends it to sb StringBuilder
|
||||
*
|
||||
* @param multipart Multipart to process
|
||||
* @param sb StringBuilder
|
||||
* @throws MessagingException
|
||||
* @throws IOException
|
||||
*/
|
||||
private void processMultiPart(Multipart multipart, StringBuilder sb) throws MessagingException,
|
||||
IOException
|
||||
{
|
||||
boolean isAlternativeMultipart = multipart.getContentType().contains(
|
||||
MIMETYPE_MULTIPART_ALTERNATIVE);
|
||||
if (isAlternativeMultipart)
|
||||
{
|
||||
processAlternativeMultipart(multipart, sb);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0, n = multipart.getCount(); i < n; i++)
|
||||
{
|
||||
Part part = multipart.getBodyPart(i);
|
||||
if (part.getContent() instanceof Multipart)
|
||||
{
|
||||
processMultiPart((Multipart) part.getContent(), sb);
|
||||
}
|
||||
else
|
||||
{
|
||||
processPart(part, sb);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the suitable part from an multipart/alternative and appends it's text content to StringBuilder sb
|
||||
*
|
||||
* @param multipart
|
||||
* @param sb
|
||||
* @throws IOException
|
||||
* @throws MessagingException
|
||||
*/
|
||||
private void processAlternativeMultipart(Multipart multipart, StringBuilder sb) throws
|
||||
IOException, MessagingException
|
||||
{
|
||||
Part partToUse = null;
|
||||
for (int i = 0, n = multipart.getCount(); i < n; i++)
|
||||
{
|
||||
Part part = multipart.getBodyPart(i);
|
||||
if (part.getContentType().contains(MIMETYPE_TEXT_PLAIN))
|
||||
{
|
||||
partToUse = part;
|
||||
break;
|
||||
}
|
||||
else if (part.getContentType().contains(MIMETYPE_HTML))
|
||||
{
|
||||
partToUse = part;
|
||||
}
|
||||
else if (part.getContentType().contains(MIMETYPE_MULTIPART_ALTERNATIVE))
|
||||
{
|
||||
if (part.getContent() instanceof Multipart)
|
||||
{
|
||||
processAlternativeMultipart((Multipart) part.getContent(), sb);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (partToUse != null)
|
||||
{
|
||||
processPart(partToUse, sb);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds text on a given mail part. Accepted parts types are text/html and text/plain.
|
||||
* Attachments are ignored
|
||||
*
|
||||
* @param part
|
||||
* @param sb
|
||||
* @throws IOException
|
||||
* @throws MessagingException
|
||||
*/
|
||||
private void processPart(Part part, StringBuilder sb) throws IOException, MessagingException
|
||||
{
|
||||
boolean isAttachment = Part.ATTACHMENT.equalsIgnoreCase(part.getDisposition());
|
||||
if (isAttachment)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (part.getContentType().contains(MIMETYPE_TEXT_PLAIN))
|
||||
{
|
||||
sb.append(part.getContent().toString());
|
||||
}
|
||||
else if (part.getContentType().contains(MIMETYPE_HTML))
|
||||
{
|
||||
String mailPartContent = part.getContent().toString();
|
||||
|
||||
//create a temporary html file with same mail part content and encoding
|
||||
File tempHtmlFile = FileManager.TempFileProvider.createTempFile("EMLTransformer_",
|
||||
".html");
|
||||
String encoding = getMailPartContentEncoding(part);
|
||||
try (OutputStreamWriter osWriter = new OutputStreamWriter(
|
||||
new FileOutputStream(tempHtmlFile), encoding))
|
||||
{
|
||||
osWriter.write(mailPartContent);
|
||||
}
|
||||
|
||||
//transform html file's content to plain text
|
||||
HtmlParserContentTransformer.EncodingAwareStringBean extractor = new HtmlParserContentTransformer.EncodingAwareStringBean();
|
||||
extractor.setCollapse(false);
|
||||
extractor.setLinks(false);
|
||||
extractor.setReplaceNonBreakingSpaces(false);
|
||||
extractor.setURL(tempHtmlFile, encoding);
|
||||
sb.append(extractor.getStrings());
|
||||
|
||||
tempHtmlFile.delete();
|
||||
}
|
||||
}
|
||||
|
||||
private String getMailPartContentEncoding(Part part) throws MessagingException
|
||||
{
|
||||
String encoding = DEFAULT_ENCODING;
|
||||
String contentType = part.getContentType();
|
||||
int startIndex = contentType.indexOf(CHARSET);
|
||||
if (startIndex > 0)
|
||||
{
|
||||
encoding = contentType.substring(startIndex + CHARSET.length() + 1)
|
||||
.replaceAll("\"", "");
|
||||
}
|
||||
return encoding;
|
||||
}
|
||||
}
|
@@ -0,0 +1,190 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Writer;
|
||||
import java.net.URLConnection;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.IllegalCharsetNameException;
|
||||
import java.util.Map;
|
||||
|
||||
import org.htmlparser.Parser;
|
||||
import org.htmlparser.beans.StringBean;
|
||||
import org.htmlparser.util.ParserException;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Content transformer which wraps the HTML Parser library for
|
||||
* parsing HTML content.
|
||||
*
|
||||
* <p>
|
||||
* This code is based on a class of the same name originally implemented in alfresco-repository.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Since HTML Parser was updated from v1.6 to v2.1, META tags
|
||||
* defining an encoding for the content via http-equiv=Content-Type
|
||||
* will ONLY be respected if the encoding of the content item
|
||||
* itself is set to ISO-8859-1.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Tika Note - could be converted to use the Tika HTML parser,
|
||||
* but we'd potentially need a custom text handler to replicate
|
||||
* the current settings around links and non-breaking spaces.
|
||||
* </p>
|
||||
*
|
||||
* @author Derek Hulley
|
||||
* @author eknizat
|
||||
* @see <a href="http://htmlparser.sourceforge.net/">http://htmlparser.sourceforge.net</a>
|
||||
* @see org.htmlparser.beans.StringBean
|
||||
* @see <a href="http://sourceforge.net/tracker/?func=detail&aid=1644504&group_id=24399&atid=381401">HTML Parser</a>
|
||||
*/
|
||||
public class HtmlParserContentTransformer implements SelectableTransformer
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(
|
||||
HtmlParserContentTransformer.class);
|
||||
|
||||
@Override
|
||||
public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
|
||||
final String targetMimetype, final Map<String, String> parameters) throws Exception
|
||||
{
|
||||
String sourceEncoding = parameters.get(SOURCE_ENCODING);
|
||||
checkEncodingParameter(sourceEncoding, SOURCE_ENCODING);
|
||||
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug("Performing HTML to text transform with sourceEncoding=" + sourceEncoding);
|
||||
}
|
||||
|
||||
// Create the extractor
|
||||
EncodingAwareStringBean extractor = new EncodingAwareStringBean();
|
||||
extractor.setCollapse(false);
|
||||
extractor.setLinks(false);
|
||||
extractor.setReplaceNonBreakingSpaces(false);
|
||||
extractor.setURL(sourceFile, sourceEncoding);
|
||||
// get the text
|
||||
String text = extractor.getStrings();
|
||||
|
||||
// write it to the writer
|
||||
try (Writer writer = new BufferedWriter(
|
||||
new OutputStreamWriter(new FileOutputStream(targetFile))))
|
||||
{
|
||||
writer.write(text);
|
||||
}
|
||||
}
|
||||
|
||||
private void checkEncodingParameter(String encoding, String parameterName)
|
||||
{
|
||||
try
|
||||
{
|
||||
if (encoding != null && !Charset.isSupported(encoding))
|
||||
{
|
||||
throw new IllegalArgumentException(
|
||||
parameterName + "=" + encoding + " is not supported by the JVM.");
|
||||
}
|
||||
}
|
||||
catch (IllegalCharsetNameException e)
|
||||
{
|
||||
throw new IllegalArgumentException(
|
||||
parameterName + "=" + encoding + " is not a valid encoding.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* This code is based on a class of the same name, originally implemented in alfresco-repository.
|
||||
* </p>
|
||||
*
|
||||
* A version of {@link StringBean} which allows control of the
|
||||
* encoding in the underlying HTML Parser.
|
||||
* Unfortunately, StringBean doesn't allow easy over-riding of
|
||||
* this, so we have to duplicate some code to control this.
|
||||
* This allows us to correctly handle HTML files where the encoding
|
||||
* is specified against the content property (rather than in the
|
||||
* HTML Head Meta), see ALF-10466 for details.
|
||||
*/
|
||||
public static class EncodingAwareStringBean extends StringBean
|
||||
{
|
||||
private static final long serialVersionUID = -9033414360428669553L;
|
||||
|
||||
/**
|
||||
* Sets the File to extract strings from, and the encoding
|
||||
* it's in (if known to Alfresco)
|
||||
*
|
||||
* @param file The File that text should be fetched from.
|
||||
* @param encoding The encoding of the input
|
||||
*/
|
||||
public void setURL(File file, String encoding)
|
||||
{
|
||||
String previousURL = getURL();
|
||||
String newURL = file.getAbsolutePath();
|
||||
|
||||
if (previousURL == null || !newURL.equals(previousURL))
|
||||
{
|
||||
try
|
||||
{
|
||||
URLConnection conn = getConnection();
|
||||
|
||||
if (null == mParser)
|
||||
{
|
||||
mParser = new Parser(newURL);
|
||||
}
|
||||
else
|
||||
{
|
||||
mParser.setURL(newURL);
|
||||
}
|
||||
|
||||
if (encoding != null)
|
||||
{
|
||||
mParser.setEncoding(encoding);
|
||||
}
|
||||
|
||||
mPropertySupport.firePropertyChange(StringBean.PROP_URL_PROPERTY, previousURL,
|
||||
getURL());
|
||||
mPropertySupport.firePropertyChange(StringBean.PROP_CONNECTION_PROPERTY, conn,
|
||||
mParser.getConnection());
|
||||
setStrings();
|
||||
}
|
||||
catch (ParserException pe)
|
||||
{
|
||||
updateStrings(pe.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String getEncoding()
|
||||
{
|
||||
return mParser.getEncoding();
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,130 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.openxml4j.opc.PackagePart;
|
||||
import org.apache.poi.openxml4j.opc.PackageRelationship;
|
||||
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
|
||||
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Extracts out Thumbnail JPEGs from OOXML files for thumbnailing and previewing.
|
||||
* This transformer will only work for OOXML files where thumbnailing was enabled,
|
||||
* which isn't on by default on Windows, but is more common on Mac.
|
||||
*
|
||||
* <p>
|
||||
* This code is based on a class of the same name originally implemented in alfresco-repository.
|
||||
* </p>
|
||||
*
|
||||
* @author Nick Burch
|
||||
* @author eknizat
|
||||
*/
|
||||
public class OOXMLThumbnailContentTransformer implements SelectableTransformer
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(
|
||||
OOXMLThumbnailContentTransformer.class);
|
||||
|
||||
@Override
|
||||
public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
|
||||
final String targetMimetype, final Map<String, String> parameters) throws Exception
|
||||
{
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug("Performing OOXML to jpeg transform with sourceMimetype=" + sourceMimetype
|
||||
+ " targetMimetype=" + targetMimetype);
|
||||
}
|
||||
|
||||
try (OPCPackage pkg = OPCPackage.open(sourceFile.getPath()))
|
||||
{
|
||||
|
||||
// Does it have a thumbnail?
|
||||
PackageRelationshipCollection rels = pkg.getRelationshipsByType(
|
||||
PackageRelationshipTypes.THUMBNAIL);
|
||||
if (rels.size() > 0)
|
||||
{
|
||||
// Get the thumbnail part
|
||||
PackageRelationship tRel = rels.getRelationship(0);
|
||||
PackagePart tPart = pkg.getPart(tRel);
|
||||
|
||||
// Write it to the target
|
||||
InputStream tStream = tPart.getInputStream();
|
||||
Files.copy(tStream, targetFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
|
||||
tStream.close();
|
||||
}
|
||||
else
|
||||
{
|
||||
logger.debug("No thumbnail present in file.");
|
||||
throw new Exception(
|
||||
"No thumbnail present in file, unable to generate " + targetMimetype);
|
||||
}
|
||||
}
|
||||
catch (IOException e)
|
||||
{
|
||||
throw new RuntimeException("Unable to transform file.", e);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
// TODO Add this back to engine_config.json when the transformer is fixed for java 11
|
||||
{
|
||||
"transformerName": "ooxmlThumbnail",
|
||||
"supportedSourceAndTargetList": [
|
||||
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.ms-word.document.macroenabled.12", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.template", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.ms-word.template.macroenabled.12", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.ms-powerpoint.presentation.macroenabled.12", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.presentationml.slideshow", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.ms-powerpoint.slideshow.macroenabled.12", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.presentationml.template", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.ms-powerpoint.template.macroenabled.12", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.ms-powerpoint.addin.macroenabled.12", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.presentationml.slide", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.ms-powerpoint.slide.macroenabled.12", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.template", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.ms-excel.sheet.macroenabled.12", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.ms-excel.template.macroenabled.12", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.ms-excel.addin.macroenabled.12", "targetMediaType": "image/jpeg"},
|
||||
{"sourceMediaType": "application/vnd.ms-excel.sheet.binary.macroenabled.12", "targetMediaType": "image/jpeg"}
|
||||
],
|
||||
"transformOptions": [
|
||||
]
|
||||
}
|
||||
*/
|
||||
}
|
@@ -0,0 +1,52 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Implemented by transformers used by {@link SelectingTransformer}.
 *
 * @author eknizat
 */
public interface SelectableTransformer
{
    /** Name of the transform parameter holding the character encoding of the source content. */
    String SOURCE_ENCODING = "sourceEncoding";

    /** Name of the transform parameter holding the character encoding to use for the target content. */
    String TARGET_ENCODING = "targetEncoding";

    /**
     * Implementation of the actual transformation.
     *
     * @param sourceFile     the file to read the source content from
     * @param targetFile     the file the transformed content is written to
     * @param sourceMimetype the mimetype of the source content
     * @param targetMimetype the mimetype requested for the target content
     * @param parameters     the transform options, keyed by option name
     *                       (for example {@link #SOURCE_ENCODING} and {@link #TARGET_ENCODING})
     * @throws Exception if the transformation could not be performed
     */
    void transform(File sourceFile, File targetFile, String sourceMimetype,
        String targetMimetype, Map<String, String> parameters) throws Exception;
}
|
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import static org.springframework.http.HttpStatus.BAD_REQUEST;
|
||||
import static org.springframework.http.HttpStatus.INTERNAL_SERVER_ERROR;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Map;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
import org.alfresco.transform.exceptions.TransformException;
|
||||
import org.alfresco.transformer.logging.LogEntry;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
|
||||
/**
|
||||
* The SelectingTransformer selects a registered {@link SelectableTransformer}
|
||||
* and delegates the transformation to its implementation.
|
||||
*
|
||||
* @author eknizat
|
||||
*/
|
||||
public class SelectingTransformer
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(SelectingTransformer.class);
|
||||
|
||||
private final Map<String, SelectableTransformer> transformers = ImmutableMap
|
||||
.<String, SelectableTransformer>builder()
|
||||
.put("appleIWorks", new AppleIWorksContentTransformer())
|
||||
.put("html", new HtmlParserContentTransformer())
|
||||
.put("string", new StringExtractingContentTransformer())
|
||||
.put("textToPdf", new TextToPdfContentTransformer())
|
||||
.put("rfc822", new EMLTransformer())
|
||||
.put("ooXmlThumbnail", new OOXMLThumbnailContentTransformer())
|
||||
.build();
|
||||
|
||||
/**
|
||||
* Performs a transform using a transformer selected based on the provided sourceMimetype and targetMimetype
|
||||
*
|
||||
* @param transform the name of the transformer
|
||||
* @param sourceFile File to transform from
|
||||
* @param targetFile File to transform to
|
||||
* @param sourceMimetype Mimetype of the source file
|
||||
* @throws TransformException if there was a problem internally
|
||||
*/
|
||||
public void transform(String transform, File sourceFile, File targetFile, String sourceMimetype,
|
||||
String targetMimetype, Map<String, String> parameters) throws TransformException
|
||||
{
|
||||
try
|
||||
{
|
||||
final SelectableTransformer transformer = transformers.get(transform);
|
||||
logOptions(sourceFile, targetFile, parameters);
|
||||
transformer.transform(sourceFile, targetFile, sourceMimetype, targetMimetype,
|
||||
parameters);
|
||||
}
|
||||
catch (IllegalArgumentException e)
|
||||
{
|
||||
throw new TransformException(BAD_REQUEST.value(), getMessage(e));
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
throw new TransformException(INTERNAL_SERVER_ERROR.value(), getMessage(e));
|
||||
}
|
||||
if (!targetFile.exists())
|
||||
{
|
||||
throw new TransformException(INTERNAL_SERVER_ERROR.value(),
|
||||
"Transformer failed to create an output file. Target file does not exist.");
|
||||
}
|
||||
if (sourceFile.length() > 0 && targetFile.length() == 0)
|
||||
{
|
||||
throw new TransformException(INTERNAL_SERVER_ERROR.value(),
|
||||
"Transformer failed to create an output file. Target file is empty but source file was not empty.");
|
||||
}
|
||||
}
|
||||
|
||||
private static String getMessage(Exception e)
|
||||
{
|
||||
return e.getMessage() == null || e.getMessage().isEmpty() ? e.getClass().getSimpleName() : e.getMessage();
|
||||
}
|
||||
|
||||
private static void logOptions(File sourceFile, File targetFile, Map<String, String> parameters)
|
||||
{
|
||||
StringJoiner sj = new StringJoiner(" ");
|
||||
parameters.forEach((k, v) -> sj.add(
|
||||
"--" + k + "=" + v)); // keeping the existing style used in other T-Engines
|
||||
sj.add(getExtension(sourceFile));
|
||||
sj.add(getExtension(targetFile));
|
||||
LogEntry.setOptions(sj.toString());
|
||||
}
|
||||
|
||||
private static String getExtension(File file)
|
||||
{
|
||||
final String name = file.getName();
|
||||
int i = name.lastIndexOf('.');
|
||||
return i == -1 ? "???" : name.substring(i + 1);
|
||||
}
|
||||
}
|
@@ -0,0 +1,155 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Reader;
|
||||
import java.io.Writer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.IllegalCharsetNameException;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
* Converts any textual format to plain text.
|
||||
* <p>
|
||||
* The transformation is sensitive to the source and target string encodings.
|
||||
*
|
||||
*
|
||||
* <p>
|
||||
* This code is based on a class of the same name originally implemented in alfresco-repository.
|
||||
* </p>
|
||||
*
|
||||
* @author Derek Hulley
|
||||
* @author eknizat
|
||||
*/
|
||||
public class StringExtractingContentTransformer implements SelectableTransformer
|
||||
{
|
||||
|
||||
private static final Log logger = LogFactory.getLog(StringExtractingContentTransformer.class);
|
||||
|
||||
/**
|
||||
* Text to text conversions are done directly using the content reader and writer string
|
||||
* manipulation methods.
|
||||
* <p>
|
||||
* Extraction of text from binary content attempts to take the possible character
|
||||
* encoding into account. The text produced from this will, if the encoding was correct,
|
||||
* be unformatted but valid.
|
||||
*/
|
||||
@Override
|
||||
public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
|
||||
final String targetMimetype, final Map<String, String> parameters) throws Exception
|
||||
{
|
||||
String sourceEncoding = parameters.get(SOURCE_ENCODING);
|
||||
String targetEncoding = parameters.get(TARGET_ENCODING);
|
||||
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug("Performing text to text transform with sourceEncoding=" + sourceEncoding
|
||||
+ " targetEncoding=" + targetEncoding);
|
||||
}
|
||||
|
||||
Reader charReader = null;
|
||||
Writer charWriter = null;
|
||||
try
|
||||
{
|
||||
// Build reader
|
||||
if (sourceEncoding == null)
|
||||
{
|
||||
charReader = new BufferedReader(
|
||||
new InputStreamReader(new FileInputStream(sourceFile)));
|
||||
}
|
||||
else
|
||||
{
|
||||
checkEncodingParameter(sourceEncoding, SOURCE_ENCODING);
|
||||
charReader = new BufferedReader(
|
||||
new InputStreamReader(new FileInputStream(sourceFile), sourceEncoding));
|
||||
}
|
||||
|
||||
// Build writer
|
||||
if (targetEncoding == null)
|
||||
{
|
||||
charWriter = new BufferedWriter(
|
||||
new OutputStreamWriter(new FileOutputStream(targetFile)));
|
||||
}
|
||||
else
|
||||
{
|
||||
checkEncodingParameter(targetEncoding, TARGET_ENCODING);
|
||||
charWriter = new BufferedWriter(
|
||||
new OutputStreamWriter(new FileOutputStream(targetFile), targetEncoding));
|
||||
}
|
||||
|
||||
// copy from the one to the other
|
||||
char[] buffer = new char[8192];
|
||||
int readCount = 0;
|
||||
while (readCount > -1)
|
||||
{
|
||||
// write the last read count number of bytes
|
||||
charWriter.write(buffer, 0, readCount);
|
||||
// fill the buffer again
|
||||
readCount = charReader.read(buffer);
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (charReader != null)
|
||||
{
|
||||
try { charReader.close(); } catch (Throwable e) { logger.error(e); }
|
||||
}
|
||||
if (charWriter != null)
|
||||
{
|
||||
try { charWriter.close(); } catch (Throwable e) { logger.error(e); }
|
||||
}
|
||||
}
|
||||
// done
|
||||
}
|
||||
|
||||
private void checkEncodingParameter(String encoding, String paramterName)
|
||||
{
|
||||
try
|
||||
{
|
||||
if (!Charset.isSupported(encoding))
|
||||
{
|
||||
throw new IllegalArgumentException(
|
||||
paramterName + "=" + encoding + " is not supported by the JVM.");
|
||||
}
|
||||
}
|
||||
catch (IllegalCharsetNameException e)
|
||||
{
|
||||
throw new IllegalArgumentException(
|
||||
paramterName + "=" + encoding + " is not a valid encoding.");
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,323 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStream;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.apache.pdfbox.tools.TextToPDF;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* This code is based on a class of the same name originally implemented in alfresco-repository.
|
||||
* </p>
|
||||
*
|
||||
* Makes use of the <a href="http://www.pdfbox.org/">PDFBox</a> library's <code>TextToPDF</code> utility.
|
||||
*
|
||||
* @author Derek Hulley
|
||||
* @author eknizat
|
||||
*/
|
||||
public class TextToPdfContentTransformer implements SelectableTransformer
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(TextToPdfContentTransformer.class);
|
||||
|
||||
public static final String PAGE_LIMIT = "pageLimit";
|
||||
|
||||
private final PagedTextToPDF transformer;
|
||||
|
||||
public TextToPdfContentTransformer()
|
||||
{
|
||||
transformer = new PagedTextToPDF();
|
||||
}
|
||||
|
||||
public void setStandardFont(String fontName)
|
||||
{
|
||||
try
|
||||
{
|
||||
transformer.setFont(PagedTextToPDF.getStandardFont(fontName));
|
||||
}
|
||||
catch (Throwable e)
|
||||
{
|
||||
throw new RuntimeException(
|
||||
"Unable to set Standard Font for PDF generation: " + fontName, e);
|
||||
}
|
||||
}
|
||||
|
||||
public void setFontSize(int fontSize)
|
||||
{
|
||||
try
|
||||
{
|
||||
transformer.setFontSize(fontSize);
|
||||
}
|
||||
catch (Throwable e)
|
||||
{
|
||||
throw new RuntimeException(
|
||||
"Unable to set Font Size for PDF generation: " + fontSize);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
|
||||
final String targetMimetype, final Map<String, String> parameters) throws Exception
|
||||
{
|
||||
String sourceEncoding = parameters.get(SOURCE_ENCODING);
|
||||
String stringPageLimit = parameters.get(PAGE_LIMIT);
|
||||
int pageLimit = -1;
|
||||
if (stringPageLimit != null)
|
||||
{
|
||||
pageLimit = parseInt(stringPageLimit, PAGE_LIMIT);
|
||||
}
|
||||
|
||||
PDDocument pdf = null;
|
||||
try (InputStream is = new FileInputStream(sourceFile);
|
||||
Reader ir = new BufferedReader(buildReader(is, sourceEncoding));
|
||||
OutputStream os = new BufferedOutputStream(new FileOutputStream(targetFile)))
|
||||
{
|
||||
//TransformationOptionLimits limits = getLimits(reader, writer, options);
|
||||
//TransformationOptionPair pageLimits = limits.getPagesPair();
|
||||
pdf = transformer.createPDFFromText(ir, pageLimit);
|
||||
pdf.save(os);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (pdf != null)
|
||||
{
|
||||
try { pdf.close(); } catch (Throwable e) {e.printStackTrace(); }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected InputStreamReader buildReader(InputStream is, String encoding)
|
||||
{
|
||||
// If they gave an encoding, try to use it
|
||||
if (encoding != null)
|
||||
{
|
||||
Charset charset = null;
|
||||
try
|
||||
{
|
||||
charset = Charset.forName(encoding);
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
logger.warn("JVM doesn't understand encoding '" + encoding +
|
||||
"' when transforming text to pdf");
|
||||
}
|
||||
if (charset != null)
|
||||
{
|
||||
logger.debug("Processing plain text in encoding " + charset.displayName());
|
||||
return new InputStreamReader(is, charset);
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back on the system default
|
||||
logger.debug("Processing plain text using system default encoding");
|
||||
return new InputStreamReader(is);
|
||||
}
|
||||
|
||||
private static class PagedTextToPDF extends TextToPDF
|
||||
{
|
||||
// REPO-1066: duplicating the following lines from org.apache.pdfbox.tools.TextToPDF because they made them private
|
||||
// before the upgrade to pdfbox 2.0.8, in pdfbox 1.8, this piece of code was public in org.apache.pdfbox.pdmodel.font.PDType1Font
|
||||
static PDType1Font getStandardFont(String name)
|
||||
{
|
||||
return STANDARD_14.get(name);
|
||||
}
|
||||
|
||||
private static final Map<String, PDType1Font> STANDARD_14 = new HashMap<>();
|
||||
|
||||
static
|
||||
{
|
||||
STANDARD_14.put(PDType1Font.TIMES_ROMAN.getBaseFont(), PDType1Font.TIMES_ROMAN);
|
||||
STANDARD_14.put(PDType1Font.TIMES_BOLD.getBaseFont(), PDType1Font.TIMES_BOLD);
|
||||
STANDARD_14.put(PDType1Font.TIMES_ITALIC.getBaseFont(), PDType1Font.TIMES_ITALIC);
|
||||
STANDARD_14.put(PDType1Font.TIMES_BOLD_ITALIC.getBaseFont(),
|
||||
PDType1Font.TIMES_BOLD_ITALIC);
|
||||
STANDARD_14.put(PDType1Font.HELVETICA.getBaseFont(), PDType1Font.HELVETICA);
|
||||
STANDARD_14.put(PDType1Font.HELVETICA_BOLD.getBaseFont(), PDType1Font.HELVETICA_BOLD);
|
||||
STANDARD_14.put(PDType1Font.HELVETICA_OBLIQUE.getBaseFont(),
|
||||
PDType1Font.HELVETICA_OBLIQUE);
|
||||
STANDARD_14.put(PDType1Font.HELVETICA_BOLD_OBLIQUE.getBaseFont(),
|
||||
PDType1Font.HELVETICA_BOLD_OBLIQUE);
|
||||
STANDARD_14.put(PDType1Font.COURIER.getBaseFont(), PDType1Font.COURIER);
|
||||
STANDARD_14.put(PDType1Font.COURIER_BOLD.getBaseFont(), PDType1Font.COURIER_BOLD);
|
||||
STANDARD_14.put(PDType1Font.COURIER_OBLIQUE.getBaseFont(), PDType1Font.COURIER_OBLIQUE);
|
||||
STANDARD_14.put(PDType1Font.COURIER_BOLD_OBLIQUE.getBaseFont(),
|
||||
PDType1Font.COURIER_BOLD_OBLIQUE);
|
||||
STANDARD_14.put(PDType1Font.SYMBOL.getBaseFont(), PDType1Font.SYMBOL);
|
||||
STANDARD_14.put(PDType1Font.ZAPF_DINGBATS.getBaseFont(), PDType1Font.ZAPF_DINGBATS);
|
||||
}
|
||||
//duplicating until here
|
||||
|
||||
// The following code is based on the code in TextToPDF with the addition of
|
||||
// checks for page limits.
|
||||
// The calling code must close the PDDocument once finished with it.
|
||||
public PDDocument createPDFFromText(Reader text, int pageLimit)
|
||||
throws IOException
|
||||
{
|
||||
//int pageLimit = (int)pageLimits.getValue();
|
||||
PDDocument doc = null;
|
||||
int pageCount = 0;
|
||||
try
|
||||
{
|
||||
final int margin = 40;
|
||||
float height = getFont().getFontDescriptor().getFontBoundingBox().getHeight() / 1000;
|
||||
|
||||
//calculate font height and increase by 5 percent.
|
||||
height = height * getFontSize() * 1.05f;
|
||||
doc = new PDDocument();
|
||||
BufferedReader data = new BufferedReader(text);
|
||||
String nextLine;
|
||||
PDPage page = new PDPage();
|
||||
PDPageContentStream contentStream = null;
|
||||
float y = -1;
|
||||
float maxStringLength = page.getMediaBox().getWidth() - 2 * margin;
|
||||
|
||||
// There is a special case of creating a PDF document from an empty string.
|
||||
boolean textIsEmpty = true;
|
||||
|
||||
outer:
|
||||
while ((nextLine = data.readLine()) != null)
|
||||
{
|
||||
|
||||
// The input text is nonEmpty. New pages will be created and added
|
||||
// to the PDF document as they are needed, depending on the length of
|
||||
// the text.
|
||||
textIsEmpty = false;
|
||||
|
||||
String[] lineWords = nextLine.trim().split(" ");
|
||||
int lineIndex = 0;
|
||||
while (lineIndex < lineWords.length)
|
||||
{
|
||||
final StringBuilder nextLineToDraw = new StringBuilder();
|
||||
float lengthIfUsingNextWord = 0;
|
||||
do
|
||||
{
|
||||
nextLineToDraw.append(lineWords[lineIndex]);
|
||||
nextLineToDraw.append(" ");
|
||||
lineIndex++;
|
||||
if (lineIndex < lineWords.length)
|
||||
{
|
||||
String lineWithNextWord = nextLineToDraw.toString() + lineWords[lineIndex];
|
||||
lengthIfUsingNextWord =
|
||||
(getFont().getStringWidth(
|
||||
lineWithNextWord) / 1000) * getFontSize();
|
||||
}
|
||||
}
|
||||
while (lineIndex < lineWords.length &&
|
||||
lengthIfUsingNextWord < maxStringLength);
|
||||
if (y < margin)
|
||||
{
|
||||
int test = pageCount + 1;
|
||||
if (pageLimit > 0 && (pageCount++ >= pageLimit))
|
||||
{
|
||||
// pageLimits.getAction().throwIOExceptionIfRequired("Page limit ("+pageLimit+
|
||||
// ") reached.", transformerDebug);
|
||||
break outer;
|
||||
}
|
||||
|
||||
// We have crossed the end-of-page boundary and need to extend the
|
||||
// document by another page.
|
||||
page = new PDPage();
|
||||
doc.addPage(page);
|
||||
if (contentStream != null)
|
||||
{
|
||||
contentStream.endText();
|
||||
contentStream.close();
|
||||
}
|
||||
contentStream = new PDPageContentStream(doc, page);
|
||||
contentStream.setFont(getFont(), getFontSize());
|
||||
contentStream.beginText();
|
||||
y = page.getMediaBox().getHeight() - margin + height;
|
||||
contentStream.moveTextPositionByAmount(margin, y);
|
||||
}
|
||||
//System.out.println( "Drawing string at " + x + "," + y );
|
||||
|
||||
if (contentStream == null)
|
||||
{
|
||||
throw new IOException("Error:Expected non-null content stream.");
|
||||
}
|
||||
contentStream.moveTextPositionByAmount(0, -height);
|
||||
y -= height;
|
||||
contentStream.drawString(nextLineToDraw.toString());
|
||||
}
|
||||
}
|
||||
|
||||
// If the input text was the empty string, then the above while loop will have short-circuited
|
||||
// and we will not have added any PDPages to the document.
|
||||
// So in order to make the resultant PDF document readable by Adobe Reader etc, we'll add an empty page.
|
||||
if (textIsEmpty)
|
||||
{
|
||||
doc.addPage(page);
|
||||
}
|
||||
|
||||
if (contentStream != null)
|
||||
{
|
||||
contentStream.endText();
|
||||
contentStream.close();
|
||||
}
|
||||
}
|
||||
catch (IOException io)
|
||||
{
|
||||
if (doc != null)
|
||||
{
|
||||
doc.close();
|
||||
}
|
||||
throw io;
|
||||
}
|
||||
return doc;
|
||||
}
|
||||
}
|
||||
|
||||
private int parseInt(String s, String paramName)
|
||||
{
|
||||
try
|
||||
{
|
||||
return Integer.valueOf(s);
|
||||
}
|
||||
catch (NumberFormatException e)
|
||||
{
|
||||
throw new IllegalArgumentException(paramName + " parameter must be an integer.");
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,162 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import static org.alfresco.transformer.transformers.StringExtractingContentTransformer.SOURCE_ENCODING;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.nio.file.Files;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class HtmlParserContentTransformerTest
|
||||
{
|
||||
private static final String SOURCE_MIMETYPE = "text/html";
|
||||
private static final String TARGET_MIMETYPE = "text/plain";
|
||||
|
||||
HtmlParserContentTransformer transformer = new HtmlParserContentTransformer();
|
||||
|
||||
/**
|
||||
* Checks that we correctly handle text in different encodings,
|
||||
* no matter if the encoding is specified on the Content Property
|
||||
* or in a meta tag within the HTML itself. (ALF-10466)
|
||||
*
|
||||
* On Windows, org.htmlparser.beans.StringBean.carriageReturn() appends a new system dependent new line
|
||||
* so we must be careful when checking the returned text
|
||||
*/
|
||||
@Test
|
||||
public void testEncodingHandling() throws Exception
|
||||
{
|
||||
final String NEWLINE = System.getProperty("line.separator");
|
||||
final String TITLE = "Testing!";
|
||||
final String TEXT_P1 = "This is some text in English";
|
||||
final String TEXT_P2 = "This is more text in English";
|
||||
final String TEXT_P3 = "C'est en Fran\u00e7ais et Espa\u00f1ol";
|
||||
String partA = "<html><head><title>" + TITLE + "</title></head>" + NEWLINE;
|
||||
String partB = "<body><p>" + TEXT_P1 + "</p>" + NEWLINE +
|
||||
"<p>" + TEXT_P2 + "</p>" + NEWLINE +
|
||||
"<p>" + TEXT_P3 + "</p>" + NEWLINE;
|
||||
String partC = "</body></html>";
|
||||
final String expected = TITLE + NEWLINE + TEXT_P1 + NEWLINE + TEXT_P2 + NEWLINE + TEXT_P3 + NEWLINE;
|
||||
|
||||
File tmpS = null;
|
||||
File tmpD = null;
|
||||
|
||||
try
|
||||
{
|
||||
// Content set to ISO 8859-1
|
||||
tmpS = File.createTempFile("AlfrescoTestSource_", ".html");
|
||||
writeToFile(tmpS, partA + partB + partC, "ISO-8859-1");
|
||||
|
||||
tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
|
||||
|
||||
Map<String, String> parameters = new HashMap<>();
|
||||
parameters.put(SOURCE_ENCODING, "ISO-8859-1");
|
||||
transformer.transform(tmpS, tmpD, SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters);
|
||||
|
||||
assertEquals(expected, readFromFile(tmpD, "UTF-8"));
|
||||
tmpS.delete();
|
||||
tmpD.delete();
|
||||
|
||||
// Content set to UTF-8
|
||||
tmpS = File.createTempFile("AlfrescoTestSource_", ".html");
|
||||
writeToFile(tmpS, partA + partB + partC, "UTF-8");
|
||||
|
||||
tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
|
||||
parameters = new HashMap<>();
|
||||
parameters.put(SOURCE_ENCODING, "UTF-8");
|
||||
transformer.transform(tmpS, tmpD, SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters);
|
||||
assertEquals(expected, readFromFile(tmpD, "UTF-8"));
|
||||
tmpS.delete();
|
||||
tmpD.delete();
|
||||
|
||||
// Content set to UTF-16
|
||||
tmpS = File.createTempFile("AlfrescoTestSource_", ".html");
|
||||
writeToFile(tmpS, partA + partB + partC, "UTF-16");
|
||||
|
||||
tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
|
||||
parameters = new HashMap<>();
|
||||
parameters.put(SOURCE_ENCODING, "UTF-16");
|
||||
transformer.transform(tmpS, tmpD, SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters);
|
||||
assertEquals(expected, readFromFile(tmpD, "UTF-8"));
|
||||
tmpS.delete();
|
||||
tmpD.delete();
|
||||
|
||||
// Note - since HTML Parser 2.0 META tags specifying the
|
||||
// document encoding will ONLY be respected if the original
|
||||
// content type was set to ISO-8859-1.
|
||||
//
|
||||
// This means there is now only one test which we can perform
|
||||
// to ensure that this now-limited overriding of the encoding
|
||||
// takes effect.
|
||||
|
||||
// Content set to ISO 8859-1, meta set to UTF-8
|
||||
tmpS = File.createTempFile("AlfrescoTestSource_", ".html");
|
||||
String str = partA +
|
||||
"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">" +
|
||||
partB + partC;
|
||||
|
||||
writeToFile(tmpS, str, "UTF-8");
|
||||
|
||||
tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
|
||||
|
||||
parameters = new HashMap<>();
|
||||
parameters.put(SOURCE_ENCODING, "ISO-8859-1");
|
||||
transformer.transform(tmpS, tmpD, SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters);
|
||||
assertEquals(expected, readFromFile(tmpD, "UTF-8"));
|
||||
tmpS.delete();
|
||||
tmpD.delete();
|
||||
|
||||
// Note - we can't test UTF-16 with only a meta encoding,
|
||||
// because without that the parser won't know about the
|
||||
// 2 byte format so won't be able to identify the meta tag
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (tmpS != null && tmpS.exists()) tmpS.delete();
|
||||
if (tmpD != null && tmpD.exists()) tmpD.delete();
|
||||
}
|
||||
}
|
||||
|
||||
private void writeToFile(File file, String content, String encoding) throws Exception
|
||||
{
|
||||
try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
|
||||
{
|
||||
ow.append(content);
|
||||
}
|
||||
}
|
||||
|
||||
private String readFromFile(File file, final String encoding) throws Exception
|
||||
{
|
||||
return new String(Files.readAllBytes(file.toPath()), encoding);
|
||||
}
|
||||
}
|
@@ -0,0 +1,148 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import static org.alfresco.transformer.transformers.TextToPdfContentTransformer.PAGE_LIMIT;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.StringWriter;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TextToPdfContentTransformerTest
|
||||
{
|
||||
TextToPdfContentTransformer transformer = new TextToPdfContentTransformer();
|
||||
|
||||
@Before
|
||||
public void setUp()
|
||||
{
|
||||
transformer.setStandardFont("Times-Roman");
|
||||
transformer.setFontSize(20);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnlimitedPages() throws Exception
|
||||
{
|
||||
transformTextAndCheckPageLength(-1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLimitedTo1Page() throws Exception
|
||||
{
|
||||
transformTextAndCheckPageLength(1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLimitedTo2Pages() throws Exception
|
||||
{
|
||||
transformTextAndCheckPageLength(2);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLimitedTo50Pages() throws Exception
|
||||
{
|
||||
transformTextAndCheckPageLength(50);
|
||||
}
|
||||
|
||||
private void transformTextAndCheckPageLength(int pageLimit) throws Exception
|
||||
{
|
||||
int pageLength = 32;
|
||||
int lines = (pageLength + 10) * ((pageLimit > 0) ? pageLimit : 1);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String checkText = null;
|
||||
int cutoff = pageLimit * pageLength;
|
||||
for (int i = 1; i <= lines; i++)
|
||||
{
|
||||
sb.append(i);
|
||||
sb.append(" I must not talk in class or feed my homework to my cat.\n");
|
||||
if (i == cutoff)
|
||||
checkText = sb.toString();
|
||||
}
|
||||
sb.append("\nBart\n");
|
||||
String text = sb.toString();
|
||||
checkText = (checkText == null) ? clean(text) : clean(checkText);
|
||||
transformTextAndCheck(text, "UTF-8", checkText, String.valueOf(pageLimit));
|
||||
}
|
||||
|
||||
private void transformTextAndCheck(String text, String encoding, String checkText,
|
||||
String pageLimit) throws Exception
|
||||
{
|
||||
// Get a reader for the text
|
||||
File sourceFile = File.createTempFile("AlfrescoTestSource_", ".txt");
|
||||
writeToFile(sourceFile, text, encoding);
|
||||
|
||||
// And a temp writer
|
||||
File targetFile = File.createTempFile("AlfrescoTestTarget_", ".pdf");
|
||||
|
||||
// Transform to PDF
|
||||
Map<String, String> parameters = new HashMap<>();
|
||||
parameters.put(PAGE_LIMIT, pageLimit);
|
||||
transformer.transform(sourceFile, targetFile, "text/plain", "application/pdf", parameters);
|
||||
|
||||
// Read back in the PDF and check it
|
||||
PDDocument doc = PDDocument.load(targetFile);
|
||||
PDFTextStripper textStripper = new PDFTextStripper();
|
||||
StringWriter textWriter = new StringWriter();
|
||||
textStripper.writeText(doc, textWriter);
|
||||
doc.close();
|
||||
|
||||
String roundTrip = clean(textWriter.toString());
|
||||
|
||||
assertEquals(
|
||||
"Incorrect text in PDF when starting from text in " + encoding,
|
||||
checkText, roundTrip
|
||||
);
|
||||
|
||||
sourceFile.delete();
|
||||
targetFile.delete();
|
||||
}
|
||||
|
||||
private String clean(String text)
|
||||
{
|
||||
text = text.replaceAll("\\s+\\r", "");
|
||||
text = text.replaceAll("\\s+\\n", "");
|
||||
text = text.replaceAll("\\r", "");
|
||||
text = text.replaceAll("\\n", "");
|
||||
return text;
|
||||
}
|
||||
|
||||
private void writeToFile(File file, String content, String encoding) throws Exception
|
||||
{
|
||||
try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
|
||||
{
|
||||
ow.append(content);
|
||||
}
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user