REPO-4331: Add remaining core transformers (#45)

* REPO-4331: Add remaining core transformers
* HtmlParserContentTransformer
* AppleIWorksContentTransformer
* StringExtractingContentTransformer
* TextToPdfContentTransformer
* OOXMLThumbnailContentTransformer
This commit is contained in:
eknizat
2019-06-20 12:31:38 +01:00
committed by GitHub
parent f44e675423
commit ff0f659ded
57 changed files with 3109 additions and 257 deletions

View File

@@ -0,0 +1,294 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer;
import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.transform.client.model.TransformRequest;
import org.alfresco.transformer.transformers.AppleIWorksContentTransformer;
import org.alfresco.transformer.transformers.HtmlParserContentTransformer;
import org.alfresco.transformer.transformers.SelectingTransformer;
import org.alfresco.transformer.transformers.StringExtractingContentTransformer;
import org.alfresco.transformer.transformers.TextToPdfContentTransformer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.autoconfigure.web.servlet.WebMvcTest;
import org.springframework.context.annotation.Import;
import org.springframework.mock.web.MockMultipartFile;
import org.springframework.test.context.junit4.SpringRunner;
import org.springframework.test.web.servlet.MvcResult;
import org.springframework.test.web.servlet.request.MockHttpServletRequestBuilder;
import java.io.IOException;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_HTML;
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_IMAGE_JPEG;
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_IWORK_KEYNOTE;
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_IWORK_NUMBERS;
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_IWORK_PAGES;
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_OPENXML_WORDPROCESSING;
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_PDF;
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_TEXT_PLAIN;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.springframework.http.HttpStatus.OK;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.header;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
/**
 * MockMvc tests for {@link MiscController}, with {@link SelectingTransformer}
 * imported into the test context.
 *
 * Unlike the transformers covered by the mocked tests in the super class, the
 * transforms exercised here are real, so the command-mocking hooks are no-ops.
 */
@RunWith(SpringRunner.class)
@WebMvcTest(MiscController.class)
@Import({SelectingTransformer.class})
public class MiscControllerTest extends AbstractTransformerControllerTest
{
    @Autowired
    private MiscController controller;

    // Defaults appended to every request built through the overridden mockMvcRequest().
    private final String sourceEncoding = "UTF-8";
    private final String targetEncoding = "UTF-8";
    private final String targetMimetype = MIMETYPE_TEXT_PLAIN;

    /**
     * Configures the inherited fixture fields for an html to txt transform of the
     * "quick" test files.
     */
    @Before
    public void before() throws Exception
    {
        sourceMimetype = MIMETYPE_HTML;
        sourceExtension = "html";
        targetExtension = "txt";
        expectedOptions = null;
        expectedSourceSuffix = null;
        expectedSourceFileBytes = readTestFile(sourceExtension);
        expectedTargetFileBytes = readTestFile(targetExtension);
        sourceFile = new MockMultipartFile("file", "quick." + sourceExtension, sourceMimetype, expectedSourceFileBytes);
    }

    /**
     * Intentionally empty: there is no external command to mock for these transforms.
     */
    @Override
    protected void mockTransformCommand(String sourceExtension, String targetExtension, String sourceMimetype, boolean readTargetFileBytes) throws IOException
    {
    }

    @Override
    protected AbstractTransformerController getController()
    {
        return controller;
    }

    /**
     * Intentionally empty: no transformer-specific options are added to the request.
     */
    @Override
    protected void updateTransformRequestWithSpecificOptions(TransformRequest transformRequest)
    {
    }

    /**
     * Adds the extra required parameters (encodings and mimetypes) to the request
     * built by the super class.
     */
    @Override
    protected MockHttpServletRequestBuilder mockMvcRequest(String url, MockMultipartFile sourceFile, String... params)
    {
        return super.mockMvcRequest(url, sourceFile, params)
            .param("targetEncoding", targetEncoding)
            .param("sourceEncoding", sourceEncoding)
            .param("targetMimetype", targetMimetype)
            .param("sourceMimetype", sourceMimetype);
    }

    @Test
    @Override
    public void noTargetFileTest()
    {
        // Ignore the test in super class as the Misc transforms are real rather than mocked up.
        // It is the mock that returns a zero length file for other transformers, when we supply an invalid targetExtension.
    }

    /**
     * Sends a small UTF-8 HTML document and checks that the title and paragraph
     * text come back as plain text.
     */
    @Test
    public void testHTMLtoString() throws Exception
    {
        final String NEWLINE = System.getProperty("line.separator");
        final String TITLE = "Testing!";
        final String TEXT_P1 = "This is some text in English";
        final String TEXT_P2 = "This is more text in English";
        final String TEXT_P3 = "C'est en Fran\u00e7ais et Espa\u00f1ol";
        final String partA = "<html><head><title>" + TITLE + "</title></head>" + NEWLINE;
        final String partB = "<body><p>" + TEXT_P1 + "</p>" + NEWLINE +
                             "<p>" + TEXT_P2 + "</p>" + NEWLINE +
                             "<p>" + TEXT_P3 + "</p>" + NEWLINE;
        final String partC = "</body></html>";
        final String expected = TITLE + NEWLINE + TEXT_P1 + NEWLINE + TEXT_P2 + NEWLINE + TEXT_P3 + NEWLINE;

        // Post the HTML itself (not the expected text), encoded with the same
        // charset that is declared as the source encoding. TEXT_P3 is non-ASCII,
        // so relying on the platform default charset would be fragile.
        final String html = partA + partB + partC;
        MvcResult result = sendText("html",
            "UTF-8",
            MIMETYPE_HTML,
            "txt",
            MIMETYPE_TEXT_PLAIN,
            "UTF-8",
            html.getBytes("UTF-8"));

        String contentResult = new String(result.getResponse().getContentAsByteArray(), targetEncoding);
        assertTrue("The content did not include \"" + expected + "\"", contentResult.contains(expected));
    }

    /**
     * Sends UTF-8 bytes declared as "MacDingbat" and checks that the response,
     * decoded as UTF-8, contains the same bytes decoded as "MacDingbat".
     */
    @Test
    public void testStringtoString() throws Exception
    {
        String expected = null;
        byte[] content = null;
        try
        {
            content = "azAz10!<21>$%^&*()\t\r\n".getBytes("UTF-8");
            expected = new String(content, "MacDingbat");
        }
        catch (UnsupportedEncodingException e)
        {
            throw new AlfrescoRuntimeException("Encoding not recognised", e);
        }

        MvcResult result = sendText("txt",
            "MacDingbat",
            MIMETYPE_TEXT_PLAIN,
            "txt",
            MIMETYPE_TEXT_PLAIN,
            "UTF-8",
            content);

        String contentResult = new String(result.getResponse().getContentAsByteArray(), targetEncoding);
        assertTrue("The content did not include \"" + expected + "\"", contentResult.contains(expected));
    }

    /**
     * Transforms a few lines of text to PDF, extracts the text from the PDF again
     * and compares it to the input, ignoring line breaks.
     */
    @Test
    public void textToPdf() throws Exception
    {
        StringBuilder sb = new StringBuilder();
        for (int i = 1; i <= 5; i++)
        {
            sb.append(i);
            sb.append(" I must not talk in class or feed my homework to my cat.\n");
        }
        sb.append("\nBart\n");
        final String text = sb.toString();

        MvcResult result = sendText("txt",
            "UTF-8",
            MIMETYPE_TEXT_PLAIN,
            "pdf",
            MIMETYPE_PDF,
            "UTF-8",
            text.getBytes("UTF-8"));

        // Read back in the PDF and check it; the document is closed even if
        // text extraction throws.
        final String actual;
        try (PDDocument doc = PDDocument.load(result.getResponse().getContentAsByteArray()))
        {
            StringWriter textWriter = new StringWriter();
            new PDFTextStripper().writeText(doc, textWriter);
            actual = clean(textWriter.toString());
        }

        assertEquals("The content did not match.", clean(text), actual);
    }

    @Test
    public void testAppleIWorksPages() throws Exception
    {
        imageBasedTransform("pages", MIMETYPE_IWORK_PAGES, MIMETYPE_IMAGE_JPEG, "jpeg");
    }

    @Test
    public void testAppleIWorksNumbers() throws Exception
    {
        imageBasedTransform("numbers", MIMETYPE_IWORK_NUMBERS, MIMETYPE_IMAGE_JPEG, "jpeg");
    }

    @Test
    public void testAppleIWorksKey() throws Exception
    {
        imageBasedTransform("key", MIMETYPE_IWORK_KEYNOTE, MIMETYPE_IMAGE_JPEG, "jpeg");
    }

    // TODO Doesn't work with java 11, enable when fixed
    // @Test
    public void testOOXML() throws Exception
    {
        imageBasedTransform("docx", MIMETYPE_OPENXML_WORDPROCESSING, MIMETYPE_IMAGE_JPEG, "jpeg");
    }

    /**
     * Posts the test file for {@code sourceExtension} to "/transform" and expects
     * an OK response with non-empty content and a Content-Disposition header
     * naming {@code test_file.<targetExtension>}.
     *
     * The request is built with {@code super.mockMvcRequest} so that the text
     * encoding parameters added by this class's override are not sent.
     *
     * @param sourceExtension extension of the test file to read and upload
     * @param sourceMimetype  mimetype declared for the uploaded file
     * @param targetMimetype  requested target mimetype
     * @param targetExtension requested target extension
     */
    private void imageBasedTransform(String sourceExtension, String sourceMimetype, String targetMimetype, String targetExtension) throws Exception
    {
        MockMultipartFile upload = new MockMultipartFile("file", "test_file." + sourceExtension, sourceMimetype, readTestFile(sourceExtension));

        MockHttpServletRequestBuilder requestBuilder = super.mockMvcRequest("/transform", upload)
            // Use the caller's value instead of a hard-coded "jpeg" so the request
            // and the expected file name below always agree.
            .param("targetExtension", targetExtension)
            .param("targetMimetype", targetMimetype)
            .param("sourceMimetype", sourceMimetype);

        MvcResult result = mockMvc.perform(requestBuilder)
            .andExpect(status().is(OK.value()))
            .andExpect(header().string("Content-Disposition", "attachment; filename*= UTF-8''test_file." + targetExtension))
            .andReturn();

        assertTrue("Expected image content but content is empty.", result.getResponse().getContentLengthLong() > 0L);
    }

    /**
     * Posts {@code content} to "/transform" as {@code test_file.<sourceExtension>}
     * with the given encodings and mimetypes, expecting an OK response whose
     * Content-Disposition header names {@code test_file.<targetExtension>}.
     *
     * @return the MockMvc result, for the caller to inspect the response body
     */
    private MvcResult sendText(String sourceExtension,
                               String sourceEncoding,
                               String sourceMimetype,
                               String targetExtension,
                               String targetMimetype,
                               String targetEncoding,
                               byte[] content) throws Exception
    {
        MockMultipartFile upload = new MockMultipartFile("file", "test_file." + sourceExtension, sourceMimetype, content);

        MockHttpServletRequestBuilder requestBuilder = super.mockMvcRequest("/transform", upload)
            .param("targetExtension", targetExtension)
            .param("targetEncoding", targetEncoding)
            .param("targetMimetype", targetMimetype)
            .param("sourceEncoding", sourceEncoding)
            .param("sourceMimetype", sourceMimetype);

        return mockMvc.perform(requestBuilder)
            .andExpect(status().is(OK.value()))
            .andExpect(header().string("Content-Disposition", "attachment; filename*= " + targetEncoding + "''test_file." + targetExtension))
            .andReturn();
    }

    /**
     * Strips line breaks (and any whitespace run immediately preceding one) so
     * that text can be compared independently of how lines were wrapped.
     */
    private String clean(String text)
    {
        text = text.replaceAll("\\s+\\r", "");
        text = text.replaceAll("\\s+\\n", "");
        text = text.replaceAll("\\r", "");
        text = text.replaceAll("\\n", "");
        return text;
    }
}

View File

@@ -0,0 +1,56 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer;
import org.alfresco.transform.client.model.TransformRequest;
import org.junit.runner.RunWith;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringRunner;
import java.util.UUID;
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_HTML;
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_TEXT_PLAIN;
/**
 * Queue transform service integration test that supplies an HTML to plain text
 * request to the checks inherited from {@link AbstractQueueTransformServiceIT}.
 */
@RunWith(SpringRunner.class)
@SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
public class MiscQueueTransformServiceIT extends AbstractQueueTransformServiceIT
{
    /**
     * Builds a request with freshly generated random request id and source
     * reference, asking for an HTML to "txt" plain text transform.
     */
    @Override
    protected TransformRequest buildRequest()
    {
        final String requestId = UUID.randomUUID().toString();
        final String sourceReference = UUID.randomUUID().toString();

        return TransformRequest.builder()
                               .withRequestId(requestId)
                               .withSourceMediaType(MIMETYPE_HTML)
                               .withTargetMediaType(MIMETYPE_TEXT_PLAIN)
                               .withTargetExtension("txt")
                               .withSchema(1)
                               .withClientData("ACS")
                               .withSourceReference(sourceReference)
                               .withSourceSize(32L)
                               .build();
    }
}

View File

@@ -0,0 +1,48 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer;
import org.junit.runner.RunWith;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringRunner;
/**
 * Supplies the transformer name and source extension used by the HTTP request
 * checks inherited from {@link AbstractHttpRequestTest}.
 */
@RunWith(SpringRunner.class)
@SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
public class MiscTransformerHttpRequestTest extends AbstractHttpRequestTest
{
    private static final String TRANSFORMER_NAME = "Miscellaneous Transformers";
    private static final String SOURCE_EXTENSION = "html";

    /** @return the transformer name handed to the inherited tests */
    @Override
    protected String getTransformerName()
    {
        return TRANSFORMER_NAME;
    }

    /** @return the source file extension handed to the inherited tests */
    @Override
    protected String getSourceExtension()
    {
        return SOURCE_EXTENSION;
    }
}

View File

@@ -0,0 +1,173 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.transformers;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.mockito.Mock;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Import;
import org.springframework.test.context.junit4.SpringRunner;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.nio.file.Files;
import java.util.HashMap;
import java.util.Map;
import static org.alfresco.transformer.transformers.StringExtractingContentTransformer.SOURCE_ENCODING;
import static org.alfresco.transformer.transformers.StringExtractingContentTransformer.TARGET_ENCODING;
import static org.junit.Assert.*;
/**
 * Tests for {@link HtmlParserContentTransformer} working directly on temp files.
 */
@RunWith(SpringRunner.class)
@Import(HtmlParserContentTransformer.class)
public class HtmlParserContentTransformerTest
{
    @Autowired
    HtmlParserContentTransformer transformer;

    /**
     * Checks that we correctly handle text in different encodings,
     * no matter if the encoding is specified on the Content Property
     * or in a meta tag within the HTML itself. (ALF-10466)
     *
     * On Windows, org.htmlparser.beans.StringBean.carriageReturn() appends a new system dependent new line
     * so we must be careful when checking the returned text
     */
    @Test
    public void testEncodingHandling() throws Exception
    {
        final String NEWLINE = System.getProperty("line.separator");
        final String TITLE = "Testing!";
        final String TEXT_P1 = "This is some text in English";
        final String TEXT_P2 = "This is more text in English";
        final String TEXT_P3 = "C'est en Fran\u00e7ais et Espa\u00f1ol";
        final String partA = "<html><head><title>" + TITLE + "</title></head>" + NEWLINE;
        final String partB = "<body><p>" + TEXT_P1 + "</p>" + NEWLINE +
                             "<p>" + TEXT_P2 + "</p>" + NEWLINE +
                             "<p>" + TEXT_P3 + "</p>" + NEWLINE;
        final String partC = "</body></html>";
        final String expected = TITLE + NEWLINE + TEXT_P1 + NEWLINE + TEXT_P2 + NEWLINE + TEXT_P3 + NEWLINE;
        final String html = partA + partB + partC;

        // Content set to ISO 8859-1
        assertTransformsTo(expected, html, "ISO-8859-1", "ISO-8859-1");

        // Content set to UTF-8
        assertTransformsTo(expected, html, "UTF-8", "UTF-8");

        // Content set to UTF-16
        assertTransformsTo(expected, html, "UTF-16", "UTF-16");

        // Note - since HTML Parser 2.0 META tags specifying the
        // document encoding will ONLY be respected if the original
        // content type was set to ISO-8859-1.
        //
        // This means there is now only one test which we can perform
        // to ensure that this now-limited overriding of the encoding
        // takes effect.

        // Content set to ISO 8859-1, meta set to UTF-8
        final String htmlWithMeta = partA +
            "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">" +
            partB + partC;
        assertTransformsTo(expected, htmlWithMeta, "UTF-8", "ISO-8859-1");

        // Note - we can't test UTF-16 with only a meta encoding,
        // because without that the parser won't know about the
        // 2 byte format so won't be able to identify the meta tag
    }

    /**
     * Writes {@code html} to a temp file using {@code fileEncoding}, transforms it
     * with {@code declaredEncoding} passed as the source encoding parameter, and
     * asserts that the target file, read back as UTF-8, equals {@code expected}.
     * Both temp files are removed whether or not the check passes.
     *
     * @param expected         text the target file must contain
     * @param html             HTML document to transform
     * @param fileEncoding     charset actually used to write the source file
     * @param declaredEncoding charset reported to the transformer
     */
    private void assertTransformsTo(String expected, String html, String fileEncoding, String declaredEncoding) throws Exception
    {
        File source = null;
        File target = null;
        try
        {
            source = File.createTempFile("AlfrescoTestSource_", ".html");
            writeToFile(source, html, fileEncoding);

            target = File.createTempFile("AlfrescoTestTarget_", ".txt");

            Map<String, String> parameters = new HashMap<>();
            parameters.put(SOURCE_ENCODING, declaredEncoding);

            transformer.transform(source, target, parameters);

            assertEquals(expected, readFromFile(target, "UTF-8"));
        }
        finally
        {
            if (source != null && source.exists())
            {
                source.delete();
            }
            if (target != null && target.exists())
            {
                target.delete();
            }
        }
    }

    /**
     * Writes {@code content} to {@code file} in the given encoding, replacing any
     * existing content.
     */
    private void writeToFile(File file, String content, String encoding) throws Exception
    {
        try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
        {
            ow.append(content);
        }
    }

    /**
     * Reads the whole of {@code file} and decodes it with the given encoding.
     */
    private String readFromFile(File file, String encoding) throws Exception
    {
        return new String(Files.readAllBytes(file.toPath()), encoding);
    }
}

View File

@@ -0,0 +1,157 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.transformers;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Import;
import org.springframework.test.context.junit4.SpringRunner;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.util.HashMap;
import java.util.Map;
import static org.alfresco.transformer.transformers.TextToPdfContentTransformer.PAGE_LIMIT;
import static org.junit.Assert.*;
/**
 * Tests for {@link TextToPdfContentTransformer}: text is transformed to PDF with
 * various page limits and the text extracted from the PDF is compared to the
 * expected portion of the input.
 */
@RunWith(SpringRunner.class)
@Import(TextToPdfContentTransformer.class)
public class TextToPdfContentTransformerTest
{
    @Autowired
    TextToPdfContentTransformer transformer;

    /**
     * Fixes the font and size before each test; the lines-per-page figure used in
     * {@link #transformTextAndCheckPageLength(int)} presumably depends on these
     * settings - confirm before changing either.
     */
    @Before
    public void setUp()
    {
        transformer.setStandardFont("Times-Roman");
        transformer.setFontSize(20);
    }

    @Test
    public void testUnlimitedPages() throws Exception
    {
        transformTextAndCheckPageLength(-1);
    }

    @Test
    public void testLimitedTo1Page() throws Exception
    {
        transformTextAndCheckPageLength(1);
    }

    @Test
    public void testLimitedTo2Pages() throws Exception
    {
        transformTextAndCheckPageLength(2);
    }

    @Test
    public void testLimitedTo50Pages() throws Exception
    {
        transformTextAndCheckPageLength(50);
    }

    /**
     * Generates more lines than fit in {@code pageLimit} pages (assuming 32 lines
     * per page) and checks that the PDF contains exactly the lines up to the limit.
     *
     * @param pageLimit maximum number of pages; for values that are not positive
     *                  the cutoff is never reached in the loop, so the whole text
     *                  is expected back
     */
    private void transformTextAndCheckPageLength(int pageLimit) throws Exception
    {
        int pageLength = 32;
        int lines = (pageLength + 10) * ((pageLimit > 0) ? pageLimit : 1);
        StringBuilder sb = new StringBuilder();
        String checkText = null;
        int cutoff = pageLimit * pageLength;
        for (int i = 1; i <= lines; i++)
        {
            sb.append(i);
            sb.append(" I must not talk in class or feed my homework to my cat.\n");
            if (i == cutoff)
            {
                // Snapshot of the text that should survive the page limit.
                checkText = sb.toString();
            }
        }
        sb.append("\nBart\n");
        String text = sb.toString();

        checkText = (checkText == null) ? clean(text) : clean(checkText);

        transformTextAndCheck(text, "UTF-8", checkText, String.valueOf(pageLimit));
    }

    /**
     * Writes {@code text} to a temp file, transforms it to a temp PDF with the
     * given page limit, and asserts that the text extracted from the PDF (with
     * line breaks removed) equals {@code checkText}. The temp files are deleted
     * and the PDF document closed even when the transform or the assertion fails.
     *
     * @param text      source text to transform
     * @param encoding  charset used to write the source file
     * @param checkText expected extracted text, already passed through {@link #clean(String)}
     * @param pageLimit value passed to the transformer as the page limit parameter
     */
    private void transformTextAndCheck(String text, String encoding, String checkText, String pageLimit) throws Exception
    {
        File sourceFile = null;
        File targetFile = null;
        try
        {
            // Get a reader for the text
            sourceFile = File.createTempFile("AlfrescoTestSource_", ".txt");
            writeToFile(sourceFile, text, encoding);

            // And a temp writer
            targetFile = File.createTempFile("AlfrescoTestTarget_", ".pdf");

            // Transform to PDF
            Map<String, String> parameters = new HashMap<>();
            parameters.put(PAGE_LIMIT, pageLimit);
            transformer.transform(sourceFile, targetFile, parameters);

            // Read back in the PDF and check it
            final String roundTrip;
            try (PDDocument doc = PDDocument.load(targetFile))
            {
                StringWriter textWriter = new StringWriter();
                new PDFTextStripper().writeText(doc, textWriter);
                roundTrip = clean(textWriter.toString());
            }

            assertEquals(
                "Incorrect text in PDF when starting from text in " + encoding,
                checkText, roundTrip
            );
        }
        finally
        {
            if (sourceFile != null)
            {
                sourceFile.delete();
            }
            if (targetFile != null)
            {
                targetFile.delete();
            }
        }
    }

    /**
     * Strips line breaks (and any whitespace run immediately preceding one) so
     * that text can be compared independently of how lines were wrapped.
     */
    private String clean(String text)
    {
        text = text.replaceAll("\\s+\\r", "");
        text = text.replaceAll("\\s+\\n", "");
        text = text.replaceAll("\\r", "");
        text = text.replaceAll("\\n", "");
        return text;
    }

    /**
     * Writes {@code content} to {@code file} in the given encoding, replacing any
     * existing content.
     */
    private void writeToFile(File file, String content, String encoding) throws Exception
    {
        try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
        {
            ow.append(content);
        }
    }
}

View File

@@ -0,0 +1,18 @@
The quick brown fox jumps over the lazy dog
The quick brown fox jumps over the lazy dog