diff --git a/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/java/org/alfresco/transformer/AIOTransformRegistryTest.java b/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/java/org/alfresco/transformer/AIOTransformRegistryTest.java index 820d822b..8fa9a309 100644 --- a/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/java/org/alfresco/transformer/AIOTransformRegistryTest.java +++ b/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/java/org/alfresco/transformer/AIOTransformRegistryTest.java @@ -2,7 +2,7 @@ * #%L * Alfresco Transform Core * %% - * Copyright (C) 2005 - 2020 Alfresco Software Limited + * Copyright (C) 2005 - 2021 Alfresco Software Limited * %% * This file is part of the Alfresco software. * - @@ -280,7 +280,7 @@ public class AIOTransformRegistryTest int cutoff = pageLimit * pageLength; for (int i = 1; i <= lines; i++) { - sb.append(i); + sb.append(Integer.toString(i)); sb.append(" I must not talk in class or feed my homework to my cat.\n"); if (i == cutoff) checkText = sb.toString(); diff --git a/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/java/org/alfresco/transformer/MiscControllerTest.java b/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/java/org/alfresco/transformer/MiscControllerTest.java index 92cce075..89580f20 100644 --- a/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/java/org/alfresco/transformer/MiscControllerTest.java +++ b/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/java/org/alfresco/transformer/MiscControllerTest.java @@ -408,7 +408,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest StringBuilder sb = new StringBuilder(); for (int i = 1; i <= 5; i++) { - sb.append(i); + sb.append(Integer.toString(i)); sb.append(" I must not talk in class or feed my homework to my cat.\n"); } sb.append("\nBart\n"); diff --git 
a/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/transformers/TextToPdfContentTransformer.java b/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/transformers/TextToPdfContentTransformer.java index afdbb413..1cf823bd 100644 --- a/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/transformers/TextToPdfContentTransformer.java +++ b/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/transformers/TextToPdfContentTransformer.java @@ -2,7 +2,7 @@ * #%L * Alfresco Transform Core * %% - * Copyright (C) 2005 - 2020 Alfresco Software Limited + * Copyright (C) 2005 - 2021 Alfresco Software Limited * %% * This file is part of the Alfresco software. * - @@ -44,6 +44,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; +import java.io.PushbackInputStream; import java.io.Reader; import java.nio.charset.Charset; import java.util.HashMap; @@ -63,6 +64,10 @@ public class TextToPdfContentTransformer implements SelectableTransformer { private static final Logger logger = LoggerFactory.getLogger(TextToPdfContentTransformer.class); + private static final int UTF16_READ_AHEAD_BYTES = 16; // 8 characters including BOM if it exists + private static final byte FE = (byte) 0xFE; + private static final byte FF = (byte) 0xFF; + public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT; private final PagedTextToPDF transformer; @@ -146,7 +151,129 @@ public class TextToPdfContentTransformer implements SelectableTransformer } if (charset != null) { - logger.debug("Processing plain text in encoding " + charset.displayName()); + // Handles the situation where there is a BOM even though the encoding indicates that normally + // there should not be one for UTF-16BE and UTF-16LE. For extra flexibility includes UTF-16 too + // which optionally has the BOM. 
Rather than look at the BOM we look at the number of zero bytes + // in the first few characters. XML files even when not in European languages tend to have more + // even zero bytes when big-endian encoded and more odd zero bytes when little-endian. + // Think of the XML declaration at the start of the file. The normal Java decoder does not have this flexibility but + // other transformers do. + String name = charset.displayName(); + if ("UTF-16".equals(name) || "UTF-16BE".equals(name) || "UTF-16LE".equals(name)) + { + logger.debug("Handle big and little endian UTF-16 text. Using UTF-16 rather than encoding " + name); + charset = Charset.forName("UTF-16"); + is = new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES) + { + boolean bomRead; + boolean switchByteOrder; + boolean evenByte = true; + + @Override + public int read(byte[] bytes, int off, int len) throws IOException + { + int i = 0; + int b = 0; + for (; i oddZeros) + { + if (bytes[0] == FF && bytes[1] == FE) + { + switchByteOrder = true; + switchBom = true; + logger.warn("Little-endian BOM FFFE read, but characters are big-endian"); + } + else + { + logger.debug("More even zero bytes, so normal read for big-endian"); + } + } + else + { + if (bytes[0] == FE && bytes[1] == FF) + { + switchBom = true; + logger.debug("Big-endian BOM FEFF read, but characters are little-endian"); + } + else + { + switchByteOrder = true; + logger.debug("More odd zero bytes, so switch bytes from little-endian"); + } + } + + if (switchBom) + { + byte b = bytes[0]; + bytes[0] = bytes[1]; + bytes[1] = b; + } + + for (int i = end-1; i>=0; i--) + { + unread(bytes[i]); + } + } + + if (switchByteOrder) + { + if (evenByte) + { + int b1 = super.read(); + int b2 = super.read(); + if (b1 != -1) + { + unread(b1); + } + if (b2 != -1) + { + unread(b2); + } + } + evenByte = !evenByte; + } + + return super.read(); + } + + // Counts the number of even or odd 00 bytes + private int countZeros(byte[] b, int offset) + { + int count = 0; + for (int i=offset; i 0 && (pageCount++ >= pageLimit)) { 
-// pageLimits.getAction().throwIOExceptionIfRequired("Page limit ("+pageLimit+ -// ") reached.", transformerDebug); break outer; } @@ -272,7 +395,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer y = page.getMediaBox().getHeight() - margin + height; contentStream.moveTextPositionByAmount(margin, y); } - //System.out.println( "Drawing string at " + x + "," + y ); if (contentStream == null) { diff --git a/alfresco-transform-misc/alfresco-transform-misc/src/test/java/org/alfresco/transformer/transformers/TextToPdfContentTransformerTest.java b/alfresco-transform-misc/alfresco-transform-misc/src/test/java/org/alfresco/transformer/transformers/TextToPdfContentTransformerTest.java index 7de1e75a..5fa08d79 100644 --- a/alfresco-transform-misc/alfresco-transform-misc/src/test/java/org/alfresco/transformer/transformers/TextToPdfContentTransformerTest.java +++ b/alfresco-transform-misc/alfresco-transform-misc/src/test/java/org/alfresco/transformer/transformers/TextToPdfContentTransformerTest.java @@ -2,7 +2,7 @@ * #%L * Alfresco Transform Core * %% - * Copyright (C) 2005 - 2020 Alfresco Software Limited + * Copyright (C) 2005 - 2021 Alfresco Software Limited * %% * This file is part of the Alfresco software. 
* - @@ -31,14 +31,20 @@ import org.apache.pdfbox.text.PDFTextStripper; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; import java.io.File; +import java.io.FileInputStream; import java.io.FileOutputStream; +import java.io.InputStream; +import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.StringWriter; import java.util.HashMap; import java.util.Map; import static org.alfresco.transformer.util.RequestParamMap.PAGE_LIMIT; +import static org.alfresco.transformer.util.RequestParamMap.SOURCE_ENCODING; import static org.junit.jupiter.api.Assertions.assertEquals; public class TextToPdfContentTransformerTest @@ -76,39 +82,125 @@ public class TextToPdfContentTransformerTest transformTextAndCheckPageLength(50); } - private void transformTextAndCheckPageLength(int pageLimit) throws Exception + @Test + public void test1UTF16BigEndianBomBigEndianChars() throws Exception + { + // 1. BOM indicates BE (fe then ff) + chars appear to be BE (as first byte read tends to be a zero) + // Expected with UTF-16. Some systems use BE and others, like Windows and Mac, use LE + String expectedByteOrder = "fe ff 00 31 00 20 00 49"; + transformTextAndCheck("UTF-16", true, true, expectedByteOrder); + transformTextAndCheck("UTF-16", true, true, expectedByteOrder); + transformTextAndCheck("UTF-16BE", true, true, expectedByteOrder); + transformTextAndCheck("UTF-16LE", true, true, expectedByteOrder); + } + + @Test + public void test2UTF16LittleEndianBomLittleEndianChars() throws Exception + { + // 2. BOM indicates LE (ff then fe) + chars appear to be LE (as second byte read tends to be a zero) + // Expected with UTF-16. Some systems use BE and others, like Windows and Mac, use LE + transformTextAndCheck("UTF-16", false, true, "ff fe 31 00 20 00 49 00"); + } + + @Test + public void test3UTF16NoBomBigEndianChars() throws Exception + { + // 3. 
No BOM + chars appear to be BE (as first byte read tends to be a zero) + // Expected with UTF-16BE + transformTextAndCheck("UTF-16", true, null, "00 31 00 20 00 49"); + } + + @Test + public void test4UTF16NoBomLittleEndianChars() throws Exception + { + // 4. No BOM + chars appear to be LE (as second byte read tends to be a zero) + // Expected with UTF-16LE + transformTextAndCheck("UTF-16", false, null, "31 00 20 00 49 00"); + } + + @Test + public void test5UTF16BigEndianBomLittleEndianChars() throws Exception + { + // 5. BOM indicates BE (fe then ff) + chars appear to be LE (as second byte read tends to be a zero) + // SOMETHING IS WRONG, BUT USE LE!!!! + transformTextAndCheck("UTF-16", false, false, "fe ff 31 00 20 00 49 00"); + } + + @Test + public void test6UTF16LittleEndianBomBigEndianChars() throws Exception + { + // 6. BOM indicates LE (ff then fe) + chars appear to be BE (as first byte read tends to be a zero) + // SOMETHING IS WRONG, BUT USE BE!!!! + transformTextAndCheck("UTF-16", true, false, "ff fe 00 31 00 20 00 49"); + } + + /** + * @param encoding to be used to read the source file + * @param bigEndian indicates that the file should contain big endian characters, so typically the first byte of + * each char is a zero when using English. + * @param validBom if not null, the BOM is included. If true it is the one matching bigEndian. If false it is the + * opposite byte order, which really is an error, but we try to recover from it. + * @param expectedByteOrder The first few bytes of the source file so we can check the test data has been + * correctly created. 
+ */ + protected void transformTextAndCheck(String encoding, boolean bigEndian, Boolean validBom, + String expectedByteOrder) throws Exception + { + transformTextAndCheckImpl(-1, encoding, bigEndian, validBom, expectedByteOrder); + } + + protected void transformTextAndCheckPageLength(int pageLimit) throws Exception + { + transformTextAndCheckImpl(pageLimit, "UTF-8", null, null, null); + } + + private void transformTextAndCheckImpl(int pageLimit, String encoding, Boolean bigEndian, Boolean validBom, + String expectedByteOrder) throws Exception + { + StringBuilder sb = new StringBuilder(); + String checkText = createTestText(pageLimit, sb); + String text = sb.toString(); + + File sourceFile = File.createTempFile("AlfrescoTestSource_", ".txt"); + writeToFile(sourceFile, text, encoding, bigEndian, validBom); + checkFileBytes(sourceFile, expectedByteOrder); + + transformTextAndCheck(sourceFile, encoding, checkText, String.valueOf(pageLimit)); + } + + private String createTestText(int pageLimit, StringBuilder sb) { int pageLength = 32; int lines = (pageLength + 10) * ((pageLimit > 0) ? pageLimit : 1); - StringBuilder sb = new StringBuilder(); String checkText = null; int cutoff = pageLimit * pageLength; for (int i = 1; i <= lines; i++) { - sb.append(i); + sb.append(Integer.toString(i)); sb.append(" I must not talk in class or feed my homework to my cat.\n"); if (i == cutoff) + { checkText = sb.toString(); + } } sb.append("\nBart\n"); + String text = sb.toString(); - checkText = (checkText == null) ? clean(text) : clean(checkText); - transformTextAndCheck(text, "UTF-8", checkText, String.valueOf(pageLimit)); + checkText = checkText == null ? 
clean(text) : clean(checkText); + + return checkText; } - private void transformTextAndCheck(String text, String encoding, String checkText, + private void transformTextAndCheck(File sourceFile, String encoding, String checkText, String pageLimit) throws Exception { - // Get a reader for the text - File sourceFile = File.createTempFile("AlfrescoTestSource_", ".txt"); - writeToFile(sourceFile, text, encoding); - // And a temp writer File targetFile = File.createTempFile("AlfrescoTestTarget_", ".pdf"); // Transform to PDF Map parameters = new HashMap<>(); parameters.put(PAGE_LIMIT, pageLimit); + parameters.put(SOURCE_ENCODING, encoding); transformer.transform("text/plain", "application/pdf", parameters, sourceFile, targetFile); // Read back in the PDF and check it @@ -138,11 +230,123 @@ public class TextToPdfContentTransformerTest return text; } - private void writeToFile(File file, String content, String encoding) throws Exception + private void writeToFile(File file, String content, String encoding, Boolean bigEndian, Boolean validBom) throws Exception { + // If we may have to change the endian or include/exclude the BOM, write initially to a tmp file using + // UTF-16 which includes the BOM FEFF. 
+ File originalFile = file; + if (bigEndian != null) + { + file = File.createTempFile("AlfrescoTestTmpSrc_", ".txt"); + encoding = "UTF-16"; + } + + // Use a writer to use the required encoding try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding)) { ow.append(content); } + + // If we may have to change the endianness or include/exclude the BOM, copy the raw bytes to the supplied file + if (bigEndian != null) + { + boolean firstRead = true; + byte[] bytes = new byte[8192]; + try (InputStream is = new BufferedInputStream(new FileInputStream(file)); + OutputStream os = new BufferedOutputStream(new FileOutputStream(originalFile))) + { + int l; + int off; + boolean switchBytes = false; + do + { + l = is.read(bytes); + off = 0; + // When we read the first block, change the offset if we don't want the BOM and also work out + // if the byte order needs to be switched. The source bytes always start with a standard BOM. + if (firstRead) + { + firstRead = false; + boolean actualEndianBytes = bytes[0] == (byte)0xfe; // if true [1] would also be 0xff + switchBytes = actualEndianBytes != bigEndian; + if (validBom == null) + { + // Strip the BOM + off = 2; + } + else if (!validBom) + { + // Reverse the BOM so it does not match the characters! + byte aByte = bytes[0]; + bytes[0] = bytes[1]; + bytes[1] = aByte; + } + } + int len = l - off; + if (len > 0) + { + if (switchBytes) + { + // Reverse the byte order of characters including the BOM. + for (int i=0; i 0) + { + sb.append(' '); + } + sb.append(Character.forDigit((bytes[i] >> 4) & 0xF, 16)); + sb.append(Character.forDigit((bytes[i] & 0xF), 16)); + } + return sb.toString(); } }