MNT-20626 XML files having UTF-16LE and UTF-16BE can't be previewed (#331)

* Introduce more flexible reading of UTF-16 data, where there may be a BOM, but the spec says there should not be one, or the BOM is clearly wrong when looking at the following characters. The https://en.wikipedia.org/wiki/UTF-16 write-up is nice and clear. * Includes identical correction in data setup in AIOTransformRegistryTest and MiscControllerTest for a problem found in TextToPdfContentTransformerTest. * Includes upgrade to latest pdfbox: 2.0.22
2025-09-17 14:21:18 +00:00 · 2021-02-09 19:04:34 +00:00
parent 97b9fc39cf
commit 2766c23431
4 changed files with 349 additions and 23 deletions
--- a/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/java/org/alfresco/transformer/AIOTransformRegistryTest.java
+++ b/alfresco-transform-core-aio/alfresco-transform-core-aio/src/test/java/org/alfresco/transformer/AIOTransformRegistryTest.java
@@ -2,7 +2,7 @@
 * #%L
 * Alfresco Transform Core
 * %%
- * Copyright (C) 2005 - 2020 Alfresco Software Limited
+ * Copyright (C) 2005 - 2021 Alfresco Software Limited
 * %%
 * This file is part of the Alfresco software.
 * -
@@ -280,7 +280,7 @@ public class AIOTransformRegistryTest
        int cutoff = pageLimit * pageLength;
        for (int i = 1; i <= lines; i++)
        {
-            sb.append(i);
+            sb.append(Integer.toString(i));
            sb.append(" I must not talk in class or feed my homework to my cat.\n");
            if (i == cutoff)
                checkText = sb.toString();
--- a/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/java/org/alfresco/transformer/MiscControllerTest.java
+++ b/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/java/org/alfresco/transformer/MiscControllerTest.java
@@ -408,7 +408,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
        StringBuilder sb = new StringBuilder();
        for (int i = 1; i <= 5; i++)
        {
-            sb.append(i);
+            sb.append(Integer.toString(i));
            sb.append(" I must not talk in class or feed my homework to my cat.\n");
        }
        sb.append("\nBart\n");
--- a/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/transformers/TextToPdfContentTransformer.java
+++ b/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/transformers/TextToPdfContentTransformer.java
@@ -2,7 +2,7 @@
 * #%L
 * Alfresco Transform Core
 * %%
- * Copyright (C) 2005 - 2020 Alfresco Software Limited
+ * Copyright (C) 2005 - 2021 Alfresco Software Limited
 * %%
 * This file is part of the Alfresco software.
 * -
@@ -44,6 +44,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.OutputStream;
 import java.io.PushbackInputStream;
 import java.io.Reader;
 import java.nio.charset.Charset;
 import java.util.HashMap;
@@ -63,6 +64,10 @@ public class TextToPdfContentTransformer implements SelectableTransformer
 {
    private static final Logger logger = LoggerFactory.getLogger(TextToPdfContentTransformer.class);
    private static final int UTF16_READ_AHEAD_BYTES = 16; // 8 characters including BOM if it exists
    private static final byte FE = (byte) 0xFE;
    private static final byte FF = (byte) 0xFF;
    public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT;
    private final PagedTextToPDF transformer;
@@ -146,7 +151,129 @@ public class TextToPdfContentTransformer implements SelectableTransformer
            }
            if (charset != null)
            {
-                logger.debug("Processing plain text in encoding " + charset.displayName());
+                // Handles the situation where there is a BOM even though the encoding indicates that normally
                // there should not be one for UTF-16BE and UTF-16LE. For extra flexibility includes UTF-16 too
                // which optionally has the BOM. Rather than look at the BOM we look at the number of zero bytes
                // in the first few characters. XML files even when not in European languages tend to have more
                // even zero bytes when big-endian encoded and more odd zero bytes when little-endian.
                // Think of: <?xml version="1.0"?> The normal Java decoder does not have this flexibility but
                // other transformers do.
                String name = charset.displayName();
                if ("UTF-16".equals(name) || "UTF-16BE".equals(name) || "UTF-16LE".equals(name))
                {
                    logger.debug("Handle big and little endian UTF-16 text. Using UTF-16 rather than encoding " + name);
                    charset = Charset.forName("UTF-16");
                    is = new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES)
                    {
                        boolean bomRead;
                        boolean switchByteOrder;
                        boolean evenByte = true;
                        /**
                         * Bulk read built on the single-byte {@link #read()}, so every byte handed to the caller
                         * has gone through the BOM correction and byte-order switching performed there.
                         *
                         * @param bytes destination buffer
                         * @param off   index in {@code bytes} of the first byte to store
                         * @param len   maximum number of bytes to store
                         * @return the number of bytes stored, or -1 if the end of the stream was reached before
                         *         any byte could be read
                         */
                        @Override
                        public int read(byte[] bytes, int off, int len) throws IOException
                        {
                            int copied = 0;
                            while (copied < len)
                            {
                                int next = read();
                                if (next == -1)
                                {
                                    // End of stream: report -1 only when nothing at all was delivered.
                                    return copied == 0 ? -1 : copied;
                                }
                                bytes[off + copied] = (byte) next;
                                copied++;
                            }
                            return copied;
                        }
                        /**
                         * Returns the next byte of the stream. The first call looks ahead at up to
                         * {@code UTF16_READ_AHEAD_BYTES} bytes, decides from the position of the zero bytes
                         * whether the characters are big- or little-endian, and pushes the bytes back (with the
                         * BOM reversed if it contradicts the characters). Later calls optionally swap each pair
                         * of bytes so that the data always agrees with its BOM, or is big-endian when there is
                         * no BOM.
                         *
                         * @return the next byte as a value in the range 0-255, or -1 at the end of the stream
                         */
                        @Override
                        public int read() throws IOException
                        {
                            if (!bomRead)
                            {
                                bomRead = true;
                                boolean switchBom = false;
                                byte[] bytes = new byte[UTF16_READ_AHEAD_BYTES];

                                // A single read may return fewer bytes than asked for, so keep going until
                                // the look-ahead buffer is full or the underlying stream has ended.
                                int end = 0;
                                while (end < UTF16_READ_AHEAD_BYTES)
                                {
                                    int count = in.read(bytes, end, UTF16_READ_AHEAD_BYTES - end);
                                    if (count <= 0)
                                    {
                                        break;
                                    }
                                    end += count;
                                }

                                boolean bigEndianBom = end >= 2 && bytes[0] == FE && bytes[1] == FF;
                                boolean littleEndianBom = end >= 2 && bytes[0] == FF && bytes[1] == FE;
                                int evenZeros = countZeros(bytes, 0);
                                int oddZeros = countZeros(bytes, 1);
                                if (evenZeros > oddZeros)
                                {
                                    if (littleEndianBom)
                                    {
                                        // Swapping the BOM here and then every pair (BOM included) below
                                        // yields little-endian data that matches its FFFE BOM.
                                        switchByteOrder = true;
                                        switchBom = true;
                                        logger.warn("Little-endian BOM FFFE read, but characters are big-endian");
                                    }
                                    else
                                    {
                                        logger.debug("More even zero bytes, so normal read for big-endian");
                                    }
                                }
                                else if (evenZeros == oddZeros && (bigEndianBom || littleEndianBom))
                                {
                                    // Equal counts (for example text with no zero bytes at all) say nothing
                                    // about the byte order, so a BOM that is present is taken at its word
                                    // instead of being overruled.
                                    logger.debug("Zero bytes give no hint of the byte order, so trusting the BOM");
                                }
                                else
                                {
                                    if (bigEndianBom)
                                    {
                                        switchBom = true;
                                        logger.warn("Big-endian BOM FEFF read, but characters are little-endian");
                                    }
                                    else
                                    {
                                        // Also reached for a tie without any BOM, which keeps being treated
                                        // as little-endian.
                                        switchByteOrder = true;
                                        logger.debug("More odd zero bytes, so switch bytes from little-endian");
                                    }
                                }

                                if (switchBom)
                                {
                                    byte b = bytes[0];
                                    bytes[0] = bytes[1];
                                    bytes[1] = b;
                                }

                                // Push back in reverse so the bytes are read again in their original order.
                                for (int i = end-1; i>=0; i--)
                                {
                                    unread(bytes[i]);
                                }
                            }

                            if (switchByteOrder)
                            {
                                if (evenByte)
                                {
                                    // Take the next pair and push it back first-byte-first, which leaves the
                                    // second byte on top: the two following reads return them swapped.
                                    int b1 = super.read();
                                    int b2 = super.read();
                                    if (b1 != -1)
                                    {
                                        unread(b1);
                                    }
                                    if (b2 != -1)
                                    {
                                        unread(b2);
                                    }
                                }
                                evenByte = !evenByte;
                            }

                            return super.read();
                        }
                        // Counts the 00 bytes found at every second index of the read-ahead buffer, starting at
                        // offset: pass 0 to inspect the even positions and 1 to inspect the odd ones.
                        private int countZeros(byte[] b, int offset)
                        {
                            int zeros = 0;
                            int index = offset;
                            while (index < UTF16_READ_AHEAD_BYTES)
                            {
                                zeros += (b[index] == 0) ? 1 : 0;
                                index += 2;
                            }
                            return zeros;
                        }
                    };
                }
                logger.debug("Processing plain text in encoding " + name);
                return new InputStreamReader(is, charset);
            }
        }
@@ -196,7 +323,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
        public PDDocument createPDFFromText(Reader text, int pageLimit)
            throws IOException
        {
            //int pageLimit = (int)pageLimits.getValue();
            PDDocument doc = null;
            int pageCount = 0;
            try
@@ -207,7 +333,7 @@ public class TextToPdfContentTransformer implements SelectableTransformer
                //calculate font height and increase by 5 percent.
                height = height * getFontSize() * 1.05f;
                doc = new PDDocument();
-                BufferedReader data = new BufferedReader(text);
+                BufferedReader data = (text instanceof BufferedReader) ? (BufferedReader) text : new BufferedReader(text);
                String nextLine;
                PDPage page = new PDPage();
                PDPageContentStream contentStream = null;
@@ -220,7 +346,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
                outer:
                while ((nextLine = data.readLine()) != null)
                {
                    // The input text is nonEmpty. New pages will be created and added
                    // to the PDF document as they are needed, depending on the length of
                    // the text.
@@ -252,8 +377,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
                            int test = pageCount + 1;
                            if (pageLimit > 0 && (pageCount++ >= pageLimit))
                            {
 //                                pageLimits.getAction().throwIOExceptionIfRequired("Page limit ("+pageLimit+
 //                                        ") reached.", transformerDebug);
                                break outer;
                            }
@@ -272,7 +395,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
                            y = page.getMediaBox().getHeight() - margin + height;
                            contentStream.moveTextPositionByAmount(margin, y);
                        }
                        //System.out.println( "Drawing string at " + x + "," + y );
                        if (contentStream == null)
                        {
--- a/alfresco-transform-misc/alfresco-transform-misc/src/test/java/org/alfresco/transformer/transformers/TextToPdfContentTransformerTest.java
+++ b/alfresco-transform-misc/alfresco-transform-misc/src/test/java/org/alfresco/transformer/transformers/TextToPdfContentTransformerTest.java
@@ -2,7 +2,7 @@
 * #%L
 * Alfresco Transform Core
 * %%
- * Copyright (C) 2005 - 2020 Alfresco Software Limited
+ * Copyright (C) 2005 - 2021 Alfresco Software Limited
 * %%
 * This file is part of the Alfresco software.
 * -
@@ -31,14 +31,20 @@ import org.apache.pdfbox.text.PDFTextStripper;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import java.io.BufferedInputStream;
 import java.io.BufferedOutputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 import java.io.StringWriter;
 import java.util.HashMap;
 import java.util.Map;
 import static org.alfresco.transformer.util.RequestParamMap.PAGE_LIMIT;
 import static org.alfresco.transformer.util.RequestParamMap.SOURCE_ENCODING;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 public class TextToPdfContentTransformerTest
@@ -76,39 +82,125 @@ public class TextToPdfContentTransformerTest
        transformTextAndCheckPageLength(50);
    }
-    private void transformTextAndCheckPageLength(int pageLimit) throws Exception
+    @Test
    public void test1UTF16BigEndianBomBigEndianChars() throws Exception
    {
        // 1. BOM indicates BE (fe then ff) + chars appear to be BE (as first byte read tends to be a zero)
        //    Expected with UTF-16. Some systems use BE and others, like Windows and Mac, use LE.
        //    The same source bytes are transformed once for each of the three UTF-16 encoding names.
        String expectedByteOrder = "fe ff 00 31 00 20 00 49";
        for (String encoding : new String[] {"UTF-16", "UTF-16BE", "UTF-16LE"})
        {
            transformTextAndCheck(encoding, true, true, expectedByteOrder);
        }
    }
    @Test
    public void test2UTF16LittleEndianBomLittleEndianChars() throws Exception
    {
        // Case 2: the BOM says little-endian (ff then fe) and the characters are little-endian too
        //         (the second byte of each character tends to be a zero). A normal UTF-16 file.
        final String expectedByteOrder = "ff fe 31 00 20 00 49 00";
        final boolean bigEndian = false;
        final Boolean validBom = Boolean.TRUE;
        transformTextAndCheck("UTF-16", bigEndian, validBom, expectedByteOrder);
    }
    @Test
    public void test3UTF16NoBomBigEndianChars() throws Exception
    {
        // Case 3: there is no BOM and the characters are big-endian (the first byte of each character
        //         tends to be a zero). This is what UTF-16BE data normally looks like.
        final String expectedByteOrder = "00 31 00 20 00 49";
        final boolean bigEndian = true;
        final Boolean noBom = null;
        transformTextAndCheck("UTF-16", bigEndian, noBom, expectedByteOrder);
    }
    @Test
    public void test4UTF16NoBomLittleEndianChars() throws Exception
    {
        // Case 4: there is no BOM and the characters are little-endian (the second byte of each character
        //         tends to be a zero). This is what UTF-16LE data normally looks like.
        final String expectedByteOrder = "31 00 20 00 49 00";
        final boolean bigEndian = false;
        final Boolean noBom = null;
        transformTextAndCheck("UTF-16", bigEndian, noBom, expectedByteOrder);
    }
    @Test
    public void test5UTF16BigEndianBomLittleEndianChars() throws Exception
    {
        // Case 5: the BOM says big-endian (fe then ff) but the characters are little-endian (the second
        //         byte of each character tends to be a zero). The file is wrong, but LE must be used.
        final String expectedByteOrder = "fe ff 31 00 20 00 49 00";
        final boolean bigEndian = false;
        final Boolean validBom = Boolean.FALSE;
        transformTextAndCheck("UTF-16", bigEndian, validBom, expectedByteOrder);
    }
    @Test
    public void test6UTF16LittleEndianBomBigEndianChars() throws Exception
    {
        // Case 6: the BOM says little-endian (ff then fe) but the characters are big-endian (the first
        //         byte of each character tends to be a zero). The file is wrong, but BE must be used.
        final String expectedByteOrder = "ff fe 00 31 00 20 00 49";
        final boolean bigEndian = true;
        final Boolean validBom = Boolean.FALSE;
        transformTextAndCheck("UTF-16", bigEndian, validBom, expectedByteOrder);
    }
    /**
     * Creates a UTF-16 test source with the requested byte order and BOM, verifies its leading bytes and
     * transforms it. The page limit passed on is -1.
     *
     * @param encoding the source encoding handed to the transformer
     * @param bigEndian {@code true} if the characters in the file are to be big-endian, so for English text
     *                 the first byte of each character is typically a zero
     * @param validBom {@code null} for no BOM; {@code true} for a BOM that matches {@code bigEndian};
     *                 {@code false} for a BOM of the opposite byte order, which is an error we try to
     *                 recover from
     * @param expectedByteOrder the first few bytes expected in the source file, used to confirm the test data
     *                 was created as intended
     */
    protected void transformTextAndCheck(String encoding, boolean bigEndian, Boolean validBom,
                                         String expectedByteOrder) throws Exception
    {
        final int pageLimit = -1;
        transformTextAndCheckImpl(pageLimit, encoding, bigEndian, validBom, expectedByteOrder);
    }
    /**
     * Transforms a UTF-8 test source using the given page limit. No endian or BOM options are supplied
     * and no expected leading bytes are given.
     *
     * @param pageLimit the page limit passed to the transformer
     */
    protected void transformTextAndCheckPageLength(int pageLimit) throws Exception
    {
        final String encoding = "UTF-8";
        final Boolean bigEndian = null;
        final Boolean validBom = null;
        final String expectedByteOrder = null;
        transformTextAndCheckImpl(pageLimit, encoding, bigEndian, validBom, expectedByteOrder);
    }
    /**
     * Shared driver for the tests: generates the test text, writes it to a temporary source file,
     * checks the file's leading bytes and then runs the transform and its checks.
     *
     * @param pageLimit page limit used to build the text and passed (as a string) to the transform
     * @param encoding encoding used to write the source file and given to the transform
     * @param bigEndian passed through to {@code writeToFile}
     * @param validBom passed through to {@code writeToFile}
     * @param expectedByteOrder passed through to {@code checkFileBytes}
     */
    private void transformTextAndCheckImpl(int pageLimit, String encoding, Boolean bigEndian, Boolean validBom,
                                           String expectedByteOrder) throws Exception
    {
        // The builder holds the source text once createTestText returns; the return value is the
        // text the result is checked against.
        StringBuilder sourceText = new StringBuilder();
        String expectedText = createTestText(pageLimit, sourceText);

        File source = File.createTempFile("AlfrescoTestSource_", ".txt");
        writeToFile(source, sourceText.toString(), encoding, bigEndian, validBom);
        checkFileBytes(source, expectedByteOrder);

        transformTextAndCheck(source, encoding, expectedText, Integer.toString(pageLimit));
    }
    private String createTestText(int pageLimit, StringBuilder sb)
    {
        int pageLength = 32;
        int lines = (pageLength + 10) * ((pageLimit > 0) ? pageLimit : 1);
        StringBuilder sb = new StringBuilder();
        String checkText = null;
        int cutoff = pageLimit * pageLength;
        for (int i = 1; i <= lines; i++)
        {
-            sb.append(i);
+            sb.append(Integer.toString(i));
            sb.append(" I must not talk in class or feed my homework to my cat.\n");
            if (i == cutoff)
            {
                checkText = sb.toString();
            }
        }
        sb.append("\nBart\n");
        String text = sb.toString();
-        checkText = (checkText == null) ? clean(text) : clean(checkText);
+        checkText = checkText == null ? clean(text) : clean(checkText);
-        transformTextAndCheck(text, "UTF-8", checkText, String.valueOf(pageLimit));
+
        return checkText;
    }
-    private void transformTextAndCheck(String text, String encoding, String checkText,
+    private void transformTextAndCheck(File sourceFile, String encoding, String checkText,
        String pageLimit) throws Exception
    {
        // Get a reader for the text
        File sourceFile = File.createTempFile("AlfrescoTestSource_", ".txt");
        writeToFile(sourceFile, text, encoding);
        // And a temp writer
        File targetFile = File.createTempFile("AlfrescoTestTarget_", ".pdf");
        // Transform to PDF
        Map<String, String> parameters = new HashMap<>();
        parameters.put(PAGE_LIMIT, pageLimit);
        parameters.put(SOURCE_ENCODING, encoding);
        transformer.transform("text/plain", "application/pdf", parameters, sourceFile, targetFile);
        // Read back in the PDF and check it
@@ -138,11 +230,123 @@ public class TextToPdfContentTransformerTest
        return text;
    }
-    private void writeToFile(File file, String content, String encoding) throws Exception
+    private void writeToFile(File file, String content, String encoding, Boolean bigEndian, Boolean validBom) throws Exception
    {
        // Writes content to file. With bigEndian == null the content is simply written in the given
        // encoding. Otherwise the content is first written to a tmp file as UTF-16 (which starts with a
        // BOM) and the raw bytes are then copied to the supplied file while:
        //   - switching the byte order of every character if it differs from bigEndian,
        //   - stripping the BOM if validBom is null,
        //   - reversing the BOM so it contradicts the characters if validBom is false.
        File originalFile = file;
        if (bigEndian != null)
        {
            file = File.createTempFile("AlfrescoTestTmpSrc_", ".txt");
            encoding = "UTF-16";
        }

        // Use a writer to use the required encoding
        try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
        {
            ow.append(content);
        }

        // If we may have to change the endian or include/exclude the BOM, copy the raw bytes to the supplied file
        if (bigEndian != null)
        {
            boolean firstRead = true;
            boolean switchBytes = false;
            byte[] bytes = new byte[8192];
            try (InputStream is = new BufferedInputStream(new FileInputStream(file));
                 OutputStream os = new BufferedOutputStream(new FileOutputStream(originalFile)))
            {
                int l;
                while ((l = is.read(bytes)) != -1)
                {
                    int off = 0;

                    // When we read the first block, change the offset if we don't want the BOM and also work out
                    // if the byte endian needs to be switched. The source bytes always start with a standard BOM.
                    if (firstRead)
                    {
                        firstRead = false;
                        boolean actualEndianBytes = bytes[0] == (byte)0xfe; // if true [1] would also be 0xff
                        switchBytes = actualEndianBytes != bigEndian;
                        if (validBom == null)
                        {
                            // Strip the BOM
                            off = 2;
                        }
                        else if (!validBom)
                        {
                            // Reverse the BOM so it does not match the characters!
                            byte aByte = bytes[0];
                            bytes[0] = bytes[1];
                            bytes[1] = aByte;
                        }
                    }

                    int len = l - off;
                    if (len > 0)
                    {
                        if (switchBytes)
                        {
                            // Reverse the byte order of characters including the BOM. Only complete pairs
                            // are swapped, so a trailing odd byte never pulls in data beyond what was read.
                            for (int i=0; i+1<l; i+=2)
                            {
                                byte aByte = bytes[i];
                                bytes[i] = bytes[i+1];
                                bytes[i+1] = aByte;
                            }
                        }
                        // len already excludes the skipped BOM, so it is the exact number of bytes to copy.
                        os.write(bytes, off, len);
                    }
                }
            }
            finally
            {
                // The intermediate UTF-16 file is only needed for the copy above.
                file.delete();
            }
        }
    }
    /**
     * Check the first few bytes in the source file match what we planned to use later as test data.
     * Nothing is checked when {@code expectedByteOrder} is {@code null}.
     *
     * @param sourceFile the file whose leading bytes are inspected
     * @param expectedByteOrder the expected leading bytes as hexadecimal digit pairs separated by single
     *                 spaces (the form produced by {@code bytesToHex}), or {@code null} to skip the check
     */
    private void checkFileBytes(File sourceFile, String expectedByteOrder) throws Exception
    {
        if (expectedByteOrder != null)
        {
            byte[] expectedBytes = hexToBytes(expectedByteOrder);
            int l = expectedBytes.length;
            byte[] buffer = new byte[l];
            int count = 0;

            // Close the stream when done, and keep reading until the wanted number of bytes has arrived
            // or the file ends, as a single read is allowed to return fewer bytes.
            try (InputStream is = new FileInputStream(sourceFile))
            {
                while (count < l)
                {
                    int n = is.read(buffer, count, l - count);
                    if (n == -1)
                    {
                        break;
                    }
                    count += n;
                }
            }

            // Only report the bytes really present, so a file that is too short is not padded with zeros.
            byte[] actualBytes = new byte[count];
            System.arraycopy(buffer, 0, actualBytes, 0, count);
            String actualByteOrder = bytesToHex(actualBytes);
            assertEquals(expectedByteOrder, actualByteOrder, "The sourceFile does not contain the expected bytes");
        }
    }
    /**
     * Converts pairs of hexadecimal digits, optionally separated by spaces, into bytes.
     * For example {@code "fe ff 00 31"} gives the four bytes 0xFE, 0xFF, 0x00, 0x31.
     *
     * @param hexString the hexadecimal digits; spaces are ignored
     * @return one byte per pair of digits
     * @throws IllegalArgumentException if there is an odd number of digits or a character that is neither
     *         a space nor a hexadecimal digit
     */
    private byte[] hexToBytes(String hexString)
    {
        // Literal removal of the separators; no regular expression is needed for this.
        String digits = hexString.replace(" ", "");
        if (digits.length() % 2 != 0)
        {
            throw new IllegalArgumentException("Odd number of hex digits in: " + hexString);
        }

        byte[] bytes = new byte[digits.length() / 2];
        for (int i=0; i<bytes.length; i++)
        {
            int firstDigit = Character.digit(digits.charAt(2 * i), 16);
            int secondDigit = Character.digit(digits.charAt(2 * i + 1), 16);
            // Character.digit returns -1 for anything that is not a digit in the given radix.
            if (firstDigit < 0 || secondDigit < 0)
            {
                throw new IllegalArgumentException("Not a hex string: " + hexString);
            }
            bytes[i] = (byte)((firstDigit << 4) + secondDigit);
        }
        return bytes;
    }
    /**
     * Formats each byte as two lowercase hexadecimal digits, with a single space between bytes.
     * An empty array gives an empty string.
     *
     * @param bytes the bytes to format
     * @return the formatted text, for example {@code "fe ff 00 31"}
     */
    private String bytesToHex(byte[] bytes)
    {
        StringBuilder hex = new StringBuilder();
        for (byte value : bytes)
        {
            if (hex.length() != 0)
            {
                hex.append(' ');
            }
            int high = (value >> 4) & 0xF;
            int low = value & 0xF;
            hex.append(Character.forDigit(high, 16)).append(Character.forDigit(low, 16));
        }
        return hex.toString();
    }
 }