MNT-20626 XML files having UTF-16LE and UTF-16BE can't be previewed (#331)

* Introduce more flexible reading of UTF-16 data, where there may be a BOM, but the
   spec says there should not be one, or the BOM is clearly wrong when looking at the
   following characters. The https://en.wikipedia.org/wiki/UTF-16 write up is nice and clear.
* Includes identical correction in data setup in AIOTransformRegistryTest and
   MiscControllerTest for a problem found in TextToPdfContentTransformerTest.
 * Includes upgrade to latest pdfbox: 2.0.22
This commit is contained in:
Alan Davis
2021-02-09 19:04:34 +00:00
committed by GitHub
parent 97b9fc39cf
commit 2766c23431
4 changed files with 349 additions and 23 deletions

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -280,7 +280,7 @@ public class AIOTransformRegistryTest
int cutoff = pageLimit * pageLength;
for (int i = 1; i <= lines; i++)
{
sb.append(i);
sb.append(Integer.toString(i));
sb.append(" I must not talk in class or feed my homework to my cat.\n");
if (i == cutoff)
checkText = sb.toString();

View File

@@ -408,7 +408,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
StringBuilder sb = new StringBuilder();
for (int i = 1; i <= 5; i++)
{
sb.append(i);
sb.append(Integer.toString(i));
sb.append(" I must not talk in class or feed my homework to my cat.\n");
}
sb.append("\nBart\n");

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -44,6 +44,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.HashMap;
@@ -63,6 +64,10 @@ public class TextToPdfContentTransformer implements SelectableTransformer
{
private static final Logger logger = LoggerFactory.getLogger(TextToPdfContentTransformer.class);
private static final int UTF16_READ_AHEAD_BYTES = 16; // 8 characters including BOM if it exists
private static final byte FE = (byte) 0xFE;
private static final byte FF = (byte) 0xFF;
public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT;
private final PagedTextToPDF transformer;
@@ -146,7 +151,129 @@ public class TextToPdfContentTransformer implements SelectableTransformer
}
if (charset != null)
{
logger.debug("Processing plain text in encoding " + charset.displayName());
// Handles the situation where there is a BOM even though the encoding indicates that normally
// there should not be one for UTF-16BE and UTF-16LE. For extra flexibility includes UTF-16 too
// which optionally has the BOM. Rather than look at the BOM we look at the number of zero bytes
// in the first few characters. XML files even when not in European languages tend to have more
// even zero bytes when big-endian encoded and more odd zero bytes when little-endian.
// Think of: <?xml version="1.0"?> The normal Java decoder does not have this flexibility but
// other transformers do.
String name = charset.displayName();
if ("UTF-16".equals(name) || "UTF-16BE".equals(name) || "UTF-16LE".equals(name))
{
logger.debug("Handle big and little endian UTF-16 text. Using UTF-16 rather than encoding " + name);
charset = Charset.forName("UTF-16");
is = new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES)
{
boolean bomRead;
boolean switchByteOrder;
boolean evenByte = true;
@Override
public int read(byte[] buffer, int offset, int length) throws IOException
{
    // Funnel everything through the single-byte read() so the byte-order
    // fix-up logic is applied uniformly to each byte handed to the caller.
    int copied = 0;
    int lastByte = 0;
    while (copied < length)
    {
        lastByte = read();
        if (lastByte == -1)
        {
            break;
        }
        buffer[offset + copied] = (byte) lastByte;
        copied++;
    }
    // End of stream before anything was copied gives -1; otherwise the count.
    // A zero-length request still returns 0, as the InputStream contract requires.
    return copied == 0 && lastByte == -1 ? -1 : copied;
}
// Single-byte read that, on the very first call, sniffs the stream to work out the
// real byte order and thereafter (when needed) swaps each pair of bytes so the
// "UTF-16" decoder sees a consistently ordered stream.
@Override
public int read() throws IOException
{
// First call only: read ahead, decide endianness from the zero-byte distribution,
// then push all the bytes back so nothing is lost.
if (!bomRead)
{
bomRead = true;
boolean switchBom = false;
byte[] bytes = new byte[UTF16_READ_AHEAD_BYTES];
int end = in.read(bytes, 0, UTF16_READ_AHEAD_BYTES);
// Heuristic: English-like UTF-16BE text has zeros at even indexes, UTF-16LE at
// odd indexes (think of: <?xml version="1.0"?>).
int evenZeros = countZeros(bytes, 0);
int oddZeros = countZeros(bytes, 1);
if (evenZeros > oddZeros)
{
// Characters look big-endian.
if (bytes[0] == FF && bytes[1] == FE)
{
// Little-endian BOM contradicts the characters: trust the characters and
// swap every byte pair (including the BOM) from here on.
switchByteOrder = true;
switchBom = true;
logger.warn("Little-endian BOM FFFE read, but characters are big-endian");
}
else
{
logger.debug("More even zero bytes, so normal read for big-endian");
}
}
else
{
// Characters look little-endian.
if (bytes[0] == FE && bytes[1] == FF)
{
// Big-endian BOM contradicts the characters: reverse just the BOM so it
// matches the little-endian characters.
switchBom = true;
logger.debug("Big-endian BOM FEFF read, but characters are little-endian");
}
else
{
switchByteOrder = true;
logger.debug("More odd zero bytes, so switch bytes from little-endian");
}
}
if (switchBom)
{
// Reverse the two BOM bytes in the look-ahead buffer before pushing it back.
byte b = bytes[0];
bytes[0] = bytes[1];
bytes[1] = b;
}
// Push the look-ahead back in reverse so it is re-read in original order.
// If end is -1 (empty stream) nothing is pushed back and super.read() returns -1.
for (int i = end-1; i>=0; i--)
{
unread(bytes[i]);
}
}
if (switchByteOrder)
{
// At the start of each byte pair, read both bytes and push them back so that this
// call and the next one return them in swapped order.
// NOTE(review): if the stream has an odd number of bytes, the final lone byte is
// returned unswapped — confirm this is acceptable for truncated input.
if (evenByte)
{
int b1 = super.read();
int b2 = super.read();
if (b1 != -1)
{
unread(b1);
}
if (b2 != -1)
{
unread(b2);
}
}
evenByte = !evenByte;
}
return super.read();
}
// Counts the 00 bytes at even positions (offset 0) or odd positions (offset 1)
// within the read-ahead buffer, used to guess the byte order of UTF-16 text.
// NOTE(review): this always scans all UTF16_READ_AHEAD_BYTES entries even when the
// stream supplied fewer bytes, so the zero-initialised tail of the buffer is counted
// as zeros at both parities — apparently harmless for the comparison, but confirm.
private int countZeros(byte[] b, int offset)
{
int count = 0;
for (int i=offset; i<UTF16_READ_AHEAD_BYTES; i+=2)
{
if (b[i] == 0)
{
count++;
}
}
return count;
}
};
}
logger.debug("Processing plain text in encoding " + name);
return new InputStreamReader(is, charset);
}
}
@@ -196,7 +323,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
public PDDocument createPDFFromText(Reader text, int pageLimit)
throws IOException
{
//int pageLimit = (int)pageLimits.getValue();
PDDocument doc = null;
int pageCount = 0;
try
@@ -207,7 +333,7 @@ public class TextToPdfContentTransformer implements SelectableTransformer
//calculate font height and increase by 5 percent.
height = height * getFontSize() * 1.05f;
doc = new PDDocument();
BufferedReader data = new BufferedReader(text);
BufferedReader data = (text instanceof BufferedReader) ? (BufferedReader) text : new BufferedReader(text);
String nextLine;
PDPage page = new PDPage();
PDPageContentStream contentStream = null;
@@ -220,7 +346,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
outer:
while ((nextLine = data.readLine()) != null)
{
// The input text is nonEmpty. New pages will be created and added
// to the PDF document as they are needed, depending on the length of
// the text.
@@ -252,8 +377,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
int test = pageCount + 1;
if (pageLimit > 0 && (pageCount++ >= pageLimit))
{
// pageLimits.getAction().throwIOExceptionIfRequired("Page limit ("+pageLimit+
// ") reached.", transformerDebug);
break outer;
}
@@ -272,7 +395,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
y = page.getMediaBox().getHeight() - margin + height;
contentStream.moveTextPositionByAmount(margin, y);
}
//System.out.println( "Drawing string at " + x + "," + y );
if (contentStream == null)
{

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -31,14 +31,20 @@ import org.apache.pdfbox.text.PDFTextStripper;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.util.HashMap;
import java.util.Map;
import static org.alfresco.transformer.util.RequestParamMap.PAGE_LIMIT;
import static org.alfresco.transformer.util.RequestParamMap.SOURCE_ENCODING;
import static org.junit.jupiter.api.Assertions.assertEquals;
public class TextToPdfContentTransformerTest
@@ -76,39 +82,125 @@ public class TextToPdfContentTransformerTest
transformTextAndCheckPageLength(50);
}
private void transformTextAndCheckPageLength(int pageLimit) throws Exception
@Test
public void test1UTF16BigEndianBomBigEndianChars() throws Exception
{
    // 1. BOM indicates BE (fe then ff) + chars appear to be BE (as the first byte read tends to be a zero).
    // Expected with UTF-16. Some systems use BE while others, like Windows and Mac, use LE.
    // Whichever encoding name is supplied, correctly matched BE data must pass through unchanged.
    // (Removed a duplicated, identical "UTF-16" invocation — the call is deterministic,
    // so running it twice with the same arguments added nothing.)
    String expectedByteOrder = "fe ff 00 31 00 20 00 49";
    transformTextAndCheck("UTF-16", true, true, expectedByteOrder);
    transformTextAndCheck("UTF-16BE", true, true, expectedByteOrder);
    transformTextAndCheck("UTF-16LE", true, true, expectedByteOrder);
}
@Test
public void test2UTF16LittleEndianBomLittleEndianChars() throws Exception
{
    // 2. BOM indicates LE (ff then fe) and the characters are LE too (the second byte
    // of each char tends to be zero). Expected with UTF-16: some systems write BE,
    // while others, such as Windows and Mac, write LE.
    String expectedByteOrder = "ff fe 31 00 20 00 49 00";
    transformTextAndCheck("UTF-16", false, true, expectedByteOrder);
}
@Test
public void test3UTF16NoBomBigEndianChars() throws Exception
{
    // 3. No BOM, characters big-endian (the first byte of each char tends to be zero).
    // This is exactly what UTF-16BE data should look like.
    String expectedByteOrder = "00 31 00 20 00 49";
    transformTextAndCheck("UTF-16", true, null, expectedByteOrder);
}
@Test
public void test4UTF16NoBomLittleEndianChars() throws Exception
{
    // 4. No BOM, characters little-endian (the second byte of each char tends to be zero).
    // This is exactly what UTF-16LE data should look like.
    String expectedByteOrder = "31 00 20 00 49 00";
    transformTextAndCheck("UTF-16", false, null, expectedByteOrder);
}
@Test
public void test5UTF16BigEndianBomLittleEndianChars() throws Exception
{
    // 5. BOM claims BE (fe then ff) but the characters are clearly LE (second byte of
    // each char tends to be zero). The data is broken, but the transformer should
    // trust the characters and decode as LE.
    String expectedByteOrder = "fe ff 31 00 20 00 49 00";
    transformTextAndCheck("UTF-16", false, false, expectedByteOrder);
}
@Test
public void test6UTF16LittleEndianBomBigEndianChars() throws Exception
{
    // 6. BOM claims LE (ff then fe) but the characters are clearly BE (first byte of
    // each char tends to be zero). The data is broken, but the transformer should
    // trust the characters and decode as BE.
    String expectedByteOrder = "ff fe 00 31 00 20 00 49";
    transformTextAndCheck("UTF-16", true, false, expectedByteOrder);
}
/**
 * Transforms a generated text file and checks the resulting PDF, with no page limit applied.
 *
 * @param encoding to be used to read the source file
 * @param bigEndian indicates that the file should contain big endian characters, so typically the first byte of
 *                  each char is a zero when using English.
 * @param validBom if not null, the BOM is included. If true it is the one matching bigEndian. If false it is the
 *                 opposite byte order, which really is an error, but we try to recover from it.
 * @param expectedByteOrder The first few bytes of the source file so we can check the test data has been
 *                  correctly created.
 */
protected void transformTextAndCheck(String encoding, boolean bigEndian, Boolean validBom,
String expectedByteOrder) throws Exception
{
// -1 disables the page limit so only the encoding handling is exercised.
transformTextAndCheckImpl(-1, encoding, bigEndian, validBom, expectedByteOrder);
}
/**
 * Transforms plain UTF-8 test text and checks the page-limit handling; no BOM or
 * byte-order variations are involved (hence the null bigEndian/validBom/byte-order args).
 */
protected void transformTextAndCheckPageLength(int pageLimit) throws Exception
{
transformTextAndCheckImpl(pageLimit, "UTF-8", null, null, null);
}
/**
 * Shared implementation: builds the test text, writes it to a temp file in the requested
 * encoding/byte order, sanity-checks the raw bytes, then transforms and verifies the PDF.
 */
private void transformTextAndCheckImpl(int pageLimit, String encoding, Boolean bigEndian, Boolean validBom,
        String expectedByteOrder) throws Exception
{
    // Build the source text and the text expected to survive any page limit.
    StringBuilder builder = new StringBuilder();
    String expectedText = createTestText(pageLimit, builder);
    String content = builder.toString();

    // Write the source file, confirm its first bytes match the scenario being tested,
    // then run the transformation and check the output.
    File source = File.createTempFile("AlfrescoTestSource_", ".txt");
    writeToFile(source, content, encoding, bigEndian, validBom);
    checkFileBytes(source, expectedByteOrder);
    transformTextAndCheck(source, encoding, expectedText, String.valueOf(pageLimit));
}
/**
 * Fills {@code sb} with numbered lines of test text and returns the cleaned text that is
 * expected to survive the supplied page limit ({@code pageLimit <= 0} means no limit, so
 * the whole text is expected).
 *
 * NOTE(review): the original span contained interleaved old/new diff lines (a duplicate
 * {@code StringBuilder sb} declaration and a duplicate {@code sb.append(i)}) that would
 * not compile; this is the reconstructed post-change version.
 */
private String createTestText(int pageLimit, StringBuilder sb)
{
    int pageLength = 32;
    // Generate a little more than pageLimit pages worth of lines (or one page's worth).
    int lines = (pageLength + 10) * ((pageLimit > 0) ? pageLimit : 1);
    String checkText = null;
    int cutoff = pageLimit * pageLength;
    for (int i = 1; i <= lines; i++)
    {
        sb.append(Integer.toString(i));
        sb.append(" I must not talk in class or feed my homework to my cat.\n");
        if (i == cutoff)
        {
            // Snapshot the text expected to fit within the page limit.
            checkText = sb.toString();
        }
    }
    sb.append("\nBart\n");
    String text = sb.toString();
    checkText = checkText == null ? clean(text) : clean(checkText);
    return checkText;
}
private void transformTextAndCheck(String text, String encoding, String checkText,
private void transformTextAndCheck(File sourceFile, String encoding, String checkText,
String pageLimit) throws Exception
{
// Get a reader for the text
File sourceFile = File.createTempFile("AlfrescoTestSource_", ".txt");
writeToFile(sourceFile, text, encoding);
// And a temp writer
File targetFile = File.createTempFile("AlfrescoTestTarget_", ".pdf");
// Transform to PDF
Map<String, String> parameters = new HashMap<>();
parameters.put(PAGE_LIMIT, pageLimit);
parameters.put(SOURCE_ENCODING, encoding);
transformer.transform("text/plain", "application/pdf", parameters, sourceFile, targetFile);
// Read back in the PDF and check it
@@ -138,11 +230,123 @@ public class TextToPdfContentTransformerTest
return text;
}
private void writeToFile(File file, String content, String encoding) throws Exception
private void writeToFile(File file, String content, String encoding, Boolean bigEndian, Boolean validBom) throws Exception
{
// If we may have to change the endian or include/exclude the BOM, write initially to a tmp file using
// UTF-16 which includes the BOM FEFF.
File originalFile = file;
if (bigEndian != null)
{
file = File.createTempFile("AlfrescoTestTmpSrc_", ".txt");
encoding = "UTF-16";
}
// Use a writer to use the required encoding
try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
{
ow.append(content);
}
// If we may have to change the endian or include/exclude the BOM, copy the raw bytes to the supplied file
if (bigEndian != null)
{
boolean firstRead = true;
byte[] bytes = new byte[8192];
try (InputStream is = new BufferedInputStream(new FileInputStream(file));
OutputStream os = new BufferedOutputStream(new FileOutputStream(originalFile)))
{
int l;
int off;
boolean switchBytes = false;
do
{
l = is.read(bytes);
off = 0;
// When we read the first block, change the offset if we don't want the BOM and also work out
// if the byte endian need to be switch. The source bytes allways start with a standard BOM.
if (firstRead)
{
firstRead = false;
boolean actualEndianBytes = bytes[0] == (byte)0xfe; // if true [1] would also be 0xff
switchBytes = actualEndianBytes != bigEndian;
if (validBom == null)
{
// Strip the BOM
off = 2;
}
else if (!validBom)
{
// Reverse the BOM so it does not match the characters!
byte aByte = bytes[0];
bytes[0] = bytes[1];
bytes[1] = aByte;
}
}
int len = l - off;
if (len > 0)
{
if (switchBytes)
{
// Reverse the byte order of characters including the BOM.
for (int i=0; i<l; i+=2)
{
byte aByte = bytes[i];
bytes[i] = bytes[i+1];
bytes[i+1] = aByte;
}
}
os.write(bytes, off, len-off);
}
} while (l != -1);
}
}
}
/**
 * Check the first few bytes in the source file match what we planned to use later as test data.
 *
 * @param sourceFile file whose leading bytes are checked
 * @param expectedByteOrder space-separated hex of the expected leading bytes, or null to skip the check
 */
private void checkFileBytes(File sourceFile, String expectedByteOrder) throws Exception
{
    if (expectedByteOrder != null)
    {
        byte[] expectedBytes = hexToBytes(expectedByteOrder);
        int l = expectedBytes.length;
        byte[] actualBytes = new byte[l];
        // try-with-resources: the original leaked the FileInputStream.
        try (FileInputStream is = new FileInputStream(sourceFile))
        {
            // A single read() may return fewer bytes than requested; loop until l bytes or EOF.
            int read = 0;
            while (read < l)
            {
                int count = is.read(actualBytes, read, l - read);
                if (count == -1)
                {
                    break;
                }
                read += count;
            }
        }
        String actualByteOrder = bytesToHex(actualBytes);
        assertEquals(expectedByteOrder, actualByteOrder, "The sourceFile does not contain the expected bytes");
    }
}
/**
 * Decodes a string of hex digit pairs (spaces ignored) into the corresponding bytes,
 * e.g. "fe ff 00" becomes {0xfe, 0xff, 0x00}.
 */
private byte[] hexToBytes(String hexString)
{
    // Strip all spaces, then turn each pair of hex digits into one byte.
    String digits = hexString.replaceAll(" *", "");
    int len = digits.length() / 2;
    byte[] decoded = new byte[len];
    for (int i = 0; i < len; i++)
    {
        int hi = Character.digit(digits.charAt(2 * i), 16);
        int lo = Character.digit(digits.charAt(2 * i + 1), 16);
        decoded[i] = (byte) ((hi << 4) + lo);
    }
    return decoded;
}
/**
 * Encodes bytes as lower-case, space-separated hex digit pairs, e.g. {0xfe, 0xff}
 * becomes "fe ff" — the inverse of {@code hexToBytes}.
 */
private String bytesToHex(byte[] bytes)
{
    // StringBuilder rather than StringBuffer: no synchronization is needed here.
    StringBuilder sb = new StringBuilder();
    for (byte b : bytes)
    {
        if (sb.length() > 0)
        {
            sb.append(' ');
        }
        sb.append(Character.forDigit((b >> 4) & 0xF, 16));
        sb.append(Character.forDigit(b & 0xF, 16));
    }
    return sb.toString();
}
}