Merge pull request #665 from Alfresco/MNT-22398_handle-utf-bom

[MNT-22398] Handle UTF-8 BOM
2025-06-30 18:14:51 +00:00 · 2022-09-12 11:49:50 +01:00 · 2022-09-12 11:49:50 +01:00 · ea83ef9ebc
commit ea83ef9ebc
parent 9308cce484 e605f4e060
3 changed files with 206 additions and 110 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -23,6 +23,7 @@ branches:
    - /^HF\/.+$/
    - /^ATS-.*$/
    - /^ACS-.*$/
+    - /^MNT-.*$/

 stages:
  - name: Veracode Scan
--- a/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/transformers/TextToPdfContentTransformer.java
+++ b/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/transformers/TextToPdfContentTransformer.java
@ -70,6 +70,11 @@ public class TextToPdfContentTransformer implements SelectableTransformer
    private static final byte FE = (byte) 0xFE;
    private static final byte FF = (byte) 0xFF;

+    private static final int UTF8_READ_AHEAD_BYTES = 3;
+    private static final byte EF = (byte) 0xEF;
+    private static final byte BB = (byte) 0xBB;
+    private static final byte BF = (byte) 0xBF;
+
    public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT;

    private final PagedTextToPDF transformer;
@ -165,115 +170,13 @@ public class TextToPdfContentTransformer implements SelectableTransformer
                {
                    logger.debug("Handle big and little endian UTF-16 text. Using UTF-16 rather than encoding " + name);
                    charset = Charset.forName("UTF-16");
-                    is = new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES)
-                    {
-                        boolean bomRead;
-                        boolean switchByteOrder;
-                        boolean evenByte = true;
-
-                        @Override
-                        public int read(byte[] bytes, int off, int len) throws IOException
-                        {
-                            int i = 0;
-                            int b = 0;
-                            for (; i<len; i++)
-                            {
-                                b = read();
-                                if (b == -1)
-                                {
-                                    break;
+                    is = handleUTF16BOM(is);
                }
-                                bytes[off+i] = (byte)b;
-                            }
-                            return i == 0 && b == -1 ? -1 : i;
-                        }
-
-                        @Override
-                        public int read() throws IOException
+                else if ("UTF-8".equals(name))
                {
-                            if (!bomRead)
-                            {
-                                bomRead = true;
-                                boolean switchBom = false;
-                                byte[] bytes = new byte[UTF16_READ_AHEAD_BYTES];
-                                int end = in.read(bytes, 0, UTF16_READ_AHEAD_BYTES);
-                                int evenZeros = countZeros(bytes, 0);
-                                int oddZeros = countZeros(bytes, 1);
-                                if (evenZeros > oddZeros)
-                                {
-                                    if (bytes[0] == FF && bytes[1] == FE)
-                                    {
-                                        switchByteOrder = true;
-                                        switchBom = true;
-                                        logger.warn("Little-endian BOM FFFE read, but characters are big-endian");
-                                    }
-                                    else
-                                    {
-                                        logger.debug("More even zero bytes, so normal read for big-endian");
-                                    }
-                                }
-                                else
-                                {
-                                    if (bytes[0] == FE && bytes[1] == FF)
-                                    {
-                                        switchBom = true;
-                                        logger.debug("Big-endian BOM FEFF read, but characters are little-endian");
-                                    }
-                                    else
-                                    {
-                                        switchByteOrder = true;
-                                        logger.debug("More odd zero bytes, so switch bytes from little-endian");
-                                    }
-                                }
-
-                                if (switchBom)
-                                {
-                                    byte b = bytes[0];
-                                    bytes[0] = bytes[1];
-                                    bytes[1] = b;
-                                }
-
-                                for (int i = end-1; i>=0; i--)
-                                {
-                                    unread(bytes[i]);
-                                }
-                            }
-
-                            if (switchByteOrder)
-                            {
-                                if (evenByte)
-                                {
-                                    int b1 = super.read();
-                                    int b2 = super.read();
-                                    if (b1 != -1)
-                                    {
-                                        unread(b1);
-                                    }
-                                    if (b2 != -1)
-                                    {
-                                        unread(b2);
-                                    }
-                                }
-                                evenByte = !evenByte;
-                            }
-
-                            return super.read();
-                        }
-
-                        // Counts the number of even or odd 00 bytes
-                        private int countZeros(byte[] b, int offset)
-                        {
-                            int count = 0;
-                            for (int i=offset; i<UTF16_READ_AHEAD_BYTES; i+=2)
-                            {
-                                if (b[i] == 0)
-                                {
-                                    count++;
-                                }
-                            }
-                            return count;
-                        }
-                    };
+                    logger.debug("Using UTF-8");
+                    charset = Charset.forName("UTF-8");
+                    is = handleUTF8BOM(is);
                }
                logger.debug("Processing plain text in encoding " + name);
                return new InputStreamReader(is, charset);
@ -445,4 +348,178 @@ public class TextToPdfContentTransformer implements SelectableTransformer
            throw new IllegalArgumentException(paramName + " parameter must be an integer.");
        }
    }
+
+    /**
+     * Returns a stream that consumes a leading UTF-8 BOM (EF BB BF) from is and passes all other bytes on unchanged.
+     */
+    private InputStream handleUTF8BOM(InputStream is)
+    {
+        return new PushbackInputStream(is, UTF8_READ_AHEAD_BYTES)
+        {
+            boolean bomRead;
+
+            @Override
+            public int read(byte[] bytes, int off, int len) throws IOException
+            {
+                // The BOM check is a one-off, so the normal bulk read can then serve the rest of the stream
+                skipBomOnce();
+                return super.read(bytes, off, len);
+            }
+
+            @Override
+            public int read() throws IOException
+            {
+                skipBomOnce();
+                return super.read();
+            }
+
+            // On first use peeks at the start of the stream and pushes it back unless it is exactly the BOM
+            private void skipBomOnce() throws IOException
+            {
+                if (bomRead)
+                {
+                    return;
+                }
+                bomRead = true;
+                byte[] head = new byte[UTF8_READ_AHEAD_BYTES];
+                int end = 0;
+                int count;
+                // A single read may return fewer bytes than requested, so keep reading until full or EOF
+                while (end < head.length && (count = in.read(head, end, head.length - end)) > 0)
+                {
+                    end += count;
+                }
+                if (end == head.length && head[0] == EF && head[1] == BB && head[2] == BF)
+                {
+                    logger.warn("UTF-8 BOM detected, it will be skipped");
+                    return;
+                }
+                for (int i = end - 1; i >= 0; i--)
+                {
+                    unread(head[i]);
+                }
+            }
+        };
+    }
+
+    /**
+     * Handles a BOM being present even though the encoding (UTF-16BE, UTF-16LE) indicates that normally there should
+     * not be one. For extra flexibility UTF-16, which optionally has a BOM, is handled too. Rather than trust the BOM
+     * we count the zero bytes in the first few bytes. XML files, even when not in European languages, tend to have
+     * more even zero bytes when big-endian and more odd ones when little-endian. Think of {@code <?xml version="1.0"?>}.
+     * Depending on the counts and the BOM found, the two BOM bytes and/or every pair of bytes are swapped as they are
+     * read. The normal Java decoder does not have this flexibility but other transformers do.
+     */
+    private InputStream handleUTF16BOM(InputStream is)
+    {
+        return new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES)
+        {
+            boolean bomRead; // true once the first bytes have been examined
+            boolean switchByteOrder; // true if every pair of bytes is to be swapped while reading
+            boolean evenByte = true; // true when the next byte returned is the first of a pair
+
+            @Override
+            public int read(byte[] bytes, int off, int len) throws IOException
+            {
+                int i = 0;
+                int b = 0;
+                for (; i < len; i++)
+                {
+                    b = read(); // one byte at a time so the swapping in read() is applied
+                    if (b == -1)
+                    {
+                        break;
+                    }
+                    bytes[off + i] = (byte) b;
+                }
+                return i == 0 && b == -1 ? -1 : i; // -1 only if the end was hit before any byte was copied
+            }
+
+            @Override
+            public int read() throws IOException
+            {
+                if (!bomRead)
+                {
+                    bomRead = true;
+                    boolean switchBom = false;
+                    byte[] bytes = new byte[UTF16_READ_AHEAD_BYTES];
+                    int end = in.read(bytes, 0, UTF16_READ_AHEAD_BYTES); // NOTE(review): short reads not handled
+                    int evenZeros = countZeros(bytes, 0);
+                    int oddZeros = countZeros(bytes, 1);
+                    if (evenZeros > oddZeros)
+                    {
+                        if (bytes[0] == FF && bytes[1] == FE)
+                        {
+                            switchByteOrder = true;
+                            switchBom = true;
+                            logger.warn("Little-endian BOM FFFE read, but characters are big-endian");
+                        }
+                        else
+                        {
+                            logger.debug("More even zero bytes, so normal read for big-endian");
+                        }
+                    }
+                    else
+                    {
+                        if (bytes[0] == FE && bytes[1] == FF)
+                        {
+                            switchBom = true;
+                            logger.debug("Big-endian BOM FEFF read, but characters are little-endian");
+                        }
+                        else
+                        {
+                            switchByteOrder = true;
+                            logger.debug("More odd zero bytes, so switch bytes from little-endian");
+                        }
+                    }
+
+                    if (switchBom) // swap the first two bytes of the read ahead buffer
+                    {
+                        byte b = bytes[0];
+                        bytes[0] = bytes[1];
+                        bytes[1] = b;
+                    }
+
+                    for (int i = end - 1; i >= 0; i--) // last byte first, so bytes[0] is the next one read
+                    {
+                        unread(bytes[i]);
+                    }
+                }
+
+                if (switchByteOrder)
+                {
+                    if (evenByte) // at the start of a pair: take both bytes and push them back swapped
+                    {
+                        int b1 = super.read();
+                        int b2 = super.read();
+                        if (b1 != -1)
+                        {
+                            unread(b1);
+                        }
+                        if (b2 != -1)
+                        {
+                            unread(b2); // pushed back last, so b2 is returned before b1
+                        }
+                    }
+                    evenByte = !evenByte;
+                }
+
+                return super.read();
+            }
+
+            // Counts the 00 bytes at every second index of the read ahead buffer, starting at offset (0 even, 1 odd)
+            private int countZeros(byte[] b, int offset)
+            {
+                int count = 0;
+                for (int i = offset; i < UTF16_READ_AHEAD_BYTES; i += 2)
+                {
+                    if (b[i] == 0)
+                    {
+                        count++;
+                    }
+                }
+                return count;
+            }
+        };
+    }
 }
--- a/alfresco-transform-misc/alfresco-transform-misc/src/test/java/org/alfresco/transformer/transformers/TextToPdfContentTransformerTest.java
+++ b/alfresco-transform-misc/alfresco-transform-misc/src/test/java/org/alfresco/transformer/transformers/TextToPdfContentTransformerTest.java
@ -134,6 +134,18 @@ public class TextToPdfContentTransformerTest
        transformTextAndCheck("UTF-16", true, false, "ff fe 00 31 00 20 00 49");
    }

+    @Test // UTF-8 source starting with the BOM bytes ef bb bf; bigEndian does not apply, so null is passed
+    public void testUTF8WithBOM() throws Exception
+    {
+        transformTextAndCheck("UTF-8", null, true, "ef bb bf 31 20 49 20 6d");
+    }
+
+    @Test // UTF-8 source without a BOM: the expected first bytes are the text itself
+    public void testUTF8WithoutBOM() throws Exception
+    {
+        transformTextAndCheck("UTF-8", null, false, "31 20 49 20 6d 75 73 74");
+    }
+
    /**
     * @param encoding to be used to read the source file
     * @param bigEndian indicates that the file should contain big endian characters, so typically the first byte of
@ -143,7 +155,7 @@ public class TextToPdfContentTransformerTest
     * @param expectedByteOrder The first few bytes of the source file so we can check the test data has been
     *                 correctly created.
     */
-    protected void transformTextAndCheck(String encoding, boolean bigEndian, Boolean validBom,
+    protected void transformTextAndCheck(String encoding, Boolean bigEndian, Boolean validBom,
                                         String expectedByteOrder) throws Exception
    {
        transformTextAndCheckImpl(-1, encoding, bigEndian, validBom, expectedByteOrder);
@ -244,6 +256,12 @@ public class TextToPdfContentTransformerTest
        // Use a writer to use the required encoding
        try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
        {
+            // Add BOM to UTF-8 file
+            if (bigEndian == null && encoding != null && "UTF-8".equals(encoding.toUpperCase()) && validBom != null && validBom)
+            {
+                ow.append("\ufeff");
+            }
+
            ow.append(content);
        }