Merge pull request #665 from Alfresco/MNT-22398_handle-utf-bom

[MNT-22398] Handle UTF-8 BOM
tiagosalvado10 2022-09-12 11:49:50 +01:00 committed by GitHub
commit ea83ef9ebc
3 changed files with 206 additions and 110 deletions


@@ -23,6 +23,7 @@ branches:
- /^HF\/.+$/
- /^ATS-.*$/
- /^ACS-.*$/
- /^MNT-.*$/
stages:
- name: Veracode Scan


@@ -70,6 +70,11 @@ public class TextToPdfContentTransformer implements SelectableTransformer
private static final byte FE = (byte) 0xFE;
private static final byte FF = (byte) 0xFF;
private static final int UTF8_READ_AHEAD_BYTES = 3;
private static final byte EF = (byte) 0xEF;
private static final byte BB = (byte) 0xBB;
private static final byte BF = (byte) 0xBF;
public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT;
private final PagedTextToPDF transformer;
@@ -165,115 +170,13 @@ public class TextToPdfContentTransformer implements SelectableTransformer
{
logger.debug("Handle big and little endian UTF-16 text. Using UTF-16 rather than encoding " + name);
charset = Charset.forName("UTF-16");
-is = new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES)
-{
-boolean bomRead;
-boolean switchByteOrder;
-boolean evenByte = true;
-@Override
-public int read(byte[] bytes, int off, int len) throws IOException
-{
-int i = 0;
-int b = 0;
-for (; i<len; i++)
-{
-b = read();
-if (b == -1)
-{
-break;
-}
-bytes[off+i] = (byte)b;
-}
-return i == 0 && b == -1 ? -1 : i;
-}
-@Override
-public int read() throws IOException
-{
-if (!bomRead)
-{
-bomRead = true;
-boolean switchBom = false;
-byte[] bytes = new byte[UTF16_READ_AHEAD_BYTES];
-int end = in.read(bytes, 0, UTF16_READ_AHEAD_BYTES);
-int evenZeros = countZeros(bytes, 0);
-int oddZeros = countZeros(bytes, 1);
-if (evenZeros > oddZeros)
-{
-if (bytes[0] == FF && bytes[1] == FE)
-{
-switchByteOrder = true;
-switchBom = true;
-logger.warn("Little-endian BOM FFFE read, but characters are big-endian");
-}
-else
-{
-logger.debug("More even zero bytes, so normal read for big-endian");
-}
-}
-else
-{
-if (bytes[0] == FE && bytes[1] == FF)
-{
-switchBom = true;
-logger.debug("Big-endian BOM FEFF read, but characters are little-endian");
-}
-else
-{
-switchByteOrder = true;
-logger.debug("More odd zero bytes, so switch bytes from little-endian");
-}
-}
-if (switchBom)
-{
-byte b = bytes[0];
-bytes[0] = bytes[1];
-bytes[1] = b;
-}
-for (int i = end-1; i>=0; i--)
-{
-unread(bytes[i]);
-}
-}
-if (switchByteOrder)
-{
-if (evenByte)
-{
-int b1 = super.read();
-int b2 = super.read();
-if (b1 != -1)
-{
-unread(b1);
-}
-if (b2 != -1)
-{
-unread(b2);
-}
-}
-evenByte = !evenByte;
-}
-return super.read();
-}
-// Counts the number of even or odd 00 bytes
-private int countZeros(byte[] b, int offset)
-{
-int count = 0;
-for (int i=offset; i<UTF16_READ_AHEAD_BYTES; i+=2)
-{
-if (b[i] == 0)
-{
-count++;
-}
-}
-return count;
-}
-};
+is = handleUTF16BOM(is);
+}
+else if ("UTF-8".equals(name))
+{
+logger.debug("Using UTF-8");
+charset = Charset.forName("UTF-8");
+is = handleUTF8BOM(is);
}
logger.debug("Processing plain text in encoding " + name);
return new InputStreamReader(is, charset);
@@ -445,4 +348,178 @@ public class TextToPdfContentTransformer implements SelectableTransformer
throw new IllegalArgumentException(paramName + " parameter must be an integer.");
}
}
/**
* Skips the BOM character for UTF-8 encoding
*/
private InputStream handleUTF8BOM(InputStream is)
{
return new PushbackInputStream(is, UTF8_READ_AHEAD_BYTES)
{
boolean bomRead;
@Override
public int read(byte[] bytes, int off, int len) throws IOException
{
int i = 0;
int b = 0;
for (; i < len; i++)
{
b = read();
if (b == -1)
{
break;
}
bytes[off + i] = (byte) b;
}
return i == 0 && b == -1 ? -1 : i;
}
@Override
public int read() throws IOException
{
if (!bomRead)
{
bomRead = true;
byte[] bytes = new byte[UTF8_READ_AHEAD_BYTES];
int end = in.read(bytes, 0, UTF8_READ_AHEAD_BYTES);
if (bytes[0] == EF && bytes[1] == BB && bytes[2] == BF)
{
logger.warn("UTF-8 BOM detected, it will be skipped");
}
else
{
for (int i = end - 1; i >= 0; i--)
{
unread(bytes[i]);
}
}
}
return super.read();
}
};
}
/**
* Handles the situation where there is a BOM even though the encoding indicates that normally there should not be
* one for UTF-16BE and UTF-16LE. For extra flexibility it includes UTF-16 too, which optionally has the BOM. Rather
* than look at the BOM, we look at the number of zero bytes in the first few characters. XML files, even when not in
* European languages, tend to have more even zero bytes when big-endian encoded and more odd zero bytes when
* little-endian. Think of: <?xml version="1.0"?> The normal Java decoder does not have this flexibility but other
* transformers do.
*/
private InputStream handleUTF16BOM(InputStream is)
{
return new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES)
{
boolean bomRead;
boolean switchByteOrder;
boolean evenByte = true;
@Override
public int read(byte[] bytes, int off, int len) throws IOException
{
int i = 0;
int b = 0;
for (; i < len; i++)
{
b = read();
if (b == -1)
{
break;
}
bytes[off + i] = (byte) b;
}
return i == 0 && b == -1 ? -1 : i;
}
@Override
public int read() throws IOException
{
if (!bomRead)
{
bomRead = true;
boolean switchBom = false;
byte[] bytes = new byte[UTF16_READ_AHEAD_BYTES];
int end = in.read(bytes, 0, UTF16_READ_AHEAD_BYTES);
int evenZeros = countZeros(bytes, 0);
int oddZeros = countZeros(bytes, 1);
if (evenZeros > oddZeros)
{
if (bytes[0] == FF && bytes[1] == FE)
{
switchByteOrder = true;
switchBom = true;
logger.warn("Little-endian BOM FFFE read, but characters are big-endian");
}
else
{
logger.debug("More even zero bytes, so normal read for big-endian");
}
}
else
{
if (bytes[0] == FE && bytes[1] == FF)
{
switchBom = true;
logger.debug("Big-endian BOM FEFF read, but characters are little-endian");
}
else
{
switchByteOrder = true;
logger.debug("More odd zero bytes, so switch bytes from little-endian");
}
}
if (switchBom)
{
byte b = bytes[0];
bytes[0] = bytes[1];
bytes[1] = b;
}
for (int i = end - 1; i >= 0; i--)
{
unread(bytes[i]);
}
}
if (switchByteOrder)
{
if (evenByte)
{
int b1 = super.read();
int b2 = super.read();
if (b1 != -1)
{
unread(b1);
}
if (b2 != -1)
{
unread(b2);
}
}
evenByte = !evenByte;
}
return super.read();
}
// Counts the number of even or odd 00 bytes
private int countZeros(byte[] b, int offset)
{
int count = 0;
for (int i = offset; i < UTF16_READ_AHEAD_BYTES; i += 2)
{
if (b[i] == 0)
{
count++;
}
}
return count;
}
};
}
}
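
The zero-byte heuristic described in the handleUTF16BOM Javadoc can be seen on a concrete string. The following standalone sketch is illustrative only (the class and method names are not part of the transformer), and it counts zeros over the whole byte array rather than just the first UTF16_READ_AHEAD_BYTES bytes, but the effect is the same:

import java.nio.charset.StandardCharsets;

// Illustrative sketch: for ASCII-heavy text such as an XML declaration,
// UTF-16BE puts the 0x00 byte of each code unit at even offsets, while
// UTF-16LE puts it at odd offsets. Counting them reveals the byte order.
public class Utf16ZeroByteDemo
{
    // Counts 0x00 bytes at even (offset 0) or odd (offset 1) positions.
    private static int countZeros(byte[] bytes, int offset)
    {
        int count = 0;
        for (int i = offset; i < bytes.length; i += 2)
        {
            if (bytes[i] == 0)
            {
                count++;
            }
        }
        return count;
    }

    public static void main(String[] args)
    {
        String text = "<?xml version=\"1.0\"?>";
        byte[] bigEndian = text.getBytes(StandardCharsets.UTF_16BE);    // 00 3C 00 3F ...
        byte[] littleEndian = text.getBytes(StandardCharsets.UTF_16LE); // 3C 00 3F 00 ...

        // Prints 21 even zeros and 0 odd zeros for this 21-character big-endian string ...
        System.out.println("BE even=" + countZeros(bigEndian, 0) + " odd=" + countZeros(bigEndian, 1));
        // ... and the mirror image (0 even, 21 odd) for little-endian.
        System.out.println("LE even=" + countZeros(littleEndian, 0) + " odd=" + countZeros(littleEndian, 1));
    }
}

In the transformer itself only the first UTF16_READ_AHEAD_BYTES bytes are inspected, which is already enough to tip the balance one way or the other for typical text.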


@@ -134,6 +134,18 @@ public class TextToPdfContentTransformerTest
transformTextAndCheck("UTF-16", true, false, "ff fe 00 31 00 20 00 49");
}
@Test
public void testUTF8WithBOM() throws Exception
{
transformTextAndCheck("UTF-8", null, true, "ef bb bf 31 20 49 20 6d");
}
@Test
public void testUTF8WithoutBOM() throws Exception
{
transformTextAndCheck("UTF-8", null, false, "31 20 49 20 6d 75 73 74");
}
/**
* @param encoding to be used to read the source file
* @param bigEndian indicates that the file should contain big endian characters, so typically the first byte of
@@ -143,7 +155,7 @@ public class TextToPdfContentTransformerTest
* @param expectedByteOrder The first few bytes of the source file so we can check the test data has been
* correctly created.
*/
-protected void transformTextAndCheck(String encoding, boolean bigEndian, Boolean validBom,
+protected void transformTextAndCheck(String encoding, Boolean bigEndian, Boolean validBom,
String expectedByteOrder) throws Exception
{
transformTextAndCheckImpl(-1, encoding, bigEndian, validBom, expectedByteOrder);
@@ -244,6 +256,12 @@ public class TextToPdfContentTransformerTest
// Use a writer to use the required encoding
try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
{
// Add BOM to UTF-8 file
if (bigEndian == null && encoding != null && "UTF-8".equals(encoding.toUpperCase()) && validBom != null && validBom)
{
ow.append("\ufeff");
}
ow.append(content);
}
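
For reference, the byte prefix expected by testUTF8WithBOM can be reproduced with a few standalone lines. This is a sketch only: the class name is illustrative, and the content string is merely a stand-in matching the first bytes shown in the tests ("31 20 49 20 6d 75 73 74" decodes to "1 I must"); the real fixture text is not shown in this diff.

import java.io.ByteArrayOutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;

// Illustrative sketch: appending "\ufeff" through a UTF-8 writer emits the
// three BOM bytes EF BB BF in front of the content, which is exactly the
// prefix checked by testUTF8WithBOM.
public class Utf8BomDemo
{
    public static void main(String[] args) throws Exception
    {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try (OutputStreamWriter ow = new OutputStreamWriter(out, StandardCharsets.UTF_8))
        {
            ow.append('\ufeff');       // the BOM, encoded as ef bb bf
            ow.append("1 I must ..."); // stand-in for the real test content
        }

        StringBuilder hex = new StringBuilder();
        for (byte b : out.toByteArray())
        {
            hex.append(String.format("%02x ", b & 0xff));
        }
        System.out.println(hex.toString().trim()); // "ef bb bf 31 20 49 20 6d ..."
    }
}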