[MNT-22398] Handle UTF-8 BOM

This commit is contained in:
Tiago Salvado
2022-08-29 16:21:54 +01:00
parent e0e925b6bd
commit b3cbf91102
3 changed files with 206 additions and 117 deletions

View File

@@ -23,6 +23,7 @@ branches:
- /^HF\/.+$/ - /^HF\/.+$/
- /^ATS-.*$/ - /^ATS-.*$/
- /^ACS-.*$/ - /^ACS-.*$/
- /^MNT-.*$/
stages: stages:
- name: Veracode Scan - name: Veracode Scan

View File

@@ -70,6 +70,11 @@ public class TextToPdfContentTransformer implements SelectableTransformer
private static final byte FE = (byte) 0xFE; private static final byte FE = (byte) 0xFE;
private static final byte FF = (byte) 0xFF; private static final byte FF = (byte) 0xFF;
private static final int UTF8_READ_AHEAD_BYTES = 3;
private static final byte EF = (byte) 0xEF;
private static final byte BB = (byte) 0xBB;
private static final byte BF = (byte) 0xBF;
public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT; public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT;
private final PagedTextToPDF transformer; private final PagedTextToPDF transformer;
@@ -153,19 +158,40 @@ public class TextToPdfContentTransformer implements SelectableTransformer
} }
if (charset != null) if (charset != null)
{ {
// Handles the situation where there is a BOM even though the encoding indicates that normally
// there should not be one for UTF-16BE and UTF-16LE. For extra flexibility includes UTF-16 too
// which optionally has the BOM. Rather than look at the BOM we look at the number of zero bytes
// in the first few characters. XML files even when not in European languages tend to have more
// even zero bytes when big-endian encoded and more odd zero bytes when little-endian.
// Think of: <?xml version="1.0"?> The normal Java decoder does not have this flexibility but
// other transformers do.
String name = charset.displayName(); String name = charset.displayName();
if ("UTF-16".equals(name) || "UTF-16BE".equals(name) || "UTF-16LE".equals(name)) if ("UTF-16".equals(name) || "UTF-16BE".equals(name) || "UTF-16LE".equals(name))
{ {
logger.debug("Handle big and little endian UTF-16 text. Using UTF-16 rather than encoding " + name); logger.debug("Handle big and little endian UTF-16 text. Using UTF-16 rather than encoding " + name);
charset = Charset.forName("UTF-16"); charset = Charset.forName("UTF-16");
is = new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES) is = handleUTF16BOM(is);
}
else if ("UTF-8".equals(name))
{
logger.debug("Using UTF-8");
charset = Charset.forName("UTF-8");
is = handleUTF8BOM(is);
}
logger.debug("Processing plain text in encoding " + name);
return new InputStreamReader(is, charset);
}
}
// Fall back on the system default
logger.debug("Processing plain text using system default encoding");
return new InputStreamReader(is);
}
/**
* Handles the situation where there is a BOM even though the encoding indicates that normally there should not be
* one for UTF-16BE and UTF-16LE. For extra flexibility includes UTF-16 too which optionally has the BOM. Rather
 * than look at the BOM we look at the number of zero bytes in the first few characters. XML files even when not in
* European languages tend to have more even zero bytes when big-endian encoded and more odd zero bytes when
* little-endian. Think of: <?xml version="1.0"?> The normal Java decoder does not have this flexibility but other
* transformers do.
*/
private InputStream handleUTF16BOM(InputStream is)
{
return new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES)
{ {
boolean bomRead; boolean bomRead;
boolean switchByteOrder; boolean switchByteOrder;
@@ -275,14 +301,58 @@ public class TextToPdfContentTransformer implements SelectableTransformer
} }
}; };
} }
logger.debug("Processing plain text in encoding " + name);
return new InputStreamReader(is, charset); /**
* Skips the BOM for UTF-8 encoding
*/
private InputStream handleUTF8BOM(InputStream is)
{
return new PushbackInputStream(is, UTF8_READ_AHEAD_BYTES)
{
boolean bomRead;
@Override
public int read(byte[] bytes, int off, int len) throws IOException
{
int i = 0;
int b = 0;
for (; i < len; i++)
{
b = read();
if (b == -1)
{
break;
}
bytes[off + i] = (byte) b;
}
return i == 0 && b == -1 ? -1 : i;
}
@Override
public int read() throws IOException
{
if (!bomRead)
{
bomRead = true;
byte[] bytes = new byte[UTF8_READ_AHEAD_BYTES];
int end = in.read(bytes, 0, UTF8_READ_AHEAD_BYTES);
if (bytes[0] == EF && bytes[1] == BB && bytes[2] == BF)
{
logger.warn("UTF-8 BOM detected");
}
else
{
for (int i = end - 1; i >= 0; i--)
{
unread(bytes[i]);
}
} }
} }
// Fall back on the system default return super.read();
logger.debug("Processing plain text using system default encoding"); }
return new InputStreamReader(is); };
} }
private static class PagedTextToPDF extends TextToPDF private static class PagedTextToPDF extends TextToPDF

View File

@@ -134,6 +134,18 @@ public class TextToPdfContentTransformerTest
transformTextAndCheck("UTF-16", true, false, "ff fe 00 31 00 20 00 49"); transformTextAndCheck("UTF-16", true, false, "ff fe 00 31 00 20 00 49");
} }
@Test
public void testUTF8WithBOM() throws Exception
{
transformTextAndCheck("UTF-8", null, true, "ef bb bf 31 20 49 20 6d");
}
@Test
public void testUTF8WithoutBOM() throws Exception
{
transformTextAndCheck("UTF-8", null, false, "31 20 49 20 6d 75 73 74");
}
/** /**
* @param encoding to be used to read the source file * @param encoding to be used to read the source file
* @param bigEndian indicates that the file should contain big endian characters, so typically the first byte of * @param bigEndian indicates that the file should contain big endian characters, so typically the first byte of
@@ -143,7 +155,7 @@ public class TextToPdfContentTransformerTest
* @param expectedByteOrder The first few bytes of the source file so we can check the test data has been * @param expectedByteOrder The first few bytes of the source file so we can check the test data has been
* correctly created. * correctly created.
*/ */
protected void transformTextAndCheck(String encoding, boolean bigEndian, Boolean validBom, protected void transformTextAndCheck(String encoding, Boolean bigEndian, Boolean validBom,
String expectedByteOrder) throws Exception String expectedByteOrder) throws Exception
{ {
transformTextAndCheckImpl(-1, encoding, bigEndian, validBom, expectedByteOrder); transformTextAndCheckImpl(-1, encoding, bigEndian, validBom, expectedByteOrder);
@@ -244,6 +256,12 @@ public class TextToPdfContentTransformerTest
// Use a writer to use the required encoding // Use a writer to use the required encoding
try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding)) try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
{ {
// Add BOM to UTF-8 file
if (bigEndian == null && encoding != null && "UTF-8".equals(encoding.toUpperCase()) && validBom != null && validBom)
{
ow.append("\ufeff");
}
ow.append(content); ow.append(content);
} }