[MNT-22398] Handle UTF-8 BOM

This commit is contained in:
Tiago Salvado
2022-08-29 16:21:54 +01:00
parent e0e925b6bd
commit b3cbf91102
3 changed files with 206 additions and 117 deletions

View File

@@ -23,6 +23,7 @@ branches:
- /^HF\/.+$/ - /^HF\/.+$/
- /^ATS-.*$/ - /^ATS-.*$/
- /^ACS-.*$/ - /^ACS-.*$/
- /^MNT-.*$/
stages: stages:
- name: Veracode Scan - name: Veracode Scan

View File

@@ -70,6 +70,11 @@ public class TextToPdfContentTransformer implements SelectableTransformer
private static final byte FE = (byte) 0xFE; private static final byte FE = (byte) 0xFE;
private static final byte FF = (byte) 0xFF; private static final byte FF = (byte) 0xFF;
private static final int UTF8_READ_AHEAD_BYTES = 3;
private static final byte EF = (byte) 0xEF;
private static final byte BB = (byte) 0xBB;
private static final byte BF = (byte) 0xBF;
public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT; public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT;
private final PagedTextToPDF transformer; private final PagedTextToPDF transformer;
@@ -153,19 +158,40 @@ public class TextToPdfContentTransformer implements SelectableTransformer
} }
if (charset != null) if (charset != null)
{ {
// Handles the situation where there is a BOM even though the encoding indicates that normally
// there should not be one for UTF-16BE and UTF-16LE. For extra flexibility includes UTF-16 too
// which optionally has the BOM. Rather than look at the BOM we look at the number of zero bytes
// in the first few characters. XML files even when not in European languages tend to have more
// even zero bytes when big-endian encoded and more odd zero bytes when little-endian.
// Think of: <?xml version="1.0"?> The normal Java decoder does not have this flexibility but
// other transformers do.
String name = charset.displayName(); String name = charset.displayName();
if ("UTF-16".equals(name) || "UTF-16BE".equals(name) || "UTF-16LE".equals(name)) if ("UTF-16".equals(name) || "UTF-16BE".equals(name) || "UTF-16LE".equals(name))
{ {
logger.debug("Handle big and little endian UTF-16 text. Using UTF-16 rather than encoding " + name); logger.debug("Handle big and little endian UTF-16 text. Using UTF-16 rather than encoding " + name);
charset = Charset.forName("UTF-16"); charset = Charset.forName("UTF-16");
is = new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES) is = handleUTF16BOM(is);
}
else if ("UTF-8".equals(name))
{
logger.debug("Using UTF-8");
charset = Charset.forName("UTF-8");
is = handleUTF8BOM(is);
}
logger.debug("Processing plain text in encoding " + name);
return new InputStreamReader(is, charset);
}
}
// Fall back on the system default
logger.debug("Processing plain text using system default encoding");
return new InputStreamReader(is);
}
/**
* Handles the situation where there is a BOM even though the encoding indicates that normally there should not be
* one for UTF-16BE and UTF-16LE. For extra flexibility includes UTF-16 too which optionally has the BOM. Rather
 * than look at the BOM we look at the number of zero bytes in the first few characters. XML files even when not in
* European languages tend to have more even zero bytes when big-endian encoded and more odd zero bytes when
* little-endian. Think of: <?xml version="1.0"?> The normal Java decoder does not have this flexibility but other
* transformers do.
*/
private InputStream handleUTF16BOM(InputStream is)
{
return new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES)
{ {
boolean bomRead; boolean bomRead;
boolean switchByteOrder; boolean switchByteOrder;
@@ -275,14 +301,58 @@ public class TextToPdfContentTransformer implements SelectableTransformer
} }
}; };
} }
logger.debug("Processing plain text in encoding " + name);
return new InputStreamReader(is, charset); /**
* Skips the BOM for UTF-8 encoding
*/
private InputStream handleUTF8BOM(InputStream is)
{
return new PushbackInputStream(is, UTF8_READ_AHEAD_BYTES)
{
boolean bomRead;
@Override
public int read(byte[] bytes, int off, int len) throws IOException
{
int i = 0;
int b = 0;
for (; i < len; i++)
{
b = read();
if (b == -1)
{
break;
}
bytes[off + i] = (byte) b;
}
return i == 0 && b == -1 ? -1 : i;
}
@Override
public int read() throws IOException
{
if (!bomRead)
{
bomRead = true;
byte[] bytes = new byte[UTF8_READ_AHEAD_BYTES];
int end = in.read(bytes, 0, UTF8_READ_AHEAD_BYTES);
if (bytes[0] == EF && bytes[1] == BB && bytes[2] == BF)
{
logger.warn("UTF-8 BOM detected");
}
else
{
for (int i = end - 1; i >= 0; i--)
{
unread(bytes[i]);
}
} }
} }
// Fall back on the system default return super.read();
logger.debug("Processing plain text using system default encoding"); }
return new InputStreamReader(is); };
} }
private static class PagedTextToPDF extends TextToPDF private static class PagedTextToPDF extends TextToPDF

View File

@@ -134,6 +134,18 @@ public class TextToPdfContentTransformerTest
transformTextAndCheck("UTF-16", true, false, "ff fe 00 31 00 20 00 49"); transformTextAndCheck("UTF-16", true, false, "ff fe 00 31 00 20 00 49");
} }
@Test
public void testUTF8WithBOM() throws Exception
{
transformTextAndCheck("UTF-8", null, true, "ef bb bf 31 20 49 20 6d");
}
@Test
public void testUTF8WithoutBOM() throws Exception
{
transformTextAndCheck("UTF-8", null, false, "31 20 49 20 6d 75 73 74");
}
/** /**
* @param encoding to be used to read the source file * @param encoding to be used to read the source file
* @param bigEndian indicates that the file should contain big endian characters, so typically the first byte of * @param bigEndian indicates that the file should contain big endian characters, so typically the first byte of
@@ -143,7 +155,7 @@ public class TextToPdfContentTransformerTest
* @param expectedByteOrder The first few bytes of the source file so we can check the test data has been * @param expectedByteOrder The first few bytes of the source file so we can check the test data has been
* correctly created. * correctly created.
*/ */
protected void transformTextAndCheck(String encoding, boolean bigEndian, Boolean validBom, protected void transformTextAndCheck(String encoding, Boolean bigEndian, Boolean validBom,
String expectedByteOrder) throws Exception String expectedByteOrder) throws Exception
{ {
transformTextAndCheckImpl(-1, encoding, bigEndian, validBom, expectedByteOrder); transformTextAndCheckImpl(-1, encoding, bigEndian, validBom, expectedByteOrder);
@@ -244,6 +256,12 @@ public class TextToPdfContentTransformerTest
// Use a writer to use the required encoding // Use a writer to use the required encoding
try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding)) try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
{ {
// Add BOM to UTF-8 file
if (bigEndian == null && encoding != null && "UTF-8".equals(encoding.toUpperCase()) && validBom != null && validBom)
{
ow.append("\ufeff");
}
ow.append(content); ow.append(content);
} }