Merge pull request #665 from Alfresco/MNT-22398_handle-utf-bom

[MNT-22398] Handle UTF-8 BOM
tiagosalvado10 2022-09-12 11:49:50 +01:00 committed by GitHub
commit ea83ef9ebc
3 changed files with 206 additions and 110 deletions


@@ -23,6 +23,7 @@ branches:
- /^HF\/.+$/
- /^ATS-.*$/
- /^ACS-.*$/
- /^MNT-.*$/
stages:
- name: Veracode Scan


@@ -70,6 +70,11 @@ public class TextToPdfContentTransformer implements SelectableTransformer
private static final byte FE = (byte) 0xFE;
private static final byte FF = (byte) 0xFF;
private static final int UTF8_READ_AHEAD_BYTES = 3;
private static final byte EF = (byte) 0xEF;
private static final byte BB = (byte) 0xBB;
private static final byte BF = (byte) 0xBF;
public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT;
private final PagedTextToPDF transformer;
@@ -165,115 +170,13 @@ public class TextToPdfContentTransformer implements SelectableTransformer
{
logger.debug("Handle big and little endian UTF-16 text. Using UTF-16 rather than encoding " + name);
charset = Charset.forName("UTF-16");
-is = new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES)
-{
-boolean bomRead;
-boolean switchByteOrder;
-boolean evenByte = true;
-@Override
-public int read(byte[] bytes, int off, int len) throws IOException
-{
-int i = 0;
-int b = 0;
-for (; i<len; i++)
-{
-b = read();
-if (b == -1)
-{
-break;
-}
-bytes[off+i] = (byte)b;
-}
-return i == 0 && b == -1 ? -1 : i;
-}
-@Override
-public int read() throws IOException
-{
-if (!bomRead)
-{
-bomRead = true;
-boolean switchBom = false;
-byte[] bytes = new byte[UTF16_READ_AHEAD_BYTES];
-int end = in.read(bytes, 0, UTF16_READ_AHEAD_BYTES);
-int evenZeros = countZeros(bytes, 0);
-int oddZeros = countZeros(bytes, 1);
-if (evenZeros > oddZeros)
-{
-if (bytes[0] == FF && bytes[1] == FE)
-{
-switchByteOrder = true;
-switchBom = true;
-logger.warn("Little-endian BOM FFFE read, but characters are big-endian");
-}
-else
-{
-logger.debug("More even zero bytes, so normal read for big-endian");
-}
-}
-else
-{
-if (bytes[0] == FE && bytes[1] == FF)
-{
-switchBom = true;
-logger.debug("Big-endian BOM FEFF read, but characters are little-endian");
-}
-else
-{
-switchByteOrder = true;
-logger.debug("More odd zero bytes, so switch bytes from little-endian");
-}
-}
-if (switchBom)
-{
-byte b = bytes[0];
-bytes[0] = bytes[1];
-bytes[1] = b;
-}
-for (int i = end-1; i>=0; i--)
-{
-unread(bytes[i]);
-}
-}
-if (switchByteOrder)
-{
-if (evenByte)
-{
-int b1 = super.read();
-int b2 = super.read();
-if (b1 != -1)
-{
-unread(b1);
-}
-if (b2 != -1)
-{
-unread(b2);
-}
-}
-evenByte = !evenByte;
-}
-return super.read();
-}
-// Counts the number of even or odd 00 bytes
-private int countZeros(byte[] b, int offset)
-{
-int count = 0;
-for (int i=offset; i<UTF16_READ_AHEAD_BYTES; i+=2)
-{
-if (b[i] == 0)
-{
-count++;
-}
-}
-return count;
-}
-};
+is = handleUTF16BOM(is);
+}
+else if ("UTF-8".equals(name))
+{
+logger.debug("Using UTF-8");
+charset = Charset.forName("UTF-8");
+is = handleUTF8BOM(is);
}
logger.debug("Processing plain text in encoding " + name);
return new InputStreamReader(is, charset);
@@ -445,4 +348,178 @@ public class TextToPdfContentTransformer implements SelectableTransformer
throw new IllegalArgumentException(paramName + " parameter must be an integer.");
}
}
/**
* Skips the BOM character for UTF-8 encoding
*/
private InputStream handleUTF8BOM(InputStream is)
{
return new PushbackInputStream(is, UTF8_READ_AHEAD_BYTES)
{
boolean bomRead;
@Override
public int read(byte[] bytes, int off, int len) throws IOException
{
int i = 0;
int b = 0;
for (; i < len; i++)
{
b = read();
if (b == -1)
{
break;
}
bytes[off + i] = (byte) b;
}
return i == 0 && b == -1 ? -1 : i;
}
@Override
public int read() throws IOException
{
if (!bomRead)
{
bomRead = true;
byte[] bytes = new byte[UTF8_READ_AHEAD_BYTES];
int end = in.read(bytes, 0, UTF8_READ_AHEAD_BYTES);
if (bytes[0] == EF && bytes[1] == BB && bytes[2] == BF)
{
logger.warn("UTF-8 BOM detected, it will be skipped");
}
else
{
for (int i = end - 1; i >= 0; i--)
{
unread(bytes[i]);
}
}
}
return super.read();
}
};
}
/**
* Handles the situation where there is a BOM even though the encoding indicates that normally there should not be
* one for UTF-16BE and UTF-16LE. For extra flexibility it includes UTF-16 too, which optionally has the BOM. Rather
* than look at the BOM, we look at the number of zero bytes in the first few characters. XML files, even when not in
* European languages, tend to have more even zero bytes when big-endian encoded and more odd zero bytes when
* little-endian. Think of: <?xml version="1.0"?> The normal Java decoder does not have this flexibility but other
* transformers do.
*/
private InputStream handleUTF16BOM(InputStream is)
{
return new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES)
{
boolean bomRead;
boolean switchByteOrder;
boolean evenByte = true;
@Override
public int read(byte[] bytes, int off, int len) throws IOException
{
int i = 0;
int b = 0;
for (; i < len; i++)
{
b = read();
if (b == -1)
{
break;
}
bytes[off + i] = (byte) b;
}
return i == 0 && b == -1 ? -1 : i;
}
@Override
public int read() throws IOException
{
if (!bomRead)
{
bomRead = true;
boolean switchBom = false;
byte[] bytes = new byte[UTF16_READ_AHEAD_BYTES];
int end = in.read(bytes, 0, UTF16_READ_AHEAD_BYTES);
int evenZeros = countZeros(bytes, 0);
int oddZeros = countZeros(bytes, 1);
if (evenZeros > oddZeros)
{
if (bytes[0] == FF && bytes[1] == FE)
{
switchByteOrder = true;
switchBom = true;
logger.warn("Little-endian BOM FFFE read, but characters are big-endian");
}
else
{
logger.debug("More even zero bytes, so normal read for big-endian");
}
}
else
{
if (bytes[0] == FE && bytes[1] == FF)
{
switchBom = true;
logger.debug("Big-endian BOM FEFF read, but characters are little-endian");
}
else
{
switchByteOrder = true;
logger.debug("More odd zero bytes, so switch bytes from little-endian");
}
}
if (switchBom)
{
byte b = bytes[0];
bytes[0] = bytes[1];
bytes[1] = b;
}
for (int i = end - 1; i >= 0; i--)
{
unread(bytes[i]);
}
}
if (switchByteOrder)
{
if (evenByte)
{
int b1 = super.read();
int b2 = super.read();
if (b1 != -1)
{
unread(b1);
}
if (b2 != -1)
{
unread(b2);
}
}
evenByte = !evenByte;
}
return super.read();
}
// Counts the number of even or odd 00 bytes
private int countZeros(byte[] b, int offset)
{
int count = 0;
for (int i = offset; i < UTF16_READ_AHEAD_BYTES; i += 2)
{
if (b[i] == 0)
{
count++;
}
}
return count;
}
};
}
}
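
The zero-byte heuristic described in the handleUTF16BOM Javadoc can be seen on a concrete string. The following standalone sketch is illustrative only (the class and method names are not part of the transformer), and it counts zeros over the whole byte array rather than just the first UTF16_READ_AHEAD_BYTES bytes, but the effect is the same:

import java.nio.charset.StandardCharsets;

// Illustrative sketch: for ASCII-heavy text such as an XML declaration,
// UTF-16BE puts the 0x00 byte of each code unit at even offsets, while
// UTF-16LE puts it at odd offsets. Counting them reveals the byte order.
public class Utf16ZeroByteDemo
{
    // Counts 0x00 bytes at even (offset 0) or odd (offset 1) positions.
    private static int countZeros(byte[] bytes, int offset)
    {
        int count = 0;
        for (int i = offset; i < bytes.length; i += 2)
        {
            if (bytes[i] == 0)
            {
                count++;
            }
        }
        return count;
    }

    public static void main(String[] args)
    {
        String text = "<?xml version=\"1.0\"?>";
        byte[] bigEndian = text.getBytes(StandardCharsets.UTF_16BE);    // 00 3C 00 3F ...
        byte[] littleEndian = text.getBytes(StandardCharsets.UTF_16LE); // 3C 00 3F 00 ...

        // Prints 21 even zeros and 0 odd zeros for this 21-character big-endian string ...
        System.out.println("BE even=" + countZeros(bigEndian, 0) + " odd=" + countZeros(bigEndian, 1));
        // ... and the mirror image (0 even, 21 odd) for little-endian.
        System.out.println("LE even=" + countZeros(littleEndian, 0) + " odd=" + countZeros(littleEndian, 1));
    }
}

In the transformer itself only the first UTF16_READ_AHEAD_BYTES bytes are inspected, which is already enough to tip the balance one way or the other for typical text.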


@@ -134,6 +134,18 @@ public class TextToPdfContentTransformerTest
transformTextAndCheck("UTF-16", true, false, "ff fe 00 31 00 20 00 49");
}
@Test
public void testUTF8WithBOM() throws Exception
{
transformTextAndCheck("UTF-8", null, true, "ef bb bf 31 20 49 20 6d");
}
@Test
public void testUTF8WithoutBOM() throws Exception
{
transformTextAndCheck("UTF-8", null, false, "31 20 49 20 6d 75 73 74");
}
/**
* @param encoding to be used to read the source file
* @param bigEndian indicates that the file should contain big endian characters, so typically the first byte of
@@ -143,7 +155,7 @@ public class TextToPdfContentTransformerTest
* @param expectedByteOrder The first few bytes of the source file so we can check the test data has been
* correctly created.
*/
-protected void transformTextAndCheck(String encoding, boolean bigEndian, Boolean validBom,
+protected void transformTextAndCheck(String encoding, Boolean bigEndian, Boolean validBom,
String expectedByteOrder) throws Exception
{
transformTextAndCheckImpl(-1, encoding, bigEndian, validBom, expectedByteOrder);
@@ -244,6 +256,12 @@ public class TextToPdfContentTransformerTest
// Use a writer to use the required encoding
try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
{
// Add BOM to UTF-8 file
if (bigEndian == null && encoding != null && "UTF-8".equals(encoding.toUpperCase()) && validBom != null && validBom)
{
ow.append("\ufeff");
}
ow.append(content);
}
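
For reference, the byte prefix expected by testUTF8WithBOM can be reproduced with a few standalone lines. This is a sketch only: the class name is illustrative, and the content string is merely a stand-in matching the first bytes shown in the tests ("31 20 49 20 6d 75 73 74" decodes to "1 I must"); the real fixture text is not shown in this diff.

import java.io.ByteArrayOutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;

// Illustrative sketch: appending "\ufeff" through a UTF-8 writer emits the
// three BOM bytes EF BB BF in front of the content, which is exactly the
// prefix checked by testUTF8WithBOM.
public class Utf8BomDemo
{
    public static void main(String[] args) throws Exception
    {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try (OutputStreamWriter ow = new OutputStreamWriter(out, StandardCharsets.UTF_8))
        {
            ow.append('\ufeff');       // the BOM, encoded as ef bb bf
            ow.append("1 I must ..."); // stand-in for the real test content
        }

        StringBuilder hex = new StringBuilder();
        for (byte b : out.toByteArray())
        {
            hex.append(String.format("%02x ", b & 0xff));
        }
        System.out.println(hex.toString().trim()); // "ef bb bf 31 20 49 20 6d ..."
    }
}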