mirror of
https://github.com/Alfresco/alfresco-transform-core.git
synced 2025-05-12 17:04:48 +00:00
Merge pull request #665 from Alfresco/MNT-22398_handle-utf-bom
[MNT-22398] Handle UTF-8 BOM
This commit is contained in:
commit
ea83ef9ebc
@ -23,6 +23,7 @@ branches:
|
||||
- /^HF\/.+$/
|
||||
- /^ATS-.*$/
|
||||
- /^ACS-.*$/
|
||||
- /^MNT-.*$/
|
||||
|
||||
stages:
|
||||
- name: Veracode Scan
|
||||
|
@ -70,6 +70,11 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
||||
private static final byte FE = (byte) 0xFE;
|
||||
private static final byte FF = (byte) 0xFF;
|
||||
|
||||
private static final int UTF8_READ_AHEAD_BYTES = 3;
|
||||
private static final byte EF = (byte) 0xEF;
|
||||
private static final byte BB = (byte) 0xBB;
|
||||
private static final byte BF = (byte) 0xBF;
|
||||
|
||||
public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT;
|
||||
|
||||
private final PagedTextToPDF transformer;
|
||||
@ -165,115 +170,13 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
||||
{
|
||||
logger.debug("Handle big and little endian UTF-16 text. Using UTF-16 rather than encoding " + name);
|
||||
charset = Charset.forName("UTF-16");
|
||||
is = new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES)
|
||||
{
|
||||
boolean bomRead;
|
||||
boolean switchByteOrder;
|
||||
boolean evenByte = true;
|
||||
|
||||
@Override
|
||||
public int read(byte[] bytes, int off, int len) throws IOException
|
||||
{
|
||||
int i = 0;
|
||||
int b = 0;
|
||||
for (; i<len; i++)
|
||||
{
|
||||
b = read();
|
||||
if (b == -1)
|
||||
{
|
||||
break;
|
||||
is = handleUTF16BOM(is);
|
||||
}
|
||||
bytes[off+i] = (byte)b;
|
||||
}
|
||||
return i == 0 && b == -1 ? -1 : i;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException
|
||||
else if ("UTF-8".equals(name))
|
||||
{
|
||||
if (!bomRead)
|
||||
{
|
||||
bomRead = true;
|
||||
boolean switchBom = false;
|
||||
byte[] bytes = new byte[UTF16_READ_AHEAD_BYTES];
|
||||
int end = in.read(bytes, 0, UTF16_READ_AHEAD_BYTES);
|
||||
int evenZeros = countZeros(bytes, 0);
|
||||
int oddZeros = countZeros(bytes, 1);
|
||||
if (evenZeros > oddZeros)
|
||||
{
|
||||
if (bytes[0] == FF && bytes[1] == FE)
|
||||
{
|
||||
switchByteOrder = true;
|
||||
switchBom = true;
|
||||
logger.warn("Little-endian BOM FFFE read, but characters are big-endian");
|
||||
}
|
||||
else
|
||||
{
|
||||
logger.debug("More even zero bytes, so normal read for big-endian");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (bytes[0] == FE && bytes[1] == FF)
|
||||
{
|
||||
switchBom = true;
|
||||
logger.debug("Big-endian BOM FEFF read, but characters are little-endian");
|
||||
}
|
||||
else
|
||||
{
|
||||
switchByteOrder = true;
|
||||
logger.debug("More odd zero bytes, so switch bytes from little-endian");
|
||||
}
|
||||
}
|
||||
|
||||
if (switchBom)
|
||||
{
|
||||
byte b = bytes[0];
|
||||
bytes[0] = bytes[1];
|
||||
bytes[1] = b;
|
||||
}
|
||||
|
||||
for (int i = end-1; i>=0; i--)
|
||||
{
|
||||
unread(bytes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (switchByteOrder)
|
||||
{
|
||||
if (evenByte)
|
||||
{
|
||||
int b1 = super.read();
|
||||
int b2 = super.read();
|
||||
if (b1 != -1)
|
||||
{
|
||||
unread(b1);
|
||||
}
|
||||
if (b2 != -1)
|
||||
{
|
||||
unread(b2);
|
||||
}
|
||||
}
|
||||
evenByte = !evenByte;
|
||||
}
|
||||
|
||||
return super.read();
|
||||
}
|
||||
|
||||
// Counts the number of even or odd 00 bytes
|
||||
private int countZeros(byte[] b, int offset)
|
||||
{
|
||||
int count = 0;
|
||||
for (int i=offset; i<UTF16_READ_AHEAD_BYTES; i+=2)
|
||||
{
|
||||
if (b[i] == 0)
|
||||
{
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
};
|
||||
logger.debug("Using UTF-8");
|
||||
charset = Charset.forName("UTF-8");
|
||||
is = handleUTF8BOM(is);
|
||||
}
|
||||
logger.debug("Processing plain text in encoding " + name);
|
||||
return new InputStreamReader(is, charset);
|
||||
@ -445,4 +348,178 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
||||
throw new IllegalArgumentException(paramName + " parameter must be an integer.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Skips the BOM character for UTF-8 encoding
|
||||
*/
|
||||
private InputStream handleUTF8BOM(InputStream is)
|
||||
{
|
||||
return new PushbackInputStream(is, UTF8_READ_AHEAD_BYTES)
|
||||
{
|
||||
boolean bomRead;
|
||||
|
||||
@Override
|
||||
public int read(byte[] bytes, int off, int len) throws IOException
|
||||
{
|
||||
int i = 0;
|
||||
int b = 0;
|
||||
for (; i < len; i++)
|
||||
{
|
||||
b = read();
|
||||
if (b == -1)
|
||||
{
|
||||
break;
|
||||
}
|
||||
bytes[off + i] = (byte) b;
|
||||
}
|
||||
return i == 0 && b == -1 ? -1 : i;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException
|
||||
{
|
||||
if (!bomRead)
|
||||
{
|
||||
bomRead = true;
|
||||
byte[] bytes = new byte[UTF8_READ_AHEAD_BYTES];
|
||||
int end = in.read(bytes, 0, UTF8_READ_AHEAD_BYTES);
|
||||
|
||||
if (bytes[0] == EF && bytes[1] == BB && bytes[2] == BF)
|
||||
{
|
||||
logger.warn("UTF-8 BOM detected, it will be skipped");
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = end - 1; i >= 0; i--)
|
||||
{
|
||||
unread(bytes[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return super.read();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Handles the situation where there is a BOM even though the encoding indicates that normally there should not be
|
||||
* one for UTF-16BE and UTF-16LE. For extra flexibility includes UTF-16 too which optionally has the BOM. Rather
|
||||
* than look at the BOM we look at the number of zero bytes in the first few character. XML files even when not in
|
||||
* European languages tend to have more even zero bytes when big-endian encoded and more odd zero bytes when
|
||||
* little-endian. Think of: <?xml version="1.0"?> The normal Java decoder does not have this flexibility but other
|
||||
* transformers do.
|
||||
*/
|
||||
private InputStream handleUTF16BOM(InputStream is)
|
||||
{
|
||||
return new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES)
|
||||
{
|
||||
boolean bomRead;
|
||||
boolean switchByteOrder;
|
||||
boolean evenByte = true;
|
||||
|
||||
@Override
|
||||
public int read(byte[] bytes, int off, int len) throws IOException
|
||||
{
|
||||
int i = 0;
|
||||
int b = 0;
|
||||
for (; i < len; i++)
|
||||
{
|
||||
b = read();
|
||||
if (b == -1)
|
||||
{
|
||||
break;
|
||||
}
|
||||
bytes[off + i] = (byte) b;
|
||||
}
|
||||
return i == 0 && b == -1 ? -1 : i;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException
|
||||
{
|
||||
if (!bomRead)
|
||||
{
|
||||
bomRead = true;
|
||||
boolean switchBom = false;
|
||||
byte[] bytes = new byte[UTF16_READ_AHEAD_BYTES];
|
||||
int end = in.read(bytes, 0, UTF16_READ_AHEAD_BYTES);
|
||||
int evenZeros = countZeros(bytes, 0);
|
||||
int oddZeros = countZeros(bytes, 1);
|
||||
if (evenZeros > oddZeros)
|
||||
{
|
||||
if (bytes[0] == FF && bytes[1] == FE)
|
||||
{
|
||||
switchByteOrder = true;
|
||||
switchBom = true;
|
||||
logger.warn("Little-endian BOM FFFE read, but characters are big-endian");
|
||||
}
|
||||
else
|
||||
{
|
||||
logger.debug("More even zero bytes, so normal read for big-endian");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (bytes[0] == FE && bytes[1] == FF)
|
||||
{
|
||||
switchBom = true;
|
||||
logger.debug("Big-endian BOM FEFF read, but characters are little-endian");
|
||||
}
|
||||
else
|
||||
{
|
||||
switchByteOrder = true;
|
||||
logger.debug("More odd zero bytes, so switch bytes from little-endian");
|
||||
}
|
||||
}
|
||||
|
||||
if (switchBom)
|
||||
{
|
||||
byte b = bytes[0];
|
||||
bytes[0] = bytes[1];
|
||||
bytes[1] = b;
|
||||
}
|
||||
|
||||
for (int i = end - 1; i >= 0; i--)
|
||||
{
|
||||
unread(bytes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (switchByteOrder)
|
||||
{
|
||||
if (evenByte)
|
||||
{
|
||||
int b1 = super.read();
|
||||
int b2 = super.read();
|
||||
if (b1 != -1)
|
||||
{
|
||||
unread(b1);
|
||||
}
|
||||
if (b2 != -1)
|
||||
{
|
||||
unread(b2);
|
||||
}
|
||||
}
|
||||
evenByte = !evenByte;
|
||||
}
|
||||
|
||||
return super.read();
|
||||
}
|
||||
|
||||
// Counts the number of even or odd 00 bytes
|
||||
private int countZeros(byte[] b, int offset)
|
||||
{
|
||||
int count = 0;
|
||||
for (int i = offset; i < UTF16_READ_AHEAD_BYTES; i += 2)
|
||||
{
|
||||
if (b[i] == 0)
|
||||
{
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
@ -134,6 +134,18 @@ public class TextToPdfContentTransformerTest
|
||||
transformTextAndCheck("UTF-16", true, false, "ff fe 00 31 00 20 00 49");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUTF8WithBOM() throws Exception
|
||||
{
|
||||
transformTextAndCheck("UTF-8", null, true, "ef bb bf 31 20 49 20 6d");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUTF8WithoutBOM() throws Exception
|
||||
{
|
||||
transformTextAndCheck("UTF-8", null, false, "31 20 49 20 6d 75 73 74");
|
||||
}
|
||||
|
||||
/**
|
||||
* @param encoding to be used to read the source file
|
||||
* @param bigEndian indicates that the file should contain big endian characters, so typically the first byte of
|
||||
@ -143,7 +155,7 @@ public class TextToPdfContentTransformerTest
|
||||
* @param expectedByteOrder The first few bytes of the source file so we can check the test data has been
|
||||
* correctly created.
|
||||
*/
|
||||
protected void transformTextAndCheck(String encoding, boolean bigEndian, Boolean validBom,
|
||||
protected void transformTextAndCheck(String encoding, Boolean bigEndian, Boolean validBom,
|
||||
String expectedByteOrder) throws Exception
|
||||
{
|
||||
transformTextAndCheckImpl(-1, encoding, bigEndian, validBom, expectedByteOrder);
|
||||
@ -244,6 +256,12 @@ public class TextToPdfContentTransformerTest
|
||||
// Use a writer to use the required encoding
|
||||
try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
|
||||
{
|
||||
// Add BOM to UTF-8 file
|
||||
if (bigEndian == null && encoding != null && "UTF-8".equals(encoding.toUpperCase()) && validBom != null && validBom)
|
||||
{
|
||||
ow.append("\ufeff");
|
||||
}
|
||||
|
||||
ow.append(content);
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user