mirror of
https://github.com/Alfresco/alfresco-transform-core.git
synced 2025-08-14 17:58:27 +00:00
[MNT-22398] Handle UTF-8 BOM
This commit is contained in:
@@ -23,6 +23,7 @@ branches:
|
|||||||
- /^HF\/.+$/
|
- /^HF\/.+$/
|
||||||
- /^ATS-.*$/
|
- /^ATS-.*$/
|
||||||
- /^ACS-.*$/
|
- /^ACS-.*$/
|
||||||
|
- /^MNT-.*$/
|
||||||
|
|
||||||
stages:
|
stages:
|
||||||
- name: Veracode Scan
|
- name: Veracode Scan
|
||||||
|
@@ -70,6 +70,11 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
|||||||
private static final byte FE = (byte) 0xFE;
|
private static final byte FE = (byte) 0xFE;
|
||||||
private static final byte FF = (byte) 0xFF;
|
private static final byte FF = (byte) 0xFF;
|
||||||
|
|
||||||
|
private static final int UTF8_READ_AHEAD_BYTES = 3;
|
||||||
|
private static final byte EF = (byte) 0xEF;
|
||||||
|
private static final byte BB = (byte) 0xBB;
|
||||||
|
private static final byte BF = (byte) 0xBF;
|
||||||
|
|
||||||
public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT;
|
public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT;
|
||||||
|
|
||||||
private final PagedTextToPDF transformer;
|
private final PagedTextToPDF transformer;
|
||||||
@@ -153,19 +158,40 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
|||||||
}
|
}
|
||||||
if (charset != null)
|
if (charset != null)
|
||||||
{
|
{
|
||||||
// Handles the situation where there is a BOM even though the encoding indicates that normally
|
|
||||||
// there should not be one for UTF-16BE and UTF-16LE. For extra flexibility includes UTF-16 too
|
|
||||||
// which optionally has the BOM. Rather than look at the BOM we look at the number of zero bytes
|
|
||||||
// in the first few characters. XML files even when not in European languages tend to have more
|
|
||||||
// even zero bytes when big-endian encoded and more odd zero bytes when little-endian.
|
|
||||||
// Think of: <?xml version="1.0"?> The normal Java decoder does not have this flexibility but
|
|
||||||
// other transformers do.
|
|
||||||
String name = charset.displayName();
|
String name = charset.displayName();
|
||||||
if ("UTF-16".equals(name) || "UTF-16BE".equals(name) || "UTF-16LE".equals(name))
|
if ("UTF-16".equals(name) || "UTF-16BE".equals(name) || "UTF-16LE".equals(name))
|
||||||
{
|
{
|
||||||
logger.debug("Handle big and little endian UTF-16 text. Using UTF-16 rather than encoding " + name);
|
logger.debug("Handle big and little endian UTF-16 text. Using UTF-16 rather than encoding " + name);
|
||||||
charset = Charset.forName("UTF-16");
|
charset = Charset.forName("UTF-16");
|
||||||
is = new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES)
|
is = handleUTF16BOM(is);
|
||||||
|
}
|
||||||
|
else if ("UTF-8".equals(name))
|
||||||
|
{
|
||||||
|
logger.debug("Using UTF-8");
|
||||||
|
charset = Charset.forName("UTF-8");
|
||||||
|
is = handleUTF8BOM(is);
|
||||||
|
}
|
||||||
|
logger.debug("Processing plain text in encoding " + name);
|
||||||
|
return new InputStreamReader(is, charset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fall back on the system default
|
||||||
|
logger.debug("Processing plain text using system default encoding");
|
||||||
|
return new InputStreamReader(is);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Handles the situation where there is a BOM even though the encoding indicates that normally there should not be
|
||||||
|
* one for UTF-16BE and UTF-16LE. For extra flexibility includes UTF-16 too which optionally has the BOM. Rather
|
||||||
|
* than look at the BOM we look at the number of zero bytes in the first few characters. XML files even when not in
|
||||||
|
* European languages tend to have more even zero bytes when big-endian encoded and more odd zero bytes when
|
||||||
|
* little-endian. Think of: <?xml version="1.0"?> The normal Java decoder does not have this flexibility but other
|
||||||
|
* transformers do.
|
||||||
|
*/
|
||||||
|
private InputStream handleUTF16BOM(InputStream is)
|
||||||
|
{
|
||||||
|
return new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES)
|
||||||
{
|
{
|
||||||
boolean bomRead;
|
boolean bomRead;
|
||||||
boolean switchByteOrder;
|
boolean switchByteOrder;
|
||||||
@@ -275,14 +301,58 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
logger.debug("Processing plain text in encoding " + name);
|
|
||||||
return new InputStreamReader(is, charset);
|
/**
|
||||||
|
* Skips a leading UTF-8 BOM (bytes EF BB BF) if present; otherwise the bytes read ahead are pushed back unchanged.
|
||||||
|
*/
|
||||||
|
private InputStream handleUTF8BOM(InputStream is)
|
||||||
|
{
|
||||||
|
return new PushbackInputStream(is, UTF8_READ_AHEAD_BYTES)
|
||||||
|
{
|
||||||
|
boolean bomRead;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int read(byte[] bytes, int off, int len) throws IOException
|
||||||
|
{
|
||||||
|
int i = 0;
|
||||||
|
int b = 0;
|
||||||
|
for (; i < len; i++)
|
||||||
|
{
|
||||||
|
b = read();
|
||||||
|
if (b == -1)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
bytes[off + i] = (byte) b;
|
||||||
|
}
|
||||||
|
return i == 0 && b == -1 ? -1 : i;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int read() throws IOException
|
||||||
|
{
|
||||||
|
if (!bomRead)
|
||||||
|
{
|
||||||
|
bomRead = true;
|
||||||
|
byte[] bytes = new byte[UTF8_READ_AHEAD_BYTES];
|
||||||
|
int end = in.read(bytes, 0, UTF8_READ_AHEAD_BYTES);
|
||||||
|
|
||||||
|
if (bytes[0] == EF && bytes[1] == BB && bytes[2] == BF)
|
||||||
|
{
|
||||||
|
logger.warn("UTF-8 BOM detected");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (int i = end - 1; i >= 0; i--)
|
||||||
|
{
|
||||||
|
unread(bytes[i]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fall back on the system default
|
return super.read();
|
||||||
logger.debug("Processing plain text using system default encoding");
|
}
|
||||||
return new InputStreamReader(is);
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class PagedTextToPDF extends TextToPDF
|
private static class PagedTextToPDF extends TextToPDF
|
||||||
|
@@ -134,6 +134,18 @@ public class TextToPdfContentTransformerTest
|
|||||||
transformTextAndCheck("UTF-16", true, false, "ff fe 00 31 00 20 00 49");
|
transformTextAndCheck("UTF-16", true, false, "ff fe 00 31 00 20 00 49");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testUTF8WithBOM() throws Exception
|
||||||
|
{
|
||||||
|
transformTextAndCheck("UTF-8", null, true, "ef bb bf 31 20 49 20 6d");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testUTF8WithoutBOM() throws Exception
|
||||||
|
{
|
||||||
|
transformTextAndCheck("UTF-8", null, false, "31 20 49 20 6d 75 73 74");
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param encoding to be used to read the source file
|
* @param encoding to be used to read the source file
|
||||||
* @param bigEndian indicates that the file should contain big endian characters, so typically the first byte of
|
* @param bigEndian indicates that the file should contain big endian characters, so typically the first byte of
|
||||||
@@ -143,7 +155,7 @@ public class TextToPdfContentTransformerTest
|
|||||||
* @param expectedByteOrder The first few bytes of the source file so we can check the test data has been
|
* @param expectedByteOrder The first few bytes of the source file so we can check the test data has been
|
||||||
* correctly created.
|
* correctly created.
|
||||||
*/
|
*/
|
||||||
protected void transformTextAndCheck(String encoding, boolean bigEndian, Boolean validBom,
|
protected void transformTextAndCheck(String encoding, Boolean bigEndian, Boolean validBom,
|
||||||
String expectedByteOrder) throws Exception
|
String expectedByteOrder) throws Exception
|
||||||
{
|
{
|
||||||
transformTextAndCheckImpl(-1, encoding, bigEndian, validBom, expectedByteOrder);
|
transformTextAndCheckImpl(-1, encoding, bigEndian, validBom, expectedByteOrder);
|
||||||
@@ -244,6 +256,12 @@ public class TextToPdfContentTransformerTest
|
|||||||
// Use a writer to use the required encoding
|
// Use a writer to use the required encoding
|
||||||
try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
|
try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
|
||||||
{
|
{
|
||||||
|
// Add BOM to UTF-8 file
|
||||||
|
if (bigEndian == null && encoding != null && "UTF-8".equals(encoding.toUpperCase()) && validBom != null && validBom)
|
||||||
|
{
|
||||||
|
ow.append("\ufeff");
|
||||||
|
}
|
||||||
|
|
||||||
ow.append(content);
|
ow.append(content);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user