diff --git a/.travis.yml b/.travis.yml index dd243fdf..9d40d693 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,6 +23,7 @@ branches: - /^HF\/.+$/ - /^ATS-.*$/ - /^ACS-.*$/ + - /^MNT-.*$/ stages: - name: Veracode Scan diff --git a/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/transformers/TextToPdfContentTransformer.java b/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/transformers/TextToPdfContentTransformer.java index 253d1c88..444f669a 100644 --- a/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/transformers/TextToPdfContentTransformer.java +++ b/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/transformers/TextToPdfContentTransformer.java @@ -70,6 +70,11 @@ public class TextToPdfContentTransformer implements SelectableTransformer private static final byte FE = (byte) 0xFE; private static final byte FF = (byte) 0xFF; + private static final int UTF8_READ_AHEAD_BYTES = 3; + private static final byte EF = (byte) 0xEF; + private static final byte BB = (byte) 0xBB; + private static final byte BF = (byte) 0xBF; + public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT; private final PagedTextToPDF transformer; @@ -165,115 +170,13 @@ public class TextToPdfContentTransformer implements SelectableTransformer { logger.debug("Handle big and little endian UTF-16 text. 
Using UTF-16 rather than encoding " + name); charset = Charset.forName("UTF-16"); - is = new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES) - { - boolean bomRead; - boolean switchByteOrder; - boolean evenByte = true; - - @Override - public int read(byte[] bytes, int off, int len) throws IOException - { - int i = 0; - int b = 0; - for (; i oddZeros) - { - if (bytes[0] == FF && bytes[1] == FE) - { - switchByteOrder = true; - switchBom = true; - logger.warn("Little-endian BOM FFFE read, but characters are big-endian"); - } - else - { - logger.debug("More even zero bytes, so normal read for big-endian"); - } - } - else - { - if (bytes[0] == FE && bytes[1] == FF) - { - switchBom = true; - logger.debug("Big-endian BOM FEFF read, but characters are little-endian"); - } - else - { - switchByteOrder = true; - logger.debug("More odd zero bytes, so switch bytes from little-endian"); - } - } - - if (switchBom) - { - byte b = bytes[0]; - bytes[0] = bytes[1]; - bytes[1] = b; - } - - for (int i = end-1; i>=0; i--) - { - unread(bytes[i]); - } - } - - if (switchByteOrder) - { - if (evenByte) - { - int b1 = super.read(); - int b2 = super.read(); - if (b1 != -1) - { - unread(b1); - } - if (b2 != -1) - { - unread(b2); - } - } - evenByte = !evenByte; - } - - return super.read(); - } - - // Counts the number of even or odd 00 bytes - private int countZeros(byte[] b, int offset) - { - int count = 0; - for (int i=offset; i= 0; i--) + { + unread(bytes[i]); + } + } + } + + return super.read(); + } + }; + } + + /** + * Handles the situation where there is a BOM even though the encoding indicates that normally there should not be + * one for UTF-16BE and UTF-16LE. For extra flexibility includes UTF-16 too which optionally has the BOM. Rather + * than look at the BOM we look at the number of zero bytes in the first few characters. XML files even when not in + * European languages tend to have more even zero bytes when big-endian encoded and more odd zero bytes when + * little-endian. 
Think of: <?xml version="1.0"?> The normal Java decoder does not have this flexibility but other + * transformers do. + */ + private InputStream handleUTF16BOM(InputStream is) + { + return new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES) + { + boolean bomRead; + boolean switchByteOrder; + boolean evenByte = true; + + @Override + public int read(byte[] bytes, int off, int len) throws IOException + { + int i = 0; + int b = 0; + for (; i < len; i++) + { + b = read(); + if (b == -1) + { + break; + } + bytes[off + i] = (byte) b; + } + return i == 0 && b == -1 ? -1 : i; + } + + @Override + public int read() throws IOException + { + if (!bomRead) + { + bomRead = true; + boolean switchBom = false; + byte[] bytes = new byte[UTF16_READ_AHEAD_BYTES]; + int end = in.read(bytes, 0, UTF16_READ_AHEAD_BYTES); + int evenZeros = countZeros(bytes, 0); + int oddZeros = countZeros(bytes, 1); + if (evenZeros > oddZeros) + { + if (bytes[0] == FF && bytes[1] == FE) + { + switchByteOrder = true; + switchBom = true; + logger.warn("Little-endian BOM FFFE read, but characters are big-endian"); + } + else + { + logger.debug("More even zero bytes, so normal read for big-endian"); + } + } + else + { + if (bytes[0] == FE && bytes[1] == FF) + { + switchBom = true; + logger.debug("Big-endian BOM FEFF read, but characters are little-endian"); + } + else + { + switchByteOrder = true; + logger.debug("More odd zero bytes, so switch bytes from little-endian"); + } + } + + if (switchBom) + { + byte b = bytes[0]; + bytes[0] = bytes[1]; + bytes[1] = b; + } + + for (int i = end - 1; i >= 0; i--) + { + unread(bytes[i]); + } + } + + if (switchByteOrder) + { + if (evenByte) + { + int b1 = super.read(); + int b2 = super.read(); + if (b1 != -1) + { + unread(b1); + } + if (b2 != -1) + { + unread(b2); + } + } + evenByte = !evenByte; + } + + return super.read(); + } + + // Counts the number of even or odd 00 bytes + private int countZeros(byte[] b, int offset) + { + int count = 0; + for (int i = offset; i < UTF16_READ_AHEAD_BYTES; i 
+= 2) + { + if (b[i] == 0) + { + count++; + } + } + return count; + } + }; + } } diff --git a/alfresco-transform-misc/alfresco-transform-misc/src/test/java/org/alfresco/transformer/transformers/TextToPdfContentTransformerTest.java b/alfresco-transform-misc/alfresco-transform-misc/src/test/java/org/alfresco/transformer/transformers/TextToPdfContentTransformerTest.java index 5fa08d79..807863d3 100644 --- a/alfresco-transform-misc/alfresco-transform-misc/src/test/java/org/alfresco/transformer/transformers/TextToPdfContentTransformerTest.java +++ b/alfresco-transform-misc/alfresco-transform-misc/src/test/java/org/alfresco/transformer/transformers/TextToPdfContentTransformerTest.java @@ -134,6 +134,18 @@ public class TextToPdfContentTransformerTest transformTextAndCheck("UTF-16", true, false, "ff fe 00 31 00 20 00 49"); } + @Test + public void testUTF8WithBOM() throws Exception + { + transformTextAndCheck("UTF-8", null, true, "ef bb bf 31 20 49 20 6d"); + } + + @Test + public void testUTF8WithoutBOM() throws Exception + { + transformTextAndCheck("UTF-8", null, false, "31 20 49 20 6d 75 73 74"); + } + /** * @param encoding to be used to read the source file * @param bigEndian indicates that the file should contain big endian characters, so typically the first byte of @@ -143,7 +155,7 @@ public class TextToPdfContentTransformerTest * @param expectedByteOrder The first few bytes of the source file so we can check the test data has been * correctly created. 
*/ - protected void transformTextAndCheck(String encoding, boolean bigEndian, Boolean validBom, + protected void transformTextAndCheck(String encoding, Boolean bigEndian, Boolean validBom, String expectedByteOrder) throws Exception { transformTextAndCheckImpl(-1, encoding, bigEndian, validBom, expectedByteOrder); @@ -244,6 +256,12 @@ public class TextToPdfContentTransformerTest // Use a writer to use the required encoding try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding)) { + // Add BOM to UTF-8 file + if (bigEndian == null && encoding != null && "UTF-8".equals(encoding.toUpperCase()) && validBom != null && validBom) + { + ow.append("\ufeff"); + } + ow.append(content); }