MNT-20626 XML files having UTF-16LE and UTF-16BE can't be previewed (#331)

* Introduce more flexible reading of UTF-16 data, where there may be a BOM, but the
   spec says there should not be one, or the BOM is clearly wrong when looking at the
   following characters. The https://en.wikipedia.org/wiki/UTF-16 write up is nice and clear.
* Includes identical correction in data setup in AIOTransformRegistryTest and
   MiscControllerTest for a problem found in TextToPdfContentTransformerTest.
 * Includes upgrade to latest pdfbox: 2.0.22
This commit is contained in:
Alan Davis
2021-02-09 19:04:34 +00:00
committed by GitHub
parent 97b9fc39cf
commit 2766c23431
4 changed files with 349 additions and 23 deletions

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -280,7 +280,7 @@ public class AIOTransformRegistryTest
int cutoff = pageLimit * pageLength;
for (int i = 1; i <= lines; i++)
{
sb.append(i);
sb.append(Integer.toString(i));
sb.append(" I must not talk in class or feed my homework to my cat.\n");
if (i == cutoff)
checkText = sb.toString();

View File

@@ -408,7 +408,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
StringBuilder sb = new StringBuilder();
for (int i = 1; i <= 5; i++)
{
sb.append(i);
sb.append(Integer.toString(i));
sb.append(" I must not talk in class or feed my homework to my cat.\n");
}
sb.append("\nBart\n");

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -44,6 +44,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.HashMap;
@@ -63,6 +64,10 @@ public class TextToPdfContentTransformer implements SelectableTransformer
{
private static final Logger logger = LoggerFactory.getLogger(TextToPdfContentTransformer.class);
private static final int UTF16_READ_AHEAD_BYTES = 16; // 8 characters including BOM if it exists
private static final byte FE = (byte) 0xFE;
private static final byte FF = (byte) 0xFF;
public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT;
private final PagedTextToPDF transformer;
@@ -146,7 +151,129 @@ public class TextToPdfContentTransformer implements SelectableTransformer
}
if (charset != null)
{
logger.debug("Processing plain text in encoding " + charset.displayName());
// Handles the situation where there is a BOM even though the encoding indicates that normally
// there should not be one for UTF-16BE and UTF-16LE. For extra flexibility includes UTF-16 too
// which optionally has the BOM. Rather than look at the BOM we look at the number of zero bytes
// in the first few characters. XML files even when not in European languages tend to have more
// even zero bytes when big-endian encoded and more odd zero bytes when little-endian.
// Think of: <?xml version="1.0"?> The normal Java decoder does not have this flexibility but
// other transformers do.
String name = charset.displayName();
if ("UTF-16".equals(name) || "UTF-16BE".equals(name) || "UTF-16LE".equals(name))
{
logger.debug("Handle big and little endian UTF-16 text. Using UTF-16 rather than encoding " + name);
charset = Charset.forName("UTF-16");
is = new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES)
{
boolean bomRead;
boolean switchByteOrder;
boolean evenByte = true;
@Override
public int read(byte[] buffer, int offset, int length) throws IOException
{
    // Funnel everything through the single-byte read() so the byte-order
    // fix-up logic is applied uniformly to each byte handed to the caller.
    int copied = 0;
    int lastByte = 0;
    while (copied < length)
    {
        lastByte = read();
        if (lastByte == -1)
        {
            break;
        }
        buffer[offset + copied] = (byte) lastByte;
        copied++;
    }
    // End of stream before anything was copied gives -1; otherwise the count.
    // A zero-length request still returns 0, as the InputStream contract requires.
    return copied == 0 && lastByte == -1 ? -1 : copied;
}
// Single-byte read that, on the very first call, sniffs the stream to work out the
// real byte order and thereafter (when needed) swaps each pair of bytes so the
// "UTF-16" decoder sees a consistently ordered stream.
@Override
public int read() throws IOException
{
// First call only: read ahead, decide endianness from the zero-byte distribution,
// then push all the bytes back so nothing is lost.
if (!bomRead)
{
bomRead = true;
boolean switchBom = false;
byte[] bytes = new byte[UTF16_READ_AHEAD_BYTES];
int end = in.read(bytes, 0, UTF16_READ_AHEAD_BYTES);
// Heuristic: English-like UTF-16BE text has zeros at even indexes, UTF-16LE at
// odd indexes (think of: <?xml version="1.0"?>).
int evenZeros = countZeros(bytes, 0);
int oddZeros = countZeros(bytes, 1);
if (evenZeros > oddZeros)
{
// Characters look big-endian.
if (bytes[0] == FF && bytes[1] == FE)
{
// Little-endian BOM contradicts the characters: trust the characters and
// swap every byte pair (including the BOM) from here on.
switchByteOrder = true;
switchBom = true;
logger.warn("Little-endian BOM FFFE read, but characters are big-endian");
}
else
{
logger.debug("More even zero bytes, so normal read for big-endian");
}
}
else
{
// Characters look little-endian.
if (bytes[0] == FE && bytes[1] == FF)
{
// Big-endian BOM contradicts the characters: reverse just the BOM so it
// matches the little-endian characters.
switchBom = true;
logger.debug("Big-endian BOM FEFF read, but characters are little-endian");
}
else
{
switchByteOrder = true;
logger.debug("More odd zero bytes, so switch bytes from little-endian");
}
}
if (switchBom)
{
// Reverse the two BOM bytes in the look-ahead buffer before pushing it back.
byte b = bytes[0];
bytes[0] = bytes[1];
bytes[1] = b;
}
// Push the look-ahead back in reverse so it is re-read in original order.
// If end is -1 (empty stream) nothing is pushed back and super.read() returns -1.
for (int i = end-1; i>=0; i--)
{
unread(bytes[i]);
}
}
if (switchByteOrder)
{
// At the start of each byte pair, read both bytes and push them back so that this
// call and the next one return them in swapped order.
// NOTE(review): if the stream has an odd number of bytes, the final lone byte is
// returned unswapped — confirm this is acceptable for truncated input.
if (evenByte)
{
int b1 = super.read();
int b2 = super.read();
if (b1 != -1)
{
unread(b1);
}
if (b2 != -1)
{
unread(b2);
}
}
evenByte = !evenByte;
}
return super.read();
}
// Counts the 00 bytes at even positions (offset 0) or odd positions (offset 1)
// within the read-ahead buffer, used to guess the byte order of UTF-16 text.
// NOTE(review): this always scans all UTF16_READ_AHEAD_BYTES entries even when the
// stream supplied fewer bytes, so the zero-initialised tail of the buffer is counted
// as zeros at both parities — apparently harmless for the comparison, but confirm.
private int countZeros(byte[] b, int offset)
{
int count = 0;
for (int i=offset; i<UTF16_READ_AHEAD_BYTES; i+=2)
{
if (b[i] == 0)
{
count++;
}
}
return count;
}
};
}
logger.debug("Processing plain text in encoding " + name);
return new InputStreamReader(is, charset);
}
}
@@ -196,7 +323,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
public PDDocument createPDFFromText(Reader text, int pageLimit)
throws IOException
{
//int pageLimit = (int)pageLimits.getValue();
PDDocument doc = null;
int pageCount = 0;
try
@@ -207,7 +333,7 @@ public class TextToPdfContentTransformer implements SelectableTransformer
//calculate font height and increase by 5 percent.
height = height * getFontSize() * 1.05f;
doc = new PDDocument();
BufferedReader data = new BufferedReader(text);
BufferedReader data = (text instanceof BufferedReader) ? (BufferedReader) text : new BufferedReader(text);
String nextLine;
PDPage page = new PDPage();
PDPageContentStream contentStream = null;
@@ -220,7 +346,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
outer:
while ((nextLine = data.readLine()) != null)
{
// The input text is nonEmpty. New pages will be created and added
// to the PDF document as they are needed, depending on the length of
// the text.
@@ -252,8 +377,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
int test = pageCount + 1;
if (pageLimit > 0 && (pageCount++ >= pageLimit))
{
// pageLimits.getAction().throwIOExceptionIfRequired("Page limit ("+pageLimit+
// ") reached.", transformerDebug);
break outer;
}
@@ -272,7 +395,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
y = page.getMediaBox().getHeight() - margin + height;
contentStream.moveTextPositionByAmount(margin, y);
}
//System.out.println( "Drawing string at " + x + "," + y );
if (contentStream == null)
{

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -31,14 +31,20 @@ import org.apache.pdfbox.text.PDFTextStripper;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.util.HashMap;
import java.util.Map;
import static org.alfresco.transformer.util.RequestParamMap.PAGE_LIMIT;
import static org.alfresco.transformer.util.RequestParamMap.SOURCE_ENCODING;
import static org.junit.jupiter.api.Assertions.assertEquals;
public class TextToPdfContentTransformerTest
@@ -76,39 +82,125 @@ public class TextToPdfContentTransformerTest
transformTextAndCheckPageLength(50);
}
private void transformTextAndCheckPageLength(int pageLimit) throws Exception
@Test
public void test1UTF16BigEndianBomBigEndianChars() throws Exception
{
    // 1. BOM indicates BE (fe then ff) + chars appear to be BE (as the first byte read tends to be a zero).
    // Expected with UTF-16. Some systems use BE while others, like Windows and Mac, use LE.
    // Whichever encoding name is supplied, correctly matched BE data must pass through unchanged.
    // (Removed a duplicated, identical "UTF-16" invocation — the call is deterministic,
    // so running it twice with the same arguments added nothing.)
    String expectedByteOrder = "fe ff 00 31 00 20 00 49";
    transformTextAndCheck("UTF-16", true, true, expectedByteOrder);
    transformTextAndCheck("UTF-16BE", true, true, expectedByteOrder);
    transformTextAndCheck("UTF-16LE", true, true, expectedByteOrder);
}
@Test
public void test2UTF16LittleEndianBomLittleEndianChars() throws Exception
{
    // 2. BOM indicates LE (ff then fe) and the characters are LE too (the second byte
    // of each char tends to be zero). Expected with UTF-16: some systems write BE,
    // while others, such as Windows and Mac, write LE.
    String expectedByteOrder = "ff fe 31 00 20 00 49 00";
    transformTextAndCheck("UTF-16", false, true, expectedByteOrder);
}
@Test
public void test3UTF16NoBomBigEndianChars() throws Exception
{
    // 3. No BOM, characters big-endian (the first byte of each char tends to be zero).
    // This is exactly what UTF-16BE data should look like.
    String expectedByteOrder = "00 31 00 20 00 49";
    transformTextAndCheck("UTF-16", true, null, expectedByteOrder);
}
@Test
public void test4UTF16NoBomLittleEndianChars() throws Exception
{
    // 4. No BOM, characters little-endian (the second byte of each char tends to be zero).
    // This is exactly what UTF-16LE data should look like.
    String expectedByteOrder = "31 00 20 00 49 00";
    transformTextAndCheck("UTF-16", false, null, expectedByteOrder);
}
@Test
public void test5UTF16BigEndianBomLittleEndianChars() throws Exception
{
    // 5. BOM claims BE (fe then ff) but the characters are clearly LE (second byte of
    // each char tends to be zero). The data is broken, but the transformer should
    // trust the characters and decode as LE.
    String expectedByteOrder = "fe ff 31 00 20 00 49 00";
    transformTextAndCheck("UTF-16", false, false, expectedByteOrder);
}
@Test
public void test6UTF16LittleEndianBomBigEndianChars() throws Exception
{
    // 6. BOM claims LE (ff then fe) but the characters are clearly BE (first byte of
    // each char tends to be zero). The data is broken, but the transformer should
    // trust the characters and decode as BE.
    String expectedByteOrder = "ff fe 00 31 00 20 00 49";
    transformTextAndCheck("UTF-16", true, false, expectedByteOrder);
}
/**
 * Transforms a generated text file and checks the resulting PDF, with no page limit applied.
 *
 * @param encoding to be used to read the source file
 * @param bigEndian indicates that the file should contain big endian characters, so typically the first byte of
 *                  each char is a zero when using English.
 * @param validBom if not null, the BOM is included. If true it is the one matching bigEndian. If false it is the
 *                 opposite byte order, which really is an error, but we try to recover from it.
 * @param expectedByteOrder The first few bytes of the source file so we can check the test data has been
 *                  correctly created.
 */
protected void transformTextAndCheck(String encoding, boolean bigEndian, Boolean validBom,
String expectedByteOrder) throws Exception
{
// -1 disables the page limit so only the encoding handling is exercised.
transformTextAndCheckImpl(-1, encoding, bigEndian, validBom, expectedByteOrder);
}
/**
 * Transforms plain UTF-8 test text and checks the page-limit handling; no BOM or
 * byte-order variations are involved (hence the null bigEndian/validBom/byte-order args).
 */
protected void transformTextAndCheckPageLength(int pageLimit) throws Exception
{
transformTextAndCheckImpl(pageLimit, "UTF-8", null, null, null);
}
/**
 * Shared implementation: builds the test text, writes it to a temp file in the requested
 * encoding/byte order, sanity-checks the raw bytes, then transforms and verifies the PDF.
 */
private void transformTextAndCheckImpl(int pageLimit, String encoding, Boolean bigEndian, Boolean validBom,
        String expectedByteOrder) throws Exception
{
    // Build the source text and the text expected to survive any page limit.
    StringBuilder builder = new StringBuilder();
    String expectedText = createTestText(pageLimit, builder);
    String content = builder.toString();

    // Write the source file, confirm its first bytes match the scenario being tested,
    // then run the transformation and check the output.
    File source = File.createTempFile("AlfrescoTestSource_", ".txt");
    writeToFile(source, content, encoding, bigEndian, validBom);
    checkFileBytes(source, expectedByteOrder);
    transformTextAndCheck(source, encoding, expectedText, String.valueOf(pageLimit));
}
/**
 * Fills {@code sb} with numbered lines of test text and returns the cleaned text that is
 * expected to survive the supplied page limit ({@code pageLimit <= 0} means no limit, so
 * the whole text is expected).
 *
 * NOTE(review): the original span contained interleaved old/new diff lines (a duplicate
 * {@code StringBuilder sb} declaration and a duplicate {@code sb.append(i)}) that would
 * not compile; this is the reconstructed post-change version.
 */
private String createTestText(int pageLimit, StringBuilder sb)
{
    int pageLength = 32;
    // Generate a little more than pageLimit pages worth of lines (or one page's worth).
    int lines = (pageLength + 10) * ((pageLimit > 0) ? pageLimit : 1);
    String checkText = null;
    int cutoff = pageLimit * pageLength;
    for (int i = 1; i <= lines; i++)
    {
        sb.append(Integer.toString(i));
        sb.append(" I must not talk in class or feed my homework to my cat.\n");
        if (i == cutoff)
        {
            // Snapshot the text expected to fit within the page limit.
            checkText = sb.toString();
        }
    }
    sb.append("\nBart\n");
    String text = sb.toString();
    checkText = checkText == null ? clean(text) : clean(checkText);
    return checkText;
}
private void transformTextAndCheck(String text, String encoding, String checkText,
private void transformTextAndCheck(File sourceFile, String encoding, String checkText,
String pageLimit) throws Exception
{
// Get a reader for the text
File sourceFile = File.createTempFile("AlfrescoTestSource_", ".txt");
writeToFile(sourceFile, text, encoding);
// And a temp writer
File targetFile = File.createTempFile("AlfrescoTestTarget_", ".pdf");
// Transform to PDF
Map<String, String> parameters = new HashMap<>();
parameters.put(PAGE_LIMIT, pageLimit);
parameters.put(SOURCE_ENCODING, encoding);
transformer.transform("text/plain", "application/pdf", parameters, sourceFile, targetFile);
// Read back in the PDF and check it
@@ -138,11 +230,123 @@ public class TextToPdfContentTransformerTest
return text;
}
private void writeToFile(File file, String content, String encoding) throws Exception
private void writeToFile(File file, String content, String encoding, Boolean bigEndian, Boolean validBom) throws Exception
{
// If we may have to change the endian or include/exclude the BOM, write initially to a tmp file using
// UTF-16 which includes the BOM FEFF.
File originalFile = file;
if (bigEndian != null)
{
file = File.createTempFile("AlfrescoTestTmpSrc_", ".txt");
encoding = "UTF-16";
}
// Use a writer to use the required encoding
try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
{
ow.append(content);
}
// If we may have to change the endian or include/exclude the BOM, copy the raw bytes to the supplied file
if (bigEndian != null)
{
boolean firstRead = true;
byte[] bytes = new byte[8192];
try (InputStream is = new BufferedInputStream(new FileInputStream(file));
OutputStream os = new BufferedOutputStream(new FileOutputStream(originalFile)))
{
int l;
int off;
boolean switchBytes = false;
do
{
l = is.read(bytes);
off = 0;
// When we read the first block, change the offset if we don't want the BOM and also work out
// if the byte endian need to be switch. The source bytes allways start with a standard BOM.
if (firstRead)
{
firstRead = false;
boolean actualEndianBytes = bytes[0] == (byte)0xfe; // if true [1] would also be 0xff
switchBytes = actualEndianBytes != bigEndian;
if (validBom == null)
{
// Strip the BOM
off = 2;
}
else if (!validBom)
{
// Reverse the BOM so it does not match the characters!
byte aByte = bytes[0];
bytes[0] = bytes[1];
bytes[1] = aByte;
}
}
int len = l - off;
if (len > 0)
{
if (switchBytes)
{
// Reverse the byte order of characters including the BOM.
for (int i=0; i<l; i+=2)
{
byte aByte = bytes[i];
bytes[i] = bytes[i+1];
bytes[i+1] = aByte;
}
}
os.write(bytes, off, len-off);
}
} while (l != -1);
}
}
}
/**
 * Check the first few bytes in the source file match what we planned to use later as test data.
 *
 * @param sourceFile file whose leading bytes are checked
 * @param expectedByteOrder space-separated hex of the expected leading bytes, or null to skip the check
 */
private void checkFileBytes(File sourceFile, String expectedByteOrder) throws Exception
{
    if (expectedByteOrder != null)
    {
        byte[] expectedBytes = hexToBytes(expectedByteOrder);
        int l = expectedBytes.length;
        byte[] actualBytes = new byte[l];
        // try-with-resources: the original leaked the FileInputStream.
        try (FileInputStream is = new FileInputStream(sourceFile))
        {
            // A single read() may return fewer bytes than requested; loop until l bytes or EOF.
            int read = 0;
            while (read < l)
            {
                int count = is.read(actualBytes, read, l - read);
                if (count == -1)
                {
                    break;
                }
                read += count;
            }
        }
        String actualByteOrder = bytesToHex(actualBytes);
        assertEquals(expectedByteOrder, actualByteOrder, "The sourceFile does not contain the expected bytes");
    }
}
/**
 * Decodes a string of hex digit pairs (spaces ignored) into the corresponding bytes,
 * e.g. "fe ff 00" becomes {0xfe, 0xff, 0x00}.
 */
private byte[] hexToBytes(String hexString)
{
    // Strip all spaces, then turn each pair of hex digits into one byte.
    String digits = hexString.replaceAll(" *", "");
    int len = digits.length() / 2;
    byte[] decoded = new byte[len];
    for (int i = 0; i < len; i++)
    {
        int hi = Character.digit(digits.charAt(2 * i), 16);
        int lo = Character.digit(digits.charAt(2 * i + 1), 16);
        decoded[i] = (byte) ((hi << 4) + lo);
    }
    return decoded;
}
/**
 * Encodes bytes as lower-case, space-separated hex digit pairs, e.g. {0xfe, 0xff}
 * becomes "fe ff" — the inverse of {@code hexToBytes}.
 */
private String bytesToHex(byte[] bytes)
{
    // StringBuilder rather than StringBuffer: no synchronization is needed here.
    StringBuilder sb = new StringBuilder();
    for (byte b : bytes)
    {
        if (sb.length() > 0)
        {
            sb.append(' ');
        }
        sb.append(Character.forDigit((b >> 4) & 0xF, 16));
        sb.append(Character.forDigit(b & 0xF, 16));
    }
    return sb.toString();
}
}