mirror of
https://github.com/Alfresco/alfresco-transform-core.git
synced 2025-09-17 14:21:18 +00:00
MNT-20626 XML files having UTF-16LE and UTF-16BE can't be previewed (#331)
* Introduce more flexible reading of UTF-16 data, where there may be a BOM, but the spec says there should not be one, or the BOM is clearly wrong when looking at the following characters. The https://en.wikipedia.org/wiki/UTF-16 write up is nice and clear. * Includes identical correction in data setup in AIOTransformRegistryTest and MiscControllerTest for a problem found in TextToPdfContentTransformerTest. * Includes upgrade to latest pdfbox: 2.0.22
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -280,7 +280,7 @@ public class AIOTransformRegistryTest
|
||||
int cutoff = pageLimit * pageLength;
|
||||
for (int i = 1; i <= lines; i++)
|
||||
{
|
||||
sb.append(i);
|
||||
sb.append(Integer.toString(i));
|
||||
sb.append(" I must not talk in class or feed my homework to my cat.\n");
|
||||
if (i == cutoff)
|
||||
checkText = sb.toString();
|
||||
|
@@ -408,7 +408,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = 1; i <= 5; i++)
|
||||
{
|
||||
sb.append(i);
|
||||
sb.append(Integer.toString(i));
|
||||
sb.append(" I must not talk in class or feed my homework to my cat.\n");
|
||||
}
|
||||
sb.append("\nBart\n");
|
||||
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -44,6 +44,7 @@ import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStream;
|
||||
import java.io.PushbackInputStream;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.HashMap;
|
||||
@@ -63,6 +64,10 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(TextToPdfContentTransformer.class);
|
||||
|
||||
private static final int UTF16_READ_AHEAD_BYTES = 16; // 8 characters including BOM if it exists
|
||||
private static final byte FE = (byte) 0xFE;
|
||||
private static final byte FF = (byte) 0xFF;
|
||||
|
||||
public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT;
|
||||
|
||||
private final PagedTextToPDF transformer;
|
||||
@@ -146,7 +151,129 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
||||
}
|
||||
if (charset != null)
|
||||
{
|
||||
logger.debug("Processing plain text in encoding " + charset.displayName());
|
||||
// Handles the situation where there is a BOM even though the encoding indicates that normally
|
||||
// there should not be one for UTF-16BE and UTF-16LE. For extra flexibility includes UTF-16 too
|
||||
// which optionally has the BOM. Rather than look at the BOM we look at the number of zero bytes
|
||||
// in the first few characters. XML files even when not in European languages tend to have more
|
||||
// even zero bytes when big-endian encoded and more odd zero bytes when little-endian.
|
||||
// Think of: <?xml version="1.0"?> The normal Java decoder does not have this flexibility but
|
||||
// other transformers do.
|
||||
String name = charset.displayName();
|
||||
if ("UTF-16".equals(name) || "UTF-16BE".equals(name) || "UTF-16LE".equals(name))
|
||||
{
|
||||
logger.debug("Handle big and little endian UTF-16 text. Using UTF-16 rather than encoding " + name);
|
||||
charset = Charset.forName("UTF-16");
|
||||
is = new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES)
|
||||
{
|
||||
boolean bomRead;
|
||||
boolean switchByteOrder;
|
||||
boolean evenByte = true;
|
||||
|
||||
@Override
|
||||
public int read(byte[] bytes, int off, int len) throws IOException
|
||||
{
|
||||
int i = 0;
|
||||
int b = 0;
|
||||
for (; i<len; i++)
|
||||
{
|
||||
b = read();
|
||||
if (b == -1)
|
||||
{
|
||||
break;
|
||||
}
|
||||
bytes[off+i] = (byte)b;
|
||||
}
|
||||
return i == 0 && b == -1 ? -1 : i;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException
|
||||
{
|
||||
if (!bomRead)
|
||||
{
|
||||
bomRead = true;
|
||||
boolean switchBom = false;
|
||||
byte[] bytes = new byte[UTF16_READ_AHEAD_BYTES];
|
||||
int end = in.read(bytes, 0, UTF16_READ_AHEAD_BYTES);
|
||||
int evenZeros = countZeros(bytes, 0);
|
||||
int oddZeros = countZeros(bytes, 1);
|
||||
if (evenZeros > oddZeros)
|
||||
{
|
||||
if (bytes[0] == FF && bytes[1] == FE)
|
||||
{
|
||||
switchByteOrder = true;
|
||||
switchBom = true;
|
||||
logger.warn("Little-endian BOM FFFE read, but characters are big-endian");
|
||||
}
|
||||
else
|
||||
{
|
||||
logger.debug("More even zero bytes, so normal read for big-endian");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (bytes[0] == FE && bytes[1] == FF)
|
||||
{
|
||||
switchBom = true;
|
||||
logger.debug("Big-endian BOM FEFF read, but characters are little-endian");
|
||||
}
|
||||
else
|
||||
{
|
||||
switchByteOrder = true;
|
||||
logger.debug("More odd zero bytes, so switch bytes from little-endian");
|
||||
}
|
||||
}
|
||||
|
||||
if (switchBom)
|
||||
{
|
||||
byte b = bytes[0];
|
||||
bytes[0] = bytes[1];
|
||||
bytes[1] = b;
|
||||
}
|
||||
|
||||
for (int i = end-1; i>=0; i--)
|
||||
{
|
||||
unread(bytes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (switchByteOrder)
|
||||
{
|
||||
if (evenByte)
|
||||
{
|
||||
int b1 = super.read();
|
||||
int b2 = super.read();
|
||||
if (b1 != -1)
|
||||
{
|
||||
unread(b1);
|
||||
}
|
||||
if (b2 != -1)
|
||||
{
|
||||
unread(b2);
|
||||
}
|
||||
}
|
||||
evenByte = !evenByte;
|
||||
}
|
||||
|
||||
return super.read();
|
||||
}
|
||||
|
||||
// Counts the number of even or odd 00 bytes
|
||||
private int countZeros(byte[] b, int offset)
|
||||
{
|
||||
int count = 0;
|
||||
for (int i=offset; i<UTF16_READ_AHEAD_BYTES; i+=2)
|
||||
{
|
||||
if (b[i] == 0)
|
||||
{
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
};
|
||||
}
|
||||
logger.debug("Processing plain text in encoding " + name);
|
||||
return new InputStreamReader(is, charset);
|
||||
}
|
||||
}
|
||||
@@ -196,7 +323,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
||||
public PDDocument createPDFFromText(Reader text, int pageLimit)
|
||||
throws IOException
|
||||
{
|
||||
//int pageLimit = (int)pageLimits.getValue();
|
||||
PDDocument doc = null;
|
||||
int pageCount = 0;
|
||||
try
|
||||
@@ -207,7 +333,7 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
||||
//calculate font height and increase by 5 percent.
|
||||
height = height * getFontSize() * 1.05f;
|
||||
doc = new PDDocument();
|
||||
BufferedReader data = new BufferedReader(text);
|
||||
BufferedReader data = (text instanceof BufferedReader) ? (BufferedReader) text : new BufferedReader(text);
|
||||
String nextLine;
|
||||
PDPage page = new PDPage();
|
||||
PDPageContentStream contentStream = null;
|
||||
@@ -220,7 +346,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
||||
outer:
|
||||
while ((nextLine = data.readLine()) != null)
|
||||
{
|
||||
|
||||
// The input text is nonEmpty. New pages will be created and added
|
||||
// to the PDF document as they are needed, depending on the length of
|
||||
// the text.
|
||||
@@ -252,8 +377,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
||||
int test = pageCount + 1;
|
||||
if (pageLimit > 0 && (pageCount++ >= pageLimit))
|
||||
{
|
||||
// pageLimits.getAction().throwIOExceptionIfRequired("Page limit ("+pageLimit+
|
||||
// ") reached.", transformerDebug);
|
||||
break outer;
|
||||
}
|
||||
|
||||
@@ -272,7 +395,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
||||
y = page.getMediaBox().getHeight() - margin + height;
|
||||
contentStream.moveTextPositionByAmount(margin, y);
|
||||
}
|
||||
//System.out.println( "Drawing string at " + x + "," + y );
|
||||
|
||||
if (contentStream == null)
|
||||
{
|
||||
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -31,14 +31,20 @@ import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.StringWriter;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transformer.util.RequestParamMap.PAGE_LIMIT;
|
||||
import static org.alfresco.transformer.util.RequestParamMap.SOURCE_ENCODING;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
public class TextToPdfContentTransformerTest
|
||||
@@ -76,39 +82,125 @@ public class TextToPdfContentTransformerTest
|
||||
transformTextAndCheckPageLength(50);
|
||||
}
|
||||
|
||||
private void transformTextAndCheckPageLength(int pageLimit) throws Exception
|
||||
@Test
|
||||
public void test1UTF16BigEndianBomBigEndianChars() throws Exception
|
||||
{
|
||||
// 1. BOM indicates BE (fe then ff) + chars appear to be BE (as first byte read tends to be a zero)
|
||||
// Expected with UTF-16. Some systems use BE and other like Windows and Mac used LE
|
||||
String expectedByteOrder = "fe ff 00 31 00 20 00 49";
|
||||
transformTextAndCheck("UTF-16", true, true, expectedByteOrder);
|
||||
transformTextAndCheck("UTF-16", true, true, expectedByteOrder);
|
||||
transformTextAndCheck("UTF-16BE", true, true, expectedByteOrder);
|
||||
transformTextAndCheck("UTF-16LE", true, true, expectedByteOrder);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test2UTF16LittleEndianBomLittleEndianChars() throws Exception
|
||||
{
|
||||
// 2. BOM indicates LE (ff then fe) + chars appear to be LE (as second byte read tends to be a zero)
|
||||
// Expected with UTF-16. Some systems use BE and other like Windows and Mac used LE
|
||||
transformTextAndCheck("UTF-16", false, true, "ff fe 31 00 20 00 49 00");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test3UTF16NoBomBigEndianChars() throws Exception
|
||||
{
|
||||
// 3. No BOM + chars appear to be BE (as first byte read tends to be a zero)
|
||||
// Expected with UTF-16BE
|
||||
transformTextAndCheck("UTF-16", true, null, "00 31 00 20 00 49");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test4UTF16NoBomLittleEndianChars() throws Exception
|
||||
{
|
||||
// 4. No BOM + chars appear to be LE (as second byte read tends to be a zero)
|
||||
// Expected with UTF-16LE
|
||||
transformTextAndCheck("UTF-16", false, null, "31 00 20 00 49 00");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test5UTF16BigEndianBomLittleEndianChars() throws Exception
|
||||
{
|
||||
// 5. BOM indicates BE (fe then ff) + chars appear to be LE (as second byte read tends to be a zero)
|
||||
// SOMETHING IS WRONG, BUT USE LE!!!!
|
||||
transformTextAndCheck("UTF-16", false, false, "fe ff 31 00 20 00 49 00");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test6UTF16LittleEndianBomBigEndianChars() throws Exception
|
||||
{
|
||||
// 6. BOM indicates LE (ff then fe) + chars appear to be BE (as first byte read tends to be a zero)
|
||||
// SOMETHING IS WRONG, BUT USE BE!!!!
|
||||
transformTextAndCheck("UTF-16", true, false, "ff fe 00 31 00 20 00 49");
|
||||
}
|
||||
|
||||
/**
|
||||
* @param encoding to be used to read the source file
|
||||
* @param bigEndian indicates that the file should contain big endian characters, so typically the first byte of
|
||||
* each char is a zero when using English.
|
||||
* @param validBom if not null, the BOM is included. If true it is the one matching bigEndian. If false it is the
|
||||
* opposite byte order, which really is an error, but we try to recover from it.
|
||||
* @param expectedByteOrder The first few bytes of the source file so we can check the test data has been
|
||||
* correctly created.
|
||||
*/
|
||||
protected void transformTextAndCheck(String encoding, boolean bigEndian, Boolean validBom,
|
||||
String expectedByteOrder) throws Exception
|
||||
{
|
||||
transformTextAndCheckImpl(-1, encoding, bigEndian, validBom, expectedByteOrder);
|
||||
}
|
||||
|
||||
protected void transformTextAndCheckPageLength(int pageLimit) throws Exception
|
||||
{
|
||||
transformTextAndCheckImpl(pageLimit, "UTF-8", null, null, null);
|
||||
}
|
||||
|
||||
private void transformTextAndCheckImpl(int pageLimit, String encoding, Boolean bigEndian, Boolean validBom,
|
||||
String expectedByteOrder) throws Exception
|
||||
{
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String checkText = createTestText(pageLimit, sb);
|
||||
String text = sb.toString();
|
||||
|
||||
File sourceFile = File.createTempFile("AlfrescoTestSource_", ".txt");
|
||||
writeToFile(sourceFile, text, encoding, bigEndian, validBom);
|
||||
checkFileBytes(sourceFile, expectedByteOrder);
|
||||
|
||||
transformTextAndCheck(sourceFile, encoding, checkText, String.valueOf(pageLimit));
|
||||
}
|
||||
|
||||
private String createTestText(int pageLimit, StringBuilder sb)
|
||||
{
|
||||
int pageLength = 32;
|
||||
int lines = (pageLength + 10) * ((pageLimit > 0) ? pageLimit : 1);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String checkText = null;
|
||||
int cutoff = pageLimit * pageLength;
|
||||
for (int i = 1; i <= lines; i++)
|
||||
{
|
||||
sb.append(i);
|
||||
sb.append(Integer.toString(i));
|
||||
sb.append(" I must not talk in class or feed my homework to my cat.\n");
|
||||
if (i == cutoff)
|
||||
{
|
||||
checkText = sb.toString();
|
||||
}
|
||||
}
|
||||
sb.append("\nBart\n");
|
||||
|
||||
String text = sb.toString();
|
||||
checkText = (checkText == null) ? clean(text) : clean(checkText);
|
||||
transformTextAndCheck(text, "UTF-8", checkText, String.valueOf(pageLimit));
|
||||
checkText = checkText == null ? clean(text) : clean(checkText);
|
||||
|
||||
return checkText;
|
||||
}
|
||||
|
||||
private void transformTextAndCheck(String text, String encoding, String checkText,
|
||||
private void transformTextAndCheck(File sourceFile, String encoding, String checkText,
|
||||
String pageLimit) throws Exception
|
||||
{
|
||||
// Get a reader for the text
|
||||
File sourceFile = File.createTempFile("AlfrescoTestSource_", ".txt");
|
||||
writeToFile(sourceFile, text, encoding);
|
||||
|
||||
// And a temp writer
|
||||
File targetFile = File.createTempFile("AlfrescoTestTarget_", ".pdf");
|
||||
|
||||
// Transform to PDF
|
||||
Map<String, String> parameters = new HashMap<>();
|
||||
parameters.put(PAGE_LIMIT, pageLimit);
|
||||
parameters.put(SOURCE_ENCODING, encoding);
|
||||
transformer.transform("text/plain", "application/pdf", parameters, sourceFile, targetFile);
|
||||
|
||||
// Read back in the PDF and check it
|
||||
@@ -138,11 +230,123 @@ public class TextToPdfContentTransformerTest
|
||||
return text;
|
||||
}
|
||||
|
||||
private void writeToFile(File file, String content, String encoding) throws Exception
|
||||
private void writeToFile(File file, String content, String encoding, Boolean bigEndian, Boolean validBom) throws Exception
|
||||
{
|
||||
// If we may have to change the endian or include/exclude the BOM, write initially to a tmp file using
|
||||
// UTF-16 which includes the BOM FEFF.
|
||||
File originalFile = file;
|
||||
if (bigEndian != null)
|
||||
{
|
||||
file = File.createTempFile("AlfrescoTestTmpSrc_", ".txt");
|
||||
encoding = "UTF-16";
|
||||
}
|
||||
|
||||
// Use a writer to use the required encoding
|
||||
try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
|
||||
{
|
||||
ow.append(content);
|
||||
}
|
||||
|
||||
// If we may have to change the endian or include/exclude the BOM, copy the raw bytes to the supplied file
|
||||
if (bigEndian != null)
|
||||
{
|
||||
boolean firstRead = true;
|
||||
byte[] bytes = new byte[8192];
|
||||
try (InputStream is = new BufferedInputStream(new FileInputStream(file));
|
||||
OutputStream os = new BufferedOutputStream(new FileOutputStream(originalFile)))
|
||||
{
|
||||
int l;
|
||||
int off;
|
||||
boolean switchBytes = false;
|
||||
do
|
||||
{
|
||||
l = is.read(bytes);
|
||||
off = 0;
|
||||
// When we read the first block, change the offset if we don't want the BOM and also work out
|
||||
// if the byte endian need to be switch. The source bytes allways start with a standard BOM.
|
||||
if (firstRead)
|
||||
{
|
||||
firstRead = false;
|
||||
boolean actualEndianBytes = bytes[0] == (byte)0xfe; // if true [1] would also be 0xff
|
||||
switchBytes = actualEndianBytes != bigEndian;
|
||||
if (validBom == null)
|
||||
{
|
||||
// Strip the BOM
|
||||
off = 2;
|
||||
}
|
||||
else if (!validBom)
|
||||
{
|
||||
// Reverse the BOM so it does not match the characters!
|
||||
byte aByte = bytes[0];
|
||||
bytes[0] = bytes[1];
|
||||
bytes[1] = aByte;
|
||||
}
|
||||
}
|
||||
int len = l - off;
|
||||
if (len > 0)
|
||||
{
|
||||
if (switchBytes)
|
||||
{
|
||||
// Reverse the byte order of characters including the BOM.
|
||||
for (int i=0; i<l; i+=2)
|
||||
{
|
||||
byte aByte = bytes[i];
|
||||
bytes[i] = bytes[i+1];
|
||||
bytes[i+1] = aByte;
|
||||
}
|
||||
}
|
||||
os.write(bytes, off, len-off);
|
||||
}
|
||||
} while (l != -1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check the first few bytes in the source file match what we planed to use later as test data.
|
||||
*/
|
||||
private void checkFileBytes(File sourceFile, String expectedByteOrder) throws Exception
|
||||
{
|
||||
if (expectedByteOrder != null)
|
||||
{
|
||||
byte[] expectedBytes = hexToBytes(expectedByteOrder); // new BigInteger(expectedByteOrder,16).toByteArray();
|
||||
int l = expectedBytes.length;
|
||||
byte[] actualBytes = new byte[l];
|
||||
|
||||
FileInputStream is = new FileInputStream(sourceFile);
|
||||
is.read(actualBytes, 0, l);
|
||||
String actualByteOrder = bytesToHex(actualBytes);
|
||||
assertEquals(expectedByteOrder, actualByteOrder, "The sourceFile does not contain the expected bytes");
|
||||
}
|
||||
}
|
||||
|
||||
private byte[] hexToBytes(String hexString)
|
||||
{
|
||||
hexString = hexString.replaceAll(" *", "");
|
||||
int len = hexString.length() / 2;
|
||||
byte[] bytes = new byte[len];
|
||||
for (int j=0, i=0; i<len; i++)
|
||||
{
|
||||
int firstDigit = Character.digit(hexString.charAt(j++), 16);
|
||||
int secondDigit = Character.digit(hexString.charAt(j++), 16);
|
||||
bytes[i] = (byte)((firstDigit << 4) + secondDigit);
|
||||
}
|
||||
return bytes;
|
||||
}
|
||||
|
||||
private String bytesToHex(byte[] bytes)
|
||||
{
|
||||
StringBuffer sb = new StringBuffer();
|
||||
int len = bytes.length;
|
||||
for (int i=0; i<len; i++)
|
||||
{
|
||||
if (sb.length() > 0)
|
||||
{
|
||||
sb.append(' ');
|
||||
}
|
||||
sb.append(Character.forDigit((bytes[i] >> 4) & 0xF, 16));
|
||||
sb.append(Character.forDigit((bytes[i] & 0xF), 16));
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user