mirror of
https://github.com/Alfresco/alfresco-transform-core.git
synced 2025-09-17 14:21:18 +00:00
MNT-20626 XML files having UTF-16LE and UTF-16BE can't be previewed (#331)
* Introduce more flexible reading of UTF-16 data, where there may be a BOM, but the spec says there should not be one, or the BOM is clearly wrong when looking at the following characters. The https://en.wikipedia.org/wiki/UTF-16 write-up is nice and clear. * Includes identical correction in data setup in AIOTransformRegistryTest and MiscControllerTest for a problem found in TextToPdfContentTransformerTest. * Includes upgrade to latest pdfbox: 2.0.22
This commit is contained in:
@@ -2,7 +2,7 @@
|
|||||||
* #%L
|
* #%L
|
||||||
* Alfresco Transform Core
|
* Alfresco Transform Core
|
||||||
* %%
|
* %%
|
||||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||||
* %%
|
* %%
|
||||||
* This file is part of the Alfresco software.
|
* This file is part of the Alfresco software.
|
||||||
* -
|
* -
|
||||||
@@ -280,7 +280,7 @@ public class AIOTransformRegistryTest
|
|||||||
int cutoff = pageLimit * pageLength;
|
int cutoff = pageLimit * pageLength;
|
||||||
for (int i = 1; i <= lines; i++)
|
for (int i = 1; i <= lines; i++)
|
||||||
{
|
{
|
||||||
sb.append(i);
|
sb.append(Integer.toString(i));
|
||||||
sb.append(" I must not talk in class or feed my homework to my cat.\n");
|
sb.append(" I must not talk in class or feed my homework to my cat.\n");
|
||||||
if (i == cutoff)
|
if (i == cutoff)
|
||||||
checkText = sb.toString();
|
checkText = sb.toString();
|
||||||
|
@@ -408,7 +408,7 @@ public class MiscControllerTest extends AbstractTransformerControllerTest
|
|||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
for (int i = 1; i <= 5; i++)
|
for (int i = 1; i <= 5; i++)
|
||||||
{
|
{
|
||||||
sb.append(i);
|
sb.append(Integer.toString(i));
|
||||||
sb.append(" I must not talk in class or feed my homework to my cat.\n");
|
sb.append(" I must not talk in class or feed my homework to my cat.\n");
|
||||||
}
|
}
|
||||||
sb.append("\nBart\n");
|
sb.append("\nBart\n");
|
||||||
|
@@ -2,7 +2,7 @@
|
|||||||
* #%L
|
* #%L
|
||||||
* Alfresco Transform Core
|
* Alfresco Transform Core
|
||||||
* %%
|
* %%
|
||||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||||
* %%
|
* %%
|
||||||
* This file is part of the Alfresco software.
|
* This file is part of the Alfresco software.
|
||||||
* -
|
* -
|
||||||
@@ -44,6 +44,7 @@ import java.io.IOException;
|
|||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
|
import java.io.PushbackInputStream;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
@@ -63,6 +64,10 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
|||||||
{
|
{
|
||||||
private static final Logger logger = LoggerFactory.getLogger(TextToPdfContentTransformer.class);
|
private static final Logger logger = LoggerFactory.getLogger(TextToPdfContentTransformer.class);
|
||||||
|
|
||||||
|
private static final int UTF16_READ_AHEAD_BYTES = 16; // 8 characters including BOM if it exists
|
||||||
|
private static final byte FE = (byte) 0xFE;
|
||||||
|
private static final byte FF = (byte) 0xFF;
|
||||||
|
|
||||||
public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT;
|
public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT;
|
||||||
|
|
||||||
private final PagedTextToPDF transformer;
|
private final PagedTextToPDF transformer;
|
||||||
@@ -146,7 +151,129 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
|||||||
}
|
}
|
||||||
if (charset != null)
|
if (charset != null)
|
||||||
{
|
{
|
||||||
logger.debug("Processing plain text in encoding " + charset.displayName());
|
// Handles the situation where there is a BOM even though the encoding indicates that normally
|
||||||
|
// there should not be one for UTF-16BE and UTF-16LE. For extra flexibility includes UTF-16 too
|
||||||
|
// which optionally has the BOM. Rather than look at the BOM we look at the number of zero bytes
|
||||||
|
// in the first few characters. XML files, even when not in European languages, tend to have more
|
||||||
|
// even zero bytes when big-endian encoded and more odd zero bytes when little-endian.
|
||||||
|
// Think of: <?xml version="1.0"?> The normal Java decoder does not have this flexibility but
|
||||||
|
// other transformers do.
|
||||||
|
String name = charset.displayName();
|
||||||
|
if ("UTF-16".equals(name) || "UTF-16BE".equals(name) || "UTF-16LE".equals(name))
|
||||||
|
{
|
||||||
|
logger.debug("Handle big and little endian UTF-16 text. Using UTF-16 rather than encoding " + name);
|
||||||
|
charset = Charset.forName("UTF-16");
|
||||||
|
is = new PushbackInputStream(is, UTF16_READ_AHEAD_BYTES)
|
||||||
|
{
|
||||||
|
boolean bomRead;
|
||||||
|
boolean switchByteOrder;
|
||||||
|
boolean evenByte = true;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int read(byte[] bytes, int off, int len) throws IOException
|
||||||
|
{
|
||||||
|
int i = 0;
|
||||||
|
int b = 0;
|
||||||
|
for (; i<len; i++)
|
||||||
|
{
|
||||||
|
b = read();
|
||||||
|
if (b == -1)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
bytes[off+i] = (byte)b;
|
||||||
|
}
|
||||||
|
return i == 0 && b == -1 ? -1 : i;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int read() throws IOException
|
||||||
|
{
|
||||||
|
if (!bomRead)
|
||||||
|
{
|
||||||
|
bomRead = true;
|
||||||
|
boolean switchBom = false;
|
||||||
|
byte[] bytes = new byte[UTF16_READ_AHEAD_BYTES];
|
||||||
|
int end = in.read(bytes, 0, UTF16_READ_AHEAD_BYTES);
|
||||||
|
int evenZeros = countZeros(bytes, 0);
|
||||||
|
int oddZeros = countZeros(bytes, 1);
|
||||||
|
if (evenZeros > oddZeros)
|
||||||
|
{
|
||||||
|
if (bytes[0] == FF && bytes[1] == FE)
|
||||||
|
{
|
||||||
|
switchByteOrder = true;
|
||||||
|
switchBom = true;
|
||||||
|
logger.warn("Little-endian BOM FFFE read, but characters are big-endian");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
logger.debug("More even zero bytes, so normal read for big-endian");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (bytes[0] == FE && bytes[1] == FF)
|
||||||
|
{
|
||||||
|
switchBom = true;
|
||||||
|
logger.debug("Big-endian BOM FEFF read, but characters are little-endian");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
switchByteOrder = true;
|
||||||
|
logger.debug("More odd zero bytes, so switch bytes from little-endian");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (switchBom)
|
||||||
|
{
|
||||||
|
byte b = bytes[0];
|
||||||
|
bytes[0] = bytes[1];
|
||||||
|
bytes[1] = b;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = end-1; i>=0; i--)
|
||||||
|
{
|
||||||
|
unread(bytes[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (switchByteOrder)
|
||||||
|
{
|
||||||
|
if (evenByte)
|
||||||
|
{
|
||||||
|
int b1 = super.read();
|
||||||
|
int b2 = super.read();
|
||||||
|
if (b1 != -1)
|
||||||
|
{
|
||||||
|
unread(b1);
|
||||||
|
}
|
||||||
|
if (b2 != -1)
|
||||||
|
{
|
||||||
|
unread(b2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
evenByte = !evenByte;
|
||||||
|
}
|
||||||
|
|
||||||
|
return super.read();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Counts the number of even or odd 00 bytes
|
||||||
|
private int countZeros(byte[] b, int offset)
|
||||||
|
{
|
||||||
|
int count = 0;
|
||||||
|
for (int i=offset; i<UTF16_READ_AHEAD_BYTES; i+=2)
|
||||||
|
{
|
||||||
|
if (b[i] == 0)
|
||||||
|
{
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
logger.debug("Processing plain text in encoding " + name);
|
||||||
return new InputStreamReader(is, charset);
|
return new InputStreamReader(is, charset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -196,7 +323,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
|||||||
public PDDocument createPDFFromText(Reader text, int pageLimit)
|
public PDDocument createPDFFromText(Reader text, int pageLimit)
|
||||||
throws IOException
|
throws IOException
|
||||||
{
|
{
|
||||||
//int pageLimit = (int)pageLimits.getValue();
|
|
||||||
PDDocument doc = null;
|
PDDocument doc = null;
|
||||||
int pageCount = 0;
|
int pageCount = 0;
|
||||||
try
|
try
|
||||||
@@ -207,7 +333,7 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
|||||||
//calculate font height and increase by 5 percent.
|
//calculate font height and increase by 5 percent.
|
||||||
height = height * getFontSize() * 1.05f;
|
height = height * getFontSize() * 1.05f;
|
||||||
doc = new PDDocument();
|
doc = new PDDocument();
|
||||||
BufferedReader data = new BufferedReader(text);
|
BufferedReader data = (text instanceof BufferedReader) ? (BufferedReader) text : new BufferedReader(text);
|
||||||
String nextLine;
|
String nextLine;
|
||||||
PDPage page = new PDPage();
|
PDPage page = new PDPage();
|
||||||
PDPageContentStream contentStream = null;
|
PDPageContentStream contentStream = null;
|
||||||
@@ -220,7 +346,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
|||||||
outer:
|
outer:
|
||||||
while ((nextLine = data.readLine()) != null)
|
while ((nextLine = data.readLine()) != null)
|
||||||
{
|
{
|
||||||
|
|
||||||
// The input text is nonEmpty. New pages will be created and added
|
// The input text is nonEmpty. New pages will be created and added
|
||||||
// to the PDF document as they are needed, depending on the length of
|
// to the PDF document as they are needed, depending on the length of
|
||||||
// the text.
|
// the text.
|
||||||
@@ -252,8 +377,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
|||||||
int test = pageCount + 1;
|
int test = pageCount + 1;
|
||||||
if (pageLimit > 0 && (pageCount++ >= pageLimit))
|
if (pageLimit > 0 && (pageCount++ >= pageLimit))
|
||||||
{
|
{
|
||||||
// pageLimits.getAction().throwIOExceptionIfRequired("Page limit ("+pageLimit+
|
|
||||||
// ") reached.", transformerDebug);
|
|
||||||
break outer;
|
break outer;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -272,7 +395,6 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
|||||||
y = page.getMediaBox().getHeight() - margin + height;
|
y = page.getMediaBox().getHeight() - margin + height;
|
||||||
contentStream.moveTextPositionByAmount(margin, y);
|
contentStream.moveTextPositionByAmount(margin, y);
|
||||||
}
|
}
|
||||||
//System.out.println( "Drawing string at " + x + "," + y );
|
|
||||||
|
|
||||||
if (contentStream == null)
|
if (contentStream == null)
|
||||||
{
|
{
|
||||||
|
@@ -2,7 +2,7 @@
|
|||||||
* #%L
|
* #%L
|
||||||
* Alfresco Transform Core
|
* Alfresco Transform Core
|
||||||
* %%
|
* %%
|
||||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||||
* %%
|
* %%
|
||||||
* This file is part of the Alfresco software.
|
* This file is part of the Alfresco software.
|
||||||
* -
|
* -
|
||||||
@@ -31,14 +31,20 @@ import org.apache.pdfbox.text.PDFTextStripper;
|
|||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.io.BufferedInputStream;
|
||||||
|
import java.io.BufferedOutputStream;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.OutputStream;
|
||||||
import java.io.OutputStreamWriter;
|
import java.io.OutputStreamWriter;
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import static org.alfresco.transformer.util.RequestParamMap.PAGE_LIMIT;
|
import static org.alfresco.transformer.util.RequestParamMap.PAGE_LIMIT;
|
||||||
|
import static org.alfresco.transformer.util.RequestParamMap.SOURCE_ENCODING;
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
public class TextToPdfContentTransformerTest
|
public class TextToPdfContentTransformerTest
|
||||||
@@ -76,39 +82,125 @@ public class TextToPdfContentTransformerTest
|
|||||||
transformTextAndCheckPageLength(50);
|
transformTextAndCheckPageLength(50);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void transformTextAndCheckPageLength(int pageLimit) throws Exception
|
@Test
|
||||||
|
public void test1UTF16BigEndianBomBigEndianChars() throws Exception
|
||||||
|
{
|
||||||
|
// 1. BOM indicates BE (fe then ff) + chars appear to be BE (as first byte read tends to be a zero)
|
||||||
|
// Expected with UTF-16. Some systems use BE and others, like Windows and Mac, use LE
|
||||||
|
String expectedByteOrder = "fe ff 00 31 00 20 00 49";
|
||||||
|
transformTextAndCheck("UTF-16", true, true, expectedByteOrder);
|
||||||
|
transformTextAndCheck("UTF-16", true, true, expectedByteOrder);
|
||||||
|
transformTextAndCheck("UTF-16BE", true, true, expectedByteOrder);
|
||||||
|
transformTextAndCheck("UTF-16LE", true, true, expectedByteOrder);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test2UTF16LittleEndianBomLittleEndianChars() throws Exception
|
||||||
|
{
|
||||||
|
// 2. BOM indicates LE (ff then fe) + chars appear to be LE (as second byte read tends to be a zero)
|
||||||
|
// Expected with UTF-16. Some systems use BE and others, like Windows and Mac, use LE
|
||||||
|
transformTextAndCheck("UTF-16", false, true, "ff fe 31 00 20 00 49 00");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test3UTF16NoBomBigEndianChars() throws Exception
|
||||||
|
{
|
||||||
|
// 3. No BOM + chars appear to be BE (as first byte read tends to be a zero)
|
||||||
|
// Expected with UTF-16BE
|
||||||
|
transformTextAndCheck("UTF-16", true, null, "00 31 00 20 00 49");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test4UTF16NoBomLittleEndianChars() throws Exception
|
||||||
|
{
|
||||||
|
// 4. No BOM + chars appear to be LE (as second byte read tends to be a zero)
|
||||||
|
// Expected with UTF-16LE
|
||||||
|
transformTextAndCheck("UTF-16", false, null, "31 00 20 00 49 00");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test5UTF16BigEndianBomLittleEndianChars() throws Exception
|
||||||
|
{
|
||||||
|
// 5. BOM indicates BE (fe then ff) + chars appear to be LE (as second byte read tends to be a zero)
|
||||||
|
// SOMETHING IS WRONG, BUT USE LE!!!!
|
||||||
|
transformTextAndCheck("UTF-16", false, false, "fe ff 31 00 20 00 49 00");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test6UTF16LittleEndianBomBigEndianChars() throws Exception
|
||||||
|
{
|
||||||
|
// 6. BOM indicates LE (ff then fe) + chars appear to be BE (as first byte read tends to be a zero)
|
||||||
|
// SOMETHING IS WRONG, BUT USE BE!!!!
|
||||||
|
transformTextAndCheck("UTF-16", true, false, "ff fe 00 31 00 20 00 49");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param encoding to be used to read the source file
|
||||||
|
* @param bigEndian indicates that the file should contain big endian characters, so typically the first byte of
|
||||||
|
* each char is a zero when using English.
|
||||||
|
* @param validBom if not null, the BOM is included. If true it is the one matching bigEndian. If false it is the
|
||||||
|
* opposite byte order, which really is an error, but we try to recover from it.
|
||||||
|
* @param expectedByteOrder The first few bytes of the source file so we can check the test data has been
|
||||||
|
* correctly created.
|
||||||
|
*/
|
||||||
|
protected void transformTextAndCheck(String encoding, boolean bigEndian, Boolean validBom,
|
||||||
|
String expectedByteOrder) throws Exception
|
||||||
|
{
|
||||||
|
transformTextAndCheckImpl(-1, encoding, bigEndian, validBom, expectedByteOrder);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void transformTextAndCheckPageLength(int pageLimit) throws Exception
|
||||||
|
{
|
||||||
|
transformTextAndCheckImpl(pageLimit, "UTF-8", null, null, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void transformTextAndCheckImpl(int pageLimit, String encoding, Boolean bigEndian, Boolean validBom,
|
||||||
|
String expectedByteOrder) throws Exception
|
||||||
|
{
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
String checkText = createTestText(pageLimit, sb);
|
||||||
|
String text = sb.toString();
|
||||||
|
|
||||||
|
File sourceFile = File.createTempFile("AlfrescoTestSource_", ".txt");
|
||||||
|
writeToFile(sourceFile, text, encoding, bigEndian, validBom);
|
||||||
|
checkFileBytes(sourceFile, expectedByteOrder);
|
||||||
|
|
||||||
|
transformTextAndCheck(sourceFile, encoding, checkText, String.valueOf(pageLimit));
|
||||||
|
}
|
||||||
|
|
||||||
|
private String createTestText(int pageLimit, StringBuilder sb)
|
||||||
{
|
{
|
||||||
int pageLength = 32;
|
int pageLength = 32;
|
||||||
int lines = (pageLength + 10) * ((pageLimit > 0) ? pageLimit : 1);
|
int lines = (pageLength + 10) * ((pageLimit > 0) ? pageLimit : 1);
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
String checkText = null;
|
String checkText = null;
|
||||||
int cutoff = pageLimit * pageLength;
|
int cutoff = pageLimit * pageLength;
|
||||||
for (int i = 1; i <= lines; i++)
|
for (int i = 1; i <= lines; i++)
|
||||||
{
|
{
|
||||||
sb.append(i);
|
sb.append(Integer.toString(i));
|
||||||
sb.append(" I must not talk in class or feed my homework to my cat.\n");
|
sb.append(" I must not talk in class or feed my homework to my cat.\n");
|
||||||
if (i == cutoff)
|
if (i == cutoff)
|
||||||
|
{
|
||||||
checkText = sb.toString();
|
checkText = sb.toString();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
sb.append("\nBart\n");
|
sb.append("\nBart\n");
|
||||||
|
|
||||||
String text = sb.toString();
|
String text = sb.toString();
|
||||||
checkText = (checkText == null) ? clean(text) : clean(checkText);
|
checkText = checkText == null ? clean(text) : clean(checkText);
|
||||||
transformTextAndCheck(text, "UTF-8", checkText, String.valueOf(pageLimit));
|
|
||||||
|
return checkText;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void transformTextAndCheck(String text, String encoding, String checkText,
|
private void transformTextAndCheck(File sourceFile, String encoding, String checkText,
|
||||||
String pageLimit) throws Exception
|
String pageLimit) throws Exception
|
||||||
{
|
{
|
||||||
// Get a reader for the text
|
|
||||||
File sourceFile = File.createTempFile("AlfrescoTestSource_", ".txt");
|
|
||||||
writeToFile(sourceFile, text, encoding);
|
|
||||||
|
|
||||||
// And a temp writer
|
// And a temp writer
|
||||||
File targetFile = File.createTempFile("AlfrescoTestTarget_", ".pdf");
|
File targetFile = File.createTempFile("AlfrescoTestTarget_", ".pdf");
|
||||||
|
|
||||||
// Transform to PDF
|
// Transform to PDF
|
||||||
Map<String, String> parameters = new HashMap<>();
|
Map<String, String> parameters = new HashMap<>();
|
||||||
parameters.put(PAGE_LIMIT, pageLimit);
|
parameters.put(PAGE_LIMIT, pageLimit);
|
||||||
|
parameters.put(SOURCE_ENCODING, encoding);
|
||||||
transformer.transform("text/plain", "application/pdf", parameters, sourceFile, targetFile);
|
transformer.transform("text/plain", "application/pdf", parameters, sourceFile, targetFile);
|
||||||
|
|
||||||
// Read back in the PDF and check it
|
// Read back in the PDF and check it
|
||||||
@@ -138,11 +230,123 @@ public class TextToPdfContentTransformerTest
|
|||||||
return text;
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void writeToFile(File file, String content, String encoding) throws Exception
|
private void writeToFile(File file, String content, String encoding, Boolean bigEndian, Boolean validBom) throws Exception
|
||||||
{
|
{
|
||||||
|
// If we may have to change the endian or include/exclude the BOM, write initially to a tmp file using
|
||||||
|
// UTF-16 which includes the BOM FEFF.
|
||||||
|
File originalFile = file;
|
||||||
|
if (bigEndian != null)
|
||||||
|
{
|
||||||
|
file = File.createTempFile("AlfrescoTestTmpSrc_", ".txt");
|
||||||
|
encoding = "UTF-16";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use a writer to use the required encoding
|
||||||
try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
|
try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
|
||||||
{
|
{
|
||||||
ow.append(content);
|
ow.append(content);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If we may have to change the endian or include/exclude the BOM, copy the raw bytes to the supplied file
|
||||||
|
if (bigEndian != null)
|
||||||
|
{
|
||||||
|
boolean firstRead = true;
|
||||||
|
byte[] bytes = new byte[8192];
|
||||||
|
try (InputStream is = new BufferedInputStream(new FileInputStream(file));
|
||||||
|
OutputStream os = new BufferedOutputStream(new FileOutputStream(originalFile)))
|
||||||
|
{
|
||||||
|
int l;
|
||||||
|
int off;
|
||||||
|
boolean switchBytes = false;
|
||||||
|
do
|
||||||
|
{
|
||||||
|
l = is.read(bytes);
|
||||||
|
off = 0;
|
||||||
|
// When we read the first block, change the offset if we don't want the BOM and also work out
|
||||||
|
// if the byte order needs to be switched. The source bytes always start with a standard BOM.
|
||||||
|
if (firstRead)
|
||||||
|
{
|
||||||
|
firstRead = false;
|
||||||
|
boolean actualEndianBytes = bytes[0] == (byte)0xfe; // if true [1] would also be 0xff
|
||||||
|
switchBytes = actualEndianBytes != bigEndian;
|
||||||
|
if (validBom == null)
|
||||||
|
{
|
||||||
|
// Strip the BOM
|
||||||
|
off = 2;
|
||||||
|
}
|
||||||
|
else if (!validBom)
|
||||||
|
{
|
||||||
|
// Reverse the BOM so it does not match the characters!
|
||||||
|
byte aByte = bytes[0];
|
||||||
|
bytes[0] = bytes[1];
|
||||||
|
bytes[1] = aByte;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int len = l - off;
|
||||||
|
if (len > 0)
|
||||||
|
{
|
||||||
|
if (switchBytes)
|
||||||
|
{
|
||||||
|
// Reverse the byte order of characters including the BOM.
|
||||||
|
for (int i=0; i<l; i+=2)
|
||||||
|
{
|
||||||
|
byte aByte = bytes[i];
|
||||||
|
bytes[i] = bytes[i+1];
|
||||||
|
bytes[i+1] = aByte;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
os.write(bytes, off, len-off);
|
||||||
|
}
|
||||||
|
} while (l != -1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check that the first few bytes in the source file match what we planned to use later as test data.
|
||||||
|
*/
|
||||||
|
private void checkFileBytes(File sourceFile, String expectedByteOrder) throws Exception
|
||||||
|
{
|
||||||
|
if (expectedByteOrder != null)
|
||||||
|
{
|
||||||
|
byte[] expectedBytes = hexToBytes(expectedByteOrder); // new BigInteger(expectedByteOrder,16).toByteArray();
|
||||||
|
int l = expectedBytes.length;
|
||||||
|
byte[] actualBytes = new byte[l];
|
||||||
|
|
||||||
|
FileInputStream is = new FileInputStream(sourceFile);
|
||||||
|
is.read(actualBytes, 0, l);
|
||||||
|
String actualByteOrder = bytesToHex(actualBytes);
|
||||||
|
assertEquals(expectedByteOrder, actualByteOrder, "The sourceFile does not contain the expected bytes");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] hexToBytes(String hexString)
|
||||||
|
{
|
||||||
|
hexString = hexString.replaceAll(" *", "");
|
||||||
|
int len = hexString.length() / 2;
|
||||||
|
byte[] bytes = new byte[len];
|
||||||
|
for (int j=0, i=0; i<len; i++)
|
||||||
|
{
|
||||||
|
int firstDigit = Character.digit(hexString.charAt(j++), 16);
|
||||||
|
int secondDigit = Character.digit(hexString.charAt(j++), 16);
|
||||||
|
bytes[i] = (byte)((firstDigit << 4) + secondDigit);
|
||||||
|
}
|
||||||
|
return bytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String bytesToHex(byte[] bytes)
|
||||||
|
{
|
||||||
|
StringBuffer sb = new StringBuffer();
|
||||||
|
int len = bytes.length;
|
||||||
|
for (int i=0; i<len; i++)
|
||||||
|
{
|
||||||
|
if (sb.length() > 0)
|
||||||
|
{
|
||||||
|
sb.append(' ');
|
||||||
|
}
|
||||||
|
sb.append(Character.forDigit((bytes[i] >> 4) & 0xF, 16));
|
||||||
|
sb.append(Character.forDigit((bytes[i] & 0xF), 16));
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user