diff --git a/config/quick/quick.htmlChars.eml b/config/quick/quick.htmlChars.eml new file mode 100644 index 0000000000..2019001979 --- /dev/null +++ b/config/quick/quick.htmlChars.eml @@ -0,0 +1,28 @@ +MIME-Version: 1.0 +Received: by 10.000.0.000 with HTTP; Thu, 16 Aug 2012 08:13:29 -0700 (PDT) +Date: Thu, 16 Aug 2012 16:13:29 +0100 +Delivered-To: jane.doe@alfresco.com +Message-ID: +Subject: Attachment test +From: +To: +Content-Type: multipart/alternative; + boundary="----=_NextPart_000_0005_01D06C6A.DBA98EC0" + +This is a multipart message in MIME format. + +------=_NextPart_000_0005_01D06C6A.DBA98EC0 +Content-Type: text/plain; + charset="utf-8" +Content-Transfer-Encoding: 7bit + +html special characters + +------=_NextPart_000_0005_01D06C6A.DBA98EC0 +Content-Type: text/html; + charset="utf-8" +Content-Transfer-Encoding: quoted-printable + +
html special characters
+ +------=_NextPart_000_0005_01D06C6A.DBA98EC0-- diff --git a/source/java/org/alfresco/repo/content/transform/EMLTransformer.java b/source/java/org/alfresco/repo/content/transform/EMLTransformer.java index e03494b54c..ff280d9f5c 100644 --- a/source/java/org/alfresco/repo/content/transform/EMLTransformer.java +++ b/source/java/org/alfresco/repo/content/transform/EMLTransformer.java @@ -19,11 +19,11 @@ package org.alfresco.repo.content.transform; +import java.io.File; + import java.io.IOException; import java.io.InputStream; import java.util.Properties; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import javax.mail.MessagingException; import javax.mail.Multipart; @@ -32,9 +32,11 @@ import javax.mail.Session; import javax.mail.internet.MimeMessage; import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.repo.content.filestore.FileContentWriter; import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.TransformationOptions; +import org.alfresco.util.TempFileProvider; /** @@ -48,9 +50,8 @@ import org.alfresco.service.cmr.repository.TransformationOptions; public class EMLTransformer extends AbstractContentTransformer2 { - private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>"); - private static final String BR_TAG_PATTERN = "<[bB][rR].?\\/?>"; - private static final String NEW_LINE_PATTERN = "\n"; + private static final String CHARSET = "charset"; + private static final String DEFAULT_ENCODING = "UTF-8"; @Override public boolean isTransformableMimetype(String sourceMimetype, String targetMimetype, TransformationOptions options) @@ -144,7 +145,6 @@ public class EMLTransformer extends AbstractContentTransformer2 /** * Finds the suitable part from an multipart/alternative and appends it's text content to StringBuilder sb - * Html parts have higher priority than text parts * * @param multipart * @param sb @@ -160,10 +160,10 @@ 
public class EMLTransformer extends AbstractContentTransformer2 if (part.getContentType().contains(MimetypeMap.MIMETYPE_TEXT_PLAIN)) { partToUse = part; + break; } else if (part.getContentType().contains(MimetypeMap.MIMETYPE_HTML)){ partToUse = part; - break; } } if (partToUse != null) @@ -186,7 +186,7 @@ public class EMLTransformer extends AbstractContentTransformer2 boolean isAttachment = Part.ATTACHMENT.equalsIgnoreCase(part.getDisposition()); if (isAttachment) { - return; + return; } if (part.getContentType().contains(MimetypeMap.MIMETYPE_TEXT_PLAIN)) { @@ -194,14 +194,57 @@ public class EMLTransformer extends AbstractContentTransformer2 } else if (part.getContentType().contains(MimetypeMap.MIMETYPE_HTML)) { - String content = part.getContent().toString(); - //replace line breaks with new lines - content = content.replaceAll(BR_TAG_PATTERN, NEW_LINE_PATTERN); - Matcher tagMatcher = TAG_PATTERN.matcher(content); - //remove html tags - content = tagMatcher.replaceAll(""); - sb.append(content); + String mailPartContent = part.getContent().toString(); + + //create a temporary html file with same mail part content and encoding + File tempHtmlFile = TempFileProvider.createTempFile("EMLTransformer_", ".html"); + try + { + ContentWriter contentWriter = new FileContentWriter(tempHtmlFile); + contentWriter.setEncoding(getMailPartContentEncoding(part)); + contentWriter.setMimetype(MimetypeMap.MIMETYPE_HTML); + contentWriter.putContent(mailPartContent); + + //transform html file's content to plain text + EncodingAwareStringBean extractor = new EncodingAwareStringBean(); + extractor.setCollapse(false); + extractor.setLinks(false); + extractor.setReplaceNonBreakingSpaces(false); + extractor.setURL(tempHtmlFile, contentWriter.getEncoding()); + sb.append(extractor.getStrings()); + } + finally + { + //always delete the temporary file, even if writing or parsing fails + tempHtmlFile.delete(); + } } } + + /** + * Reads the charset parameter from the Content-Type header of a mail part. + * + * @param part the mail part whose Content-Type header is inspected + * @return the declared charset without quotes, or UTF-8 when no charset parameter is found + * @throws MessagingException if the content type of the part cannot be read + */ + private String getMailPartContentEncoding(Part part) throws MessagingException + { + String encoding = DEFAULT_ENCODING; + String contentType = part.getContentType(); + int 
startIndex = contentType.indexOf(CHARSET); + if (startIndex > 0) + { + encoding = contentType.substring(startIndex + CHARSET.length() + 1); + //the charset may be followed by further parameters, e.g. charset="utf-8"; format=flowed + int endIndex = encoding.indexOf(';'); + if (endIndex >= 0) + { + encoding = encoding.substring(0, endIndex); + } + encoding = encoding.replaceAll("\"", "").trim(); + } + return encoding; + } } diff --git a/source/java/org/alfresco/repo/content/transform/EncodingAwareStringBean.java b/source/java/org/alfresco/repo/content/transform/EncodingAwareStringBean.java new file mode 100644 index 0000000000..c05a531fea --- /dev/null +++ b/source/java/org/alfresco/repo/content/transform/EncodingAwareStringBean.java @@ -0,0 +1,69 @@ +package org.alfresco.repo.content.transform; + +import java.io.File; +import java.net.URLConnection; + +import org.htmlparser.Parser; +import org.htmlparser.beans.StringBean; +import org.htmlparser.util.ParserException; + +/** + * A version of {@link StringBean} which allows control of the + * encoding in the underlying HTML Parser. + * Unfortunately, StringBean doesn't allow easy over-riding of + * this, so we have to duplicate some code to control this. + * This allows us to correctly handle HTML files where the encoding + * is specified against the content property (rather than in the + * HTML Head Meta), see ALF-10466 for details. + */ +class EncodingAwareStringBean extends StringBean +{ + private static final long serialVersionUID = -9033414360428669553L; + + /** + * Sets the File to extract strings from, and the encoding + * it's in (if known to Alfresco) + * + * @param file The File that text should be fetched from. 
+ * @param encoding The encoding of the input + */ + public void setURL(File file, String encoding) + { + String previousURL = getURL(); + String newURL = file.getAbsolutePath(); + + if ( (previousURL == null) || (!newURL.equals(previousURL)) ) + { + try + { + URLConnection conn = getConnection(); + + if (null == mParser) + { + mParser = new Parser(newURL); + } + else + { + mParser.setURL(newURL); + } + + if (encoding != null) + { + mParser.setEncoding(encoding); + } + + mPropertySupport.firePropertyChange(PROP_URL_PROPERTY, previousURL, getURL()); + mPropertySupport.firePropertyChange(PROP_CONNECTION_PROPERTY, conn, mParser.getConnection()); + setStrings(); + } + catch (ParserException pe) + { + updateStrings(pe.toString()); + } + } + } + + public String getEncoding(){ + return mParser.getEncoding(); + } +} \ No newline at end of file diff --git a/source/java/org/alfresco/repo/content/transform/HtmlParserContentTransformer.java b/source/java/org/alfresco/repo/content/transform/HtmlParserContentTransformer.java index 2e7e7c81bb..42ed40aa66 100644 --- a/source/java/org/alfresco/repo/content/transform/HtmlParserContentTransformer.java +++ b/source/java/org/alfresco/repo/content/transform/HtmlParserContentTransformer.java @@ -19,8 +19,6 @@ package org.alfresco.repo.content.transform; import java.io.File; -import java.net.URLConnection; -import java.util.Arrays; import org.alfresco.repo.content.MimetypeMap; import org.alfresco.service.cmr.repository.ContentReader; @@ -29,9 +27,7 @@ import org.alfresco.service.cmr.repository.TransformationOptions; import org.alfresco.util.TempFileProvider; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.htmlparser.Parser; -import org.htmlparser.beans.StringBean; -import org.htmlparser.util.ParserException; + /** * Content transformer which wraps the HTML Parser library for @@ -110,61 +106,4 @@ public class HtmlParserContentTransformer extends AbstractContentTransformer2 // Tidy up 
htmlFile.delete(); } - - /** - * A version of {@link StringBean} which allows control of the - * encoding in the underlying HTML Parser. - * Unfortunately, StringBean doesn't allow easy over-riding of - * this, so we have to duplicate some code to control this. - * This allows us to correctly handle HTML files where the encoding - * is specified against the content property (rather than in the - * HTML Head Meta), see ALF-10466 for details. - */ - class EncodingAwareStringBean extends StringBean - { - private static final long serialVersionUID = -9033414360428669553L; - - /** - * Sets the File to extract strings from, and the encoding - * it's in (if known to Alfresco) - * - * @param file The File that text should be fetched from. - * @param encoding The encoding of the input - */ - public void setURL(File file, String encoding) - { - String previousURL = getURL(); - String newURL = file.getAbsolutePath(); - - if ( (previousURL == null) || (!newURL.equals(previousURL)) ) - { - try - { - URLConnection conn = getConnection(); - - if (null == mParser) - { - mParser = new Parser(newURL); - } - else - { - mParser.setURL(newURL); - } - - if (encoding != null) - { - mParser.setEncoding(encoding); - } - - mPropertySupport.firePropertyChange(PROP_URL_PROPERTY, previousURL, getURL()); - mPropertySupport.firePropertyChange(PROP_CONNECTION_PROPERTY, conn, mParser.getConnection()); - setStrings(); - } - catch (ParserException pe) - { - updateStrings(pe.toString()); - } - } - } - } } diff --git a/source/test-java/org/alfresco/repo/content/transform/EMLTransformerTest.java b/source/test-java/org/alfresco/repo/content/transform/EMLTransformerTest.java index 04803b1b08..742618336b 100644 --- a/source/test-java/org/alfresco/repo/content/transform/EMLTransformerTest.java +++ b/source/test-java/org/alfresco/repo/content/transform/EMLTransformerTest.java @@ -44,7 +44,9 @@ public class EMLTransformerTest extends AbstractContentTransformerTest private static final String 
QUICK_EML_ATTACHMENT_CONTENT = "File attachment content"; - private static final String QUICK_EML_ALTERNATIVE_CONTENT = "alternative html text"; + private static final String QUICK_EML_ALTERNATIVE_CONTENT = "alternative plain text"; + + private static final String HTML_SPACE_SPECIAL_CHAR = " "; private EMLTransformer transformer; @@ -113,7 +115,7 @@ public class EMLTransformerTest extends AbstractContentTransformerTest } /** - * Test transforming a valid eml with an attachment to text; attachment should be ingnored + * Test transforming a valid eml with an attachment to text; attachment should be ignored */ public void testRFC822WithAttachmentToText() throws Exception { @@ -152,4 +154,24 @@ public class EMLTransformerTest extends AbstractContentTransformerTest String contentStr = reader2.getContentString(); assertTrue(contentStr.contains(QUICK_EML_ALTERNATIVE_CONTENT)); } + + /** + * Test transforming a valid eml with a html part containing html special characters to text + */ + public void testHtmlSpecialCharsToText() throws Exception + { + File emlSourceFile = loadQuickTestFile("htmlChars.eml"); + File txtTargetFile = TempFileProvider.createTempFile("test5", ".txt"); + ContentReader reader = new FileContentReader(emlSourceFile); + reader.setMimetype(MimetypeMap.MIMETYPE_RFC822); + ContentWriter writer = new FileContentWriter(txtTargetFile); + writer.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN); + + transformer.transform(reader, writer); + + ContentReader reader2 = new FileContentReader(txtTargetFile); + reader2.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN); + String contentStr = reader2.getContentString(); + assertTrue(!contentStr.contains(HTML_SPACE_SPECIAL_CHAR)); + } }