diff --git a/config/quick/quick.htmlChars.eml b/config/quick/quick.htmlChars.eml
new file mode 100644
index 0000000000..2019001979
--- /dev/null
+++ b/config/quick/quick.htmlChars.eml
@@ -0,0 +1,28 @@
+MIME-Version: 1.0
+Received: by 10.000.0.000 with HTTP; Thu, 16 Aug 2012 08:13:29 -0700 (PDT)
+Date: Thu, 16 Aug 2012 16:13:29 +0100
+Delivered-To: jane.doe@alfresco.com
+Message-ID:
+Subject: Attachment test
+From:
+To:
+Content-Type: multipart/alternative;
+ boundary="----=_NextPart_000_0005_01D06C6A.DBA98EC0"
+
+This is a multipart message in MIME format.
+
+------=_NextPart_000_0005_01D06C6A.DBA98EC0
+Content-Type: text/plain;
+ charset="utf-8"
+Content-Transfer-Encoding: 7bit
+
+html special characters
+
+------=_NextPart_000_0005_01D06C6A.DBA98EC0
+Content-Type: text/html;
+ charset="utf-8"
+Content-Transfer-Encoding: quoted-printable
+
+html special characters
+
+------=_NextPart_000_0005_01D06C6A.DBA98EC0--
diff --git a/source/java/org/alfresco/repo/content/transform/EMLTransformer.java b/source/java/org/alfresco/repo/content/transform/EMLTransformer.java
index e03494b54c..ff280d9f5c 100644
--- a/source/java/org/alfresco/repo/content/transform/EMLTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/EMLTransformer.java
@@ -19,11 +19,11 @@
package org.alfresco.repo.content.transform;
+import java.io.File;
+
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
import javax.mail.MessagingException;
import javax.mail.Multipart;
@@ -32,9 +32,11 @@ import javax.mail.Session;
import javax.mail.internet.MimeMessage;
import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.repo.content.filestore.FileContentWriter;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
+import org.alfresco.util.TempFileProvider;
/**
@@ -48,9 +50,8 @@ import org.alfresco.service.cmr.repository.TransformationOptions;
public class EMLTransformer extends AbstractContentTransformer2
{
- private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");
- private static final String BR_TAG_PATTERN = "<[bB][rR].?\\/?>";
- private static final String NEW_LINE_PATTERN = "\n";
+ private static final String CHARSET = "charset";
+ private static final String DEFAULT_ENCODING = "UTF-8";
@Override
public boolean isTransformableMimetype(String sourceMimetype, String targetMimetype, TransformationOptions options)
@@ -144,7 +145,6 @@ public class EMLTransformer extends AbstractContentTransformer2
/**
* Finds the suitable part from an multipart/alternative and appends it's text content to StringBuilder sb
- * Html parts have higher priority than text parts
*
* @param multipart
* @param sb
@@ -160,10 +160,10 @@ public class EMLTransformer extends AbstractContentTransformer2
if (part.getContentType().contains(MimetypeMap.MIMETYPE_TEXT_PLAIN))
{
partToUse = part;
+ break;
}
else if (part.getContentType().contains(MimetypeMap.MIMETYPE_HTML)){
partToUse = part;
- break;
}
}
if (partToUse != null)
@@ -186,7 +186,7 @@ public class EMLTransformer extends AbstractContentTransformer2
boolean isAttachment = Part.ATTACHMENT.equalsIgnoreCase(part.getDisposition());
if (isAttachment)
{
- return;
+ return;
}
if (part.getContentType().contains(MimetypeMap.MIMETYPE_TEXT_PLAIN))
{
@@ -194,14 +194,37 @@ public class EMLTransformer extends AbstractContentTransformer2
}
else if (part.getContentType().contains(MimetypeMap.MIMETYPE_HTML))
{
- String content = part.getContent().toString();
- //replace line breaks with new lines
- content = content.replaceAll(BR_TAG_PATTERN, NEW_LINE_PATTERN);
- Matcher tagMatcher = TAG_PATTERN.matcher(content);
- //remove html tags
- content = tagMatcher.replaceAll("");
- sb.append(content);
+ String mailPartContent = part.getContent().toString();
+
+ //create a temporary html file with same mail part content and encoding
+ File tempHtmlFile = TempFileProvider.createTempFile("EMLTransformer_", ".html");
+ ContentWriter contentWriter = new FileContentWriter(tempHtmlFile);
+ contentWriter.setEncoding(getMailPartContentEncoding(part));
+ contentWriter.setMimetype(MimetypeMap.MIMETYPE_HTML);
+ contentWriter.putContent(mailPartContent);
+
+ //transform html file's content to plain text
+ EncodingAwareStringBean extractor = new EncodingAwareStringBean();
+ extractor.setCollapse(false);
+ extractor.setLinks(false);
+ extractor.setReplaceNonBreakingSpaces(false);
+ extractor.setURL(tempHtmlFile, contentWriter.getEncoding());
+ sb.append(extractor.getStrings());
+
+ tempHtmlFile.delete();
}
}
+
+ /**
+  * Determines the character encoding declared in the Content-Type header of a mail part.
+  * <p>
+  * Only the value of the <code>charset</code> parameter is returned: the value ends at the
+  * next <code>;</code> (so trailing parameters such as <code>format=flowed</code> are not
+  * included), surrounding quotes and whitespace are removed, and the parameter name is
+  * matched case-insensitively.
+  *
+  * @param part the mail part whose Content-Type header is inspected
+  * @return the declared charset, or {@link #DEFAULT_ENCODING} when none is declared
+  * @throws MessagingException if the content type of the part cannot be read
+  */
+ private String getMailPartContentEncoding(Part part) throws MessagingException
+ {
+     String contentType = part.getContentType();
+     if (contentType == null)
+     {
+         return DEFAULT_ENCODING;
+     }
+
+     // locate "charset=" ignoring case, without lower-casing the whole header
+     String parameterKey = CHARSET + "=";
+     int valueStart = -1;
+     for (int i = 0; i + parameterKey.length() <= contentType.length(); i++)
+     {
+         if (contentType.regionMatches(true, i, parameterKey, 0, parameterKey.length()))
+         {
+             valueStart = i + parameterKey.length();
+             break;
+         }
+     }
+     if (valueStart < 0)
+     {
+         return DEFAULT_ENCODING;
+     }
+
+     // the charset value stops at the next parameter separator, if any
+     String encoding = contentType.substring(valueStart);
+     int valueEnd = encoding.indexOf(';');
+     if (valueEnd >= 0)
+     {
+         encoding = encoding.substring(0, valueEnd);
+     }
+     encoding = encoding.replace("\"", "").trim();
+
+     return encoding.isEmpty() ? DEFAULT_ENCODING : encoding;
+ }
}
diff --git a/source/java/org/alfresco/repo/content/transform/EncodingAwareStringBean.java b/source/java/org/alfresco/repo/content/transform/EncodingAwareStringBean.java
new file mode 100644
index 0000000000..c05a531fea
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/transform/EncodingAwareStringBean.java
@@ -0,0 +1,69 @@
+package org.alfresco.repo.content.transform;
+
+import java.io.File;
+import java.net.URLConnection;
+
+import org.htmlparser.Parser;
+import org.htmlparser.beans.StringBean;
+import org.htmlparser.util.ParserException;
+
+/**
+ * A {@link StringBean} variant that lets the caller choose the encoding
+ * used by the underlying HTML Parser.
+ * StringBean offers no simple hook for this, so part of its URL handling
+ * is duplicated here.
+ * This makes it possible to handle HTML files whose encoding is held
+ * against the content property (rather than in the HTML Head Meta),
+ * see ALF-10466 for details.
+ */
+class EncodingAwareStringBean extends StringBean
+{
+    private static final long serialVersionUID = -9033414360428669553L;
+
+    /**
+     * Points the bean at the given File and, when an encoding is supplied,
+     * applies it to the parser before the strings are extracted.
+     * Nothing happens when the bean already points at the same path.
+     *
+     * @param file      The File that text should be fetched from.
+     * @param encoding  The encoding of the input, or <code>null</code> to leave
+     *                  the parser's encoding untouched
+     */
+    public void setURL(File file, String encoding)
+    {
+        String oldUrl = getURL();
+        String targetUrl = file.getAbsolutePath();
+
+        // same location as before: keep the current state
+        if (oldUrl != null && targetUrl.equals(oldUrl))
+        {
+            return;
+        }
+
+        try
+        {
+            // captured before the parser is re-targeted; used as the "old" value of the connection event
+            URLConnection oldConnection = getConnection();
+
+            if (mParser != null)
+            {
+                mParser.setURL(targetUrl);
+            }
+            else
+            {
+                mParser = new Parser(targetUrl);
+            }
+
+            if (encoding != null)
+            {
+                mParser.setEncoding(encoding);
+            }
+
+            mPropertySupport.firePropertyChange(PROP_URL_PROPERTY, oldUrl, getURL());
+            mPropertySupport.firePropertyChange(PROP_CONNECTION_PROPERTY, oldConnection, mParser.getConnection());
+            setStrings();
+        }
+        catch (ParserException parserFailure)
+        {
+            // the textual form of the exception is passed on as the bean's strings
+            updateStrings(parserFailure.toString());
+        }
+    }
+
+    /**
+     * @return the encoding reported by the underlying parser
+     */
+    public String getEncoding()
+    {
+        return mParser.getEncoding();
+    }
+}
\ No newline at end of file
diff --git a/source/java/org/alfresco/repo/content/transform/HtmlParserContentTransformer.java b/source/java/org/alfresco/repo/content/transform/HtmlParserContentTransformer.java
index 2e7e7c81bb..42ed40aa66 100644
--- a/source/java/org/alfresco/repo/content/transform/HtmlParserContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/HtmlParserContentTransformer.java
@@ -19,8 +19,6 @@
package org.alfresco.repo.content.transform;
import java.io.File;
-import java.net.URLConnection;
-import java.util.Arrays;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
@@ -29,9 +27,7 @@ import org.alfresco.service.cmr.repository.TransformationOptions;
import org.alfresco.util.TempFileProvider;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-import org.htmlparser.Parser;
-import org.htmlparser.beans.StringBean;
-import org.htmlparser.util.ParserException;
+
/**
* Content transformer which wraps the HTML Parser library for
@@ -110,61 +106,4 @@ public class HtmlParserContentTransformer extends AbstractContentTransformer2
// Tidy up
htmlFile.delete();
}
-
- /**
- * A version of {@link StringBean} which allows control of the
- * encoding in the underlying HTML Parser.
- * Unfortunately, StringBean doesn't allow easy over-riding of
- * this, so we have to duplicate some code to control this.
- * This allows us to correctly handle HTML files where the encoding
- * is specified against the content property (rather than in the
- * HTML Head Meta), see ALF-10466 for details.
- */
- class EncodingAwareStringBean extends StringBean
- {
- private static final long serialVersionUID = -9033414360428669553L;
-
- /**
- * Sets the File to extract strings from, and the encoding
- * it's in (if known to Alfresco)
- *
- * @param file The File that text should be fetched from.
- * @param encoding The encoding of the input
- */
- public void setURL(File file, String encoding)
- {
- String previousURL = getURL();
- String newURL = file.getAbsolutePath();
-
- if ( (previousURL == null) || (!newURL.equals(previousURL)) )
- {
- try
- {
- URLConnection conn = getConnection();
-
- if (null == mParser)
- {
- mParser = new Parser(newURL);
- }
- else
- {
- mParser.setURL(newURL);
- }
-
- if (encoding != null)
- {
- mParser.setEncoding(encoding);
- }
-
- mPropertySupport.firePropertyChange(PROP_URL_PROPERTY, previousURL, getURL());
- mPropertySupport.firePropertyChange(PROP_CONNECTION_PROPERTY, conn, mParser.getConnection());
- setStrings();
- }
- catch (ParserException pe)
- {
- updateStrings(pe.toString());
- }
- }
- }
- }
}
diff --git a/source/test-java/org/alfresco/repo/content/transform/EMLTransformerTest.java b/source/test-java/org/alfresco/repo/content/transform/EMLTransformerTest.java
index 04803b1b08..742618336b 100644
--- a/source/test-java/org/alfresco/repo/content/transform/EMLTransformerTest.java
+++ b/source/test-java/org/alfresco/repo/content/transform/EMLTransformerTest.java
@@ -44,7 +44,9 @@ public class EMLTransformerTest extends AbstractContentTransformerTest
private static final String QUICK_EML_ATTACHMENT_CONTENT = "File attachment content";
- private static final String QUICK_EML_ALTERNATIVE_CONTENT = "alternative html text";
+ private static final String QUICK_EML_ALTERNATIVE_CONTENT = "alternative plain text";
+
+ private static final String HTML_SPACE_SPECIAL_CHAR = " ";
private EMLTransformer transformer;
@@ -113,7 +115,7 @@ public class EMLTransformerTest extends AbstractContentTransformerTest
}
/**
- * Test transforming a valid eml with an attachment to text; attachment should be ingnored
+ * Test transforming a valid eml with an attachment to text; attachment should be ignored
*/
public void testRFC822WithAttachmentToText() throws Exception
{
@@ -152,4 +154,24 @@ public class EMLTransformerTest extends AbstractContentTransformerTest
String contentStr = reader2.getContentString();
assertTrue(contentStr.contains(QUICK_EML_ALTERNATIVE_CONTENT));
}
+
+ /**
+  * Transforms the htmlChars.eml quick file to plain text and checks that the
+  * result does not contain {@link #HTML_SPACE_SPECIAL_CHAR}
+  */
+ public void testHtmlSpecialCharsToText() throws Exception
+ {
+     File sourceEml = loadQuickTestFile("htmlChars.eml");
+     File targetTxt = TempFileProvider.createTempFile("test5", ".txt");
+
+     ContentReader emlReader = new FileContentReader(sourceEml);
+     emlReader.setMimetype(MimetypeMap.MIMETYPE_RFC822);
+     ContentWriter txtWriter = new FileContentWriter(targetTxt);
+     txtWriter.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
+
+     transformer.transform(emlReader, txtWriter);
+
+     // read the transformation result back and inspect it
+     ContentReader resultReader = new FileContentReader(targetTxt);
+     resultReader.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
+     boolean specialCharLeft = resultReader.getContentString().contains(HTML_SPACE_SPECIAL_CHAR);
+     assertTrue(!specialCharLeft);
+ }
}