mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-10-15 15:02:20 +00:00
Merged HEAD-BUG-FIX (5.1/Cloud) to HEAD (5.1/Cloud)
103441: Merged 5.0.N (5.0.2) to HEAD-BUG-FIX (5.1/Cloud) 103246: Merged V4.2-BUG-FIX (4.2.5) to 5.0.N (5.0.2) 103035: Merged NESS/4.2.N-2015_03_12 (4.2.5) to V4.2-BUG-FIX (4.2.5) 102240: MNT-13531: EMLTransformer ignoring multipart emails - used htmlparser to extract text from html mail part - added test to check if html special chars appear in transformation result 102375: MNT-13531: EMLTransformer ignoring multipart emails - use plain/text representation if present, prior to html representation on multipart/alternative parts git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@103625 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
28
config/quick/quick.htmlChars.eml
Normal file
28
config/quick/quick.htmlChars.eml
Normal file
@@ -0,0 +1,28 @@
|
||||
MIME-Version: 1.0
|
||||
Received: by 10.000.0.000 with HTTP; Thu, 16 Aug 2012 08:13:29 -0700 (PDT)
|
||||
Date: Thu, 16 Aug 2012 16:13:29 +0100
|
||||
Delivered-To: jane.doe@alfresco.com
|
||||
Message-ID: <CAL0uq1f9vPczLRinL3xB5U_oSSd5U0ob=408nBgosCY0OVFyBw@mail.alfresco.com>
|
||||
Subject: Attachment test
|
||||
From: <john.doe@alfresco.com>
|
||||
To: <jane.doe@alfresco.com>
|
||||
Content-Type: multipart/alternative;
|
||||
boundary="----=_NextPart_000_0005_01D06C6A.DBA98EC0"
|
||||
|
||||
This is a multipart message in MIME format.
|
||||
|
||||
------=_NextPart_000_0005_01D06C6A.DBA98EC0
|
||||
Content-Type: text/plain;
|
||||
charset="utf-8"
|
||||
Content-Transfer-Encoding: 7bit
|
||||
|
||||
html special characters
|
||||
|
||||
------=_NextPart_000_0005_01D06C6A.DBA98EC0
|
||||
Content-Type: text/html;
|
||||
charset="utf-8"
|
||||
Content-Transfer-Encoding: quoted-printable
|
||||
|
||||
<div dir=3D"ltr">html special characters</div>
|
||||
|
||||
------=_NextPart_000_0005_01D06C6A.DBA98EC0--
|
@@ -19,11 +19,11 @@
|
||||
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Properties;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import javax.mail.MessagingException;
|
||||
import javax.mail.Multipart;
|
||||
@@ -32,9 +32,11 @@ import javax.mail.Session;
|
||||
import javax.mail.internet.MimeMessage;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.repo.content.filestore.FileContentWriter;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
import org.alfresco.util.TempFileProvider;
|
||||
|
||||
|
||||
/**
|
||||
@@ -48,9 +50,8 @@ import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
public class EMLTransformer extends AbstractContentTransformer2
|
||||
|
||||
{
|
||||
private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");
|
||||
private static final String BR_TAG_PATTERN = "<[bB][rR].?\\/?>";
|
||||
private static final String NEW_LINE_PATTERN = "\n";
|
||||
private static final String CHARSET = "charset";
|
||||
private static final String DEFAULT_ENCODING = "UTF-8";
|
||||
|
||||
@Override
|
||||
public boolean isTransformableMimetype(String sourceMimetype, String targetMimetype, TransformationOptions options)
|
||||
@@ -144,7 +145,6 @@ public class EMLTransformer extends AbstractContentTransformer2
|
||||
|
||||
/**
|
||||
* Finds the suitable part from a multipart/alternative and appends its text content to StringBuilder sb
|
||||
* Html parts have higher priority than text parts
|
||||
*
|
||||
* @param multipart
|
||||
* @param sb
|
||||
@@ -160,10 +160,10 @@ public class EMLTransformer extends AbstractContentTransformer2
|
||||
if (part.getContentType().contains(MimetypeMap.MIMETYPE_TEXT_PLAIN))
|
||||
{
|
||||
partToUse = part;
|
||||
break;
|
||||
}
|
||||
else if (part.getContentType().contains(MimetypeMap.MIMETYPE_HTML)){
|
||||
partToUse = part;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (partToUse != null)
|
||||
@@ -186,7 +186,7 @@ public class EMLTransformer extends AbstractContentTransformer2
|
||||
boolean isAttachment = Part.ATTACHMENT.equalsIgnoreCase(part.getDisposition());
|
||||
if (isAttachment)
|
||||
{
|
||||
return;
|
||||
return;
|
||||
}
|
||||
if (part.getContentType().contains(MimetypeMap.MIMETYPE_TEXT_PLAIN))
|
||||
{
|
||||
@@ -194,14 +194,37 @@ public class EMLTransformer extends AbstractContentTransformer2
|
||||
}
|
||||
else if (part.getContentType().contains(MimetypeMap.MIMETYPE_HTML))
|
||||
{
|
||||
String content = part.getContent().toString();
|
||||
//replace line breaks with new lines
|
||||
content = content.replaceAll(BR_TAG_PATTERN, NEW_LINE_PATTERN);
|
||||
Matcher tagMatcher = TAG_PATTERN.matcher(content);
|
||||
//remove html tags
|
||||
content = tagMatcher.replaceAll("");
|
||||
sb.append(content);
|
||||
String mailPartContent = part.getContent().toString();
|
||||
|
||||
//create a temporary html file with same mail part content and encoding
|
||||
File tempHtmlFile = TempFileProvider.createTempFile("EMLTransformer_", ".html");
|
||||
ContentWriter contentWriter = new FileContentWriter(tempHtmlFile);
|
||||
contentWriter.setEncoding(getMailPartContentEncoding(part));
|
||||
contentWriter.setMimetype(MimetypeMap.MIMETYPE_HTML);
|
||||
contentWriter.putContent(mailPartContent);
|
||||
|
||||
//transform html file's content to plain text
|
||||
EncodingAwareStringBean extractor = new EncodingAwareStringBean();
|
||||
extractor.setCollapse(false);
|
||||
extractor.setLinks(false);
|
||||
extractor.setReplaceNonBreakingSpaces(false);
|
||||
extractor.setURL(tempHtmlFile, contentWriter.getEncoding());
|
||||
sb.append(extractor.getStrings());
|
||||
|
||||
tempHtmlFile.delete();
|
||||
}
|
||||
}
|
||||
|
||||
private String getMailPartContentEncoding(Part part) throws MessagingException
|
||||
{
|
||||
String encoding = DEFAULT_ENCODING;
|
||||
String contentType = part.getContentType();
|
||||
int startIndex = contentType.indexOf(CHARSET);
|
||||
if (startIndex > 0)
|
||||
{
|
||||
encoding = contentType.substring(startIndex + CHARSET.length() + 1).replaceAll("\"", "");
|
||||
}
|
||||
return encoding;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -0,0 +1,69 @@
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.io.File;
|
||||
import java.net.URLConnection;
|
||||
|
||||
import org.htmlparser.Parser;
|
||||
import org.htmlparser.beans.StringBean;
|
||||
import org.htmlparser.util.ParserException;
|
||||
|
||||
/**
|
||||
* A version of {@link StringBean} which allows control of the
|
||||
* encoding in the underlying HTML Parser.
|
||||
* Unfortunately, StringBean doesn't allow easy over-riding of
|
||||
* this, so we have to duplicate some code to control this.
|
||||
* This allows us to correctly handle HTML files where the encoding
|
||||
* is specified against the content property (rather than in the
|
||||
* HTML Head Meta), see ALF-10466 for details.
|
||||
*/
|
||||
class EncodingAwareStringBean extends StringBean
|
||||
{
|
||||
private static final long serialVersionUID = -9033414360428669553L;
|
||||
|
||||
/**
|
||||
* Sets the File to extract strings from, and the encoding
|
||||
* it's in (if known to Alfresco)
|
||||
*
|
||||
* @param file The File that text should be fetched from.
|
||||
* @param encoding The encoding of the input
|
||||
*/
|
||||
public void setURL(File file, String encoding)
|
||||
{
|
||||
String previousURL = getURL();
|
||||
String newURL = file.getAbsolutePath();
|
||||
|
||||
if ( (previousURL == null) || (!newURL.equals(previousURL)) )
|
||||
{
|
||||
try
|
||||
{
|
||||
URLConnection conn = getConnection();
|
||||
|
||||
if (null == mParser)
|
||||
{
|
||||
mParser = new Parser(newURL);
|
||||
}
|
||||
else
|
||||
{
|
||||
mParser.setURL(newURL);
|
||||
}
|
||||
|
||||
if (encoding != null)
|
||||
{
|
||||
mParser.setEncoding(encoding);
|
||||
}
|
||||
|
||||
mPropertySupport.firePropertyChange(PROP_URL_PROPERTY, previousURL, getURL());
|
||||
mPropertySupport.firePropertyChange(PROP_CONNECTION_PROPERTY, conn, mParser.getConnection());
|
||||
setStrings();
|
||||
}
|
||||
catch (ParserException pe)
|
||||
{
|
||||
updateStrings(pe.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String getEncoding(){
|
||||
return mParser.getEncoding();
|
||||
}
|
||||
}
|
@@ -19,8 +19,6 @@
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.io.File;
|
||||
import java.net.URLConnection;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
@@ -29,9 +27,7 @@ import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
import org.alfresco.util.TempFileProvider;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.htmlparser.Parser;
|
||||
import org.htmlparser.beans.StringBean;
|
||||
import org.htmlparser.util.ParserException;
|
||||
|
||||
|
||||
/**
|
||||
* Content transformer which wraps the HTML Parser library for
|
||||
@@ -110,61 +106,4 @@ public class HtmlParserContentTransformer extends AbstractContentTransformer2
|
||||
// Tidy up
|
||||
htmlFile.delete();
|
||||
}
|
||||
|
||||
/**
|
||||
* A version of {@link StringBean} which allows control of the
|
||||
* encoding in the underlying HTML Parser.
|
||||
* Unfortunately, StringBean doesn't allow easy over-riding of
|
||||
* this, so we have to duplicate some code to control this.
|
||||
* This allows us to correctly handle HTML files where the encoding
|
||||
* is specified against the content property (rather than in the
|
||||
* HTML Head Meta), see ALF-10466 for details.
|
||||
*/
|
||||
class EncodingAwareStringBean extends StringBean
|
||||
{
|
||||
private static final long serialVersionUID = -9033414360428669553L;
|
||||
|
||||
/**
|
||||
* Sets the File to extract strings from, and the encoding
|
||||
* it's in (if known to Alfresco)
|
||||
*
|
||||
* @param file The File that text should be fetched from.
|
||||
* @param encoding The encoding of the input
|
||||
*/
|
||||
public void setURL(File file, String encoding)
|
||||
{
|
||||
String previousURL = getURL();
|
||||
String newURL = file.getAbsolutePath();
|
||||
|
||||
if ( (previousURL == null) || (!newURL.equals(previousURL)) )
|
||||
{
|
||||
try
|
||||
{
|
||||
URLConnection conn = getConnection();
|
||||
|
||||
if (null == mParser)
|
||||
{
|
||||
mParser = new Parser(newURL);
|
||||
}
|
||||
else
|
||||
{
|
||||
mParser.setURL(newURL);
|
||||
}
|
||||
|
||||
if (encoding != null)
|
||||
{
|
||||
mParser.setEncoding(encoding);
|
||||
}
|
||||
|
||||
mPropertySupport.firePropertyChange(PROP_URL_PROPERTY, previousURL, getURL());
|
||||
mPropertySupport.firePropertyChange(PROP_CONNECTION_PROPERTY, conn, mParser.getConnection());
|
||||
setStrings();
|
||||
}
|
||||
catch (ParserException pe)
|
||||
{
|
||||
updateStrings(pe.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -44,7 +44,9 @@ public class EMLTransformerTest extends AbstractContentTransformerTest
|
||||
|
||||
private static final String QUICK_EML_ATTACHMENT_CONTENT = "File attachment content";
|
||||
|
||||
private static final String QUICK_EML_ALTERNATIVE_CONTENT = "alternative html text";
|
||||
private static final String QUICK_EML_ALTERNATIVE_CONTENT = "alternative plain text";
|
||||
|
||||
private static final String HTML_SPACE_SPECIAL_CHAR = " ";
|
||||
|
||||
private EMLTransformer transformer;
|
||||
|
||||
@@ -113,7 +115,7 @@ public class EMLTransformerTest extends AbstractContentTransformerTest
|
||||
}
|
||||
|
||||
/**
|
||||
* Test transforming a valid eml with an attachment to text; attachment should be ingnored
|
||||
* Test transforming a valid eml with an attachment to text; attachment should be ignored
|
||||
*/
|
||||
public void testRFC822WithAttachmentToText() throws Exception
|
||||
{
|
||||
@@ -152,4 +154,24 @@ public class EMLTransformerTest extends AbstractContentTransformerTest
|
||||
String contentStr = reader2.getContentString();
|
||||
assertTrue(contentStr.contains(QUICK_EML_ALTERNATIVE_CONTENT));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test transforming a valid eml with a html part containing html special characters to text
|
||||
*/
|
||||
public void testHtmlSpecialCharsToText() throws Exception
|
||||
{
|
||||
File emlSourceFile = loadQuickTestFile("htmlChars.eml");
|
||||
File txtTargetFile = TempFileProvider.createTempFile("test5", ".txt");
|
||||
ContentReader reader = new FileContentReader(emlSourceFile);
|
||||
reader.setMimetype(MimetypeMap.MIMETYPE_RFC822);
|
||||
ContentWriter writer = new FileContentWriter(txtTargetFile);
|
||||
writer.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
|
||||
|
||||
transformer.transform(reader, writer);
|
||||
|
||||
ContentReader reader2 = new FileContentReader(txtTargetFile);
|
||||
reader2.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
|
||||
String contentStr = reader2.getContentString();
|
||||
assertTrue(!contentStr.contains(HTML_SPACE_SPECIAL_CHAR));
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user