mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-10-15 15:02:20 +00:00
Merged HEAD-BUG-FIX (5.1/Cloud) to HEAD (5.1/Cloud)
103441: Merged 5.0.N (5.0.2) to HEAD-BUG-FIX (5.1/Cloud) 103246: Merged V4.2-BUG-FIX (4.2.5) to 5.0.N (5.0.2) 103035: Merged NESS/4.2.N-2015_03_12 (4.2.5) to V4.2-BUG-FIX (4.2.5) 102240: MNT-13531: EMLTransformer ignoring multipart emails - used htmlparser to extract text from html mail part - added test to check if html special chars appear in transformation result 102375: MNT-13531: EMLTransformer ignoring multipart emails - use plain/text representation if present, prior to html representation on multipart/alternative parts git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@103625 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
28
config/quick/quick.htmlChars.eml
Normal file
28
config/quick/quick.htmlChars.eml
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
MIME-Version: 1.0
|
||||||
|
Received: by 10.000.0.000 with HTTP; Thu, 16 Aug 2012 08:13:29 -0700 (PDT)
|
||||||
|
Date: Thu, 16 Aug 2012 16:13:29 +0100
|
||||||
|
Delivered-To: jane.doe@alfresco.com
|
||||||
|
Message-ID: <CAL0uq1f9vPczLRinL3xB5U_oSSd5U0ob=408nBgosCY0OVFyBw@mail.alfresco.com>
|
||||||
|
Subject: Attachment test
|
||||||
|
From: <john.doe@alfresco.com>
|
||||||
|
To: <jane.doe@alfresco.com>
|
||||||
|
Content-Type: multipart/alternative;
|
||||||
|
boundary="----=_NextPart_000_0005_01D06C6A.DBA98EC0"
|
||||||
|
|
||||||
|
This is a multipart message in MIME format.
|
||||||
|
|
||||||
|
------=_NextPart_000_0005_01D06C6A.DBA98EC0
|
||||||
|
Content-Type: text/plain;
|
||||||
|
charset="utf-8"
|
||||||
|
Content-Transfer-Encoding: 7bit
|
||||||
|
|
||||||
|
html special characters
|
||||||
|
|
||||||
|
------=_NextPart_000_0005_01D06C6A.DBA98EC0
|
||||||
|
Content-Type: text/html;
|
||||||
|
charset="utf-8"
|
||||||
|
Content-Transfer-Encoding: quoted-printable
|
||||||
|
|
||||||
|
<div dir=3D"ltr">html special characters</div>
|
||||||
|
|
||||||
|
------=_NextPart_000_0005_01D06C6A.DBA98EC0--
|
@@ -19,11 +19,11 @@
|
|||||||
|
|
||||||
package org.alfresco.repo.content.transform;
|
package org.alfresco.repo.content.transform;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
import java.util.regex.Matcher;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
import javax.mail.MessagingException;
|
import javax.mail.MessagingException;
|
||||||
import javax.mail.Multipart;
|
import javax.mail.Multipart;
|
||||||
@@ -32,9 +32,11 @@ import javax.mail.Session;
|
|||||||
import javax.mail.internet.MimeMessage;
|
import javax.mail.internet.MimeMessage;
|
||||||
|
|
||||||
import org.alfresco.repo.content.MimetypeMap;
|
import org.alfresco.repo.content.MimetypeMap;
|
||||||
|
import org.alfresco.repo.content.filestore.FileContentWriter;
|
||||||
import org.alfresco.service.cmr.repository.ContentReader;
|
import org.alfresco.service.cmr.repository.ContentReader;
|
||||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||||
|
import org.alfresco.util.TempFileProvider;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -48,9 +50,8 @@ import org.alfresco.service.cmr.repository.TransformationOptions;
|
|||||||
public class EMLTransformer extends AbstractContentTransformer2
|
public class EMLTransformer extends AbstractContentTransformer2
|
||||||
|
|
||||||
{
|
{
|
||||||
private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");
|
private static final String CHARSET = "charset";
|
||||||
private static final String BR_TAG_PATTERN = "<[bB][rR].?\\/?>";
|
private static final String DEFAULT_ENCODING = "UTF-8";
|
||||||
private static final String NEW_LINE_PATTERN = "\n";
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isTransformableMimetype(String sourceMimetype, String targetMimetype, TransformationOptions options)
|
public boolean isTransformableMimetype(String sourceMimetype, String targetMimetype, TransformationOptions options)
|
||||||
@@ -144,7 +145,6 @@ public class EMLTransformer extends AbstractContentTransformer2
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Finds the suitable part from an multipart/alternative and appends it's text content to StringBuilder sb
|
* Finds the suitable part from an multipart/alternative and appends it's text content to StringBuilder sb
|
||||||
* Html parts have higher priority than text parts
|
|
||||||
*
|
*
|
||||||
* @param multipart
|
* @param multipart
|
||||||
* @param sb
|
* @param sb
|
||||||
@@ -160,10 +160,10 @@ public class EMLTransformer extends AbstractContentTransformer2
|
|||||||
if (part.getContentType().contains(MimetypeMap.MIMETYPE_TEXT_PLAIN))
|
if (part.getContentType().contains(MimetypeMap.MIMETYPE_TEXT_PLAIN))
|
||||||
{
|
{
|
||||||
partToUse = part;
|
partToUse = part;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
else if (part.getContentType().contains(MimetypeMap.MIMETYPE_HTML)){
|
else if (part.getContentType().contains(MimetypeMap.MIMETYPE_HTML)){
|
||||||
partToUse = part;
|
partToUse = part;
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (partToUse != null)
|
if (partToUse != null)
|
||||||
@@ -186,7 +186,7 @@ public class EMLTransformer extends AbstractContentTransformer2
|
|||||||
boolean isAttachment = Part.ATTACHMENT.equalsIgnoreCase(part.getDisposition());
|
boolean isAttachment = Part.ATTACHMENT.equalsIgnoreCase(part.getDisposition());
|
||||||
if (isAttachment)
|
if (isAttachment)
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (part.getContentType().contains(MimetypeMap.MIMETYPE_TEXT_PLAIN))
|
if (part.getContentType().contains(MimetypeMap.MIMETYPE_TEXT_PLAIN))
|
||||||
{
|
{
|
||||||
@@ -194,14 +194,37 @@ public class EMLTransformer extends AbstractContentTransformer2
|
|||||||
}
|
}
|
||||||
else if (part.getContentType().contains(MimetypeMap.MIMETYPE_HTML))
|
else if (part.getContentType().contains(MimetypeMap.MIMETYPE_HTML))
|
||||||
{
|
{
|
||||||
String content = part.getContent().toString();
|
String mailPartContent = part.getContent().toString();
|
||||||
//replace line breaks with new lines
|
|
||||||
content = content.replaceAll(BR_TAG_PATTERN, NEW_LINE_PATTERN);
|
//create a temporary html file with same mail part content and encoding
|
||||||
Matcher tagMatcher = TAG_PATTERN.matcher(content);
|
File tempHtmlFile = TempFileProvider.createTempFile("EMLTransformer_", ".html");
|
||||||
//remove html tags
|
ContentWriter contentWriter = new FileContentWriter(tempHtmlFile);
|
||||||
content = tagMatcher.replaceAll("");
|
contentWriter.setEncoding(getMailPartContentEncoding(part));
|
||||||
sb.append(content);
|
contentWriter.setMimetype(MimetypeMap.MIMETYPE_HTML);
|
||||||
|
contentWriter.putContent(mailPartContent);
|
||||||
|
|
||||||
|
//transform html file's content to plain text
|
||||||
|
EncodingAwareStringBean extractor = new EncodingAwareStringBean();
|
||||||
|
extractor.setCollapse(false);
|
||||||
|
extractor.setLinks(false);
|
||||||
|
extractor.setReplaceNonBreakingSpaces(false);
|
||||||
|
extractor.setURL(tempHtmlFile, contentWriter.getEncoding());
|
||||||
|
sb.append(extractor.getStrings());
|
||||||
|
|
||||||
|
tempHtmlFile.delete();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String getMailPartContentEncoding(Part part) throws MessagingException
|
||||||
|
{
|
||||||
|
String encoding = DEFAULT_ENCODING;
|
||||||
|
String contentType = part.getContentType();
|
||||||
|
int startIndex = contentType.indexOf(CHARSET);
|
||||||
|
if (startIndex > 0)
|
||||||
|
{
|
||||||
|
encoding = contentType.substring(startIndex + CHARSET.length() + 1).replaceAll("\"", "");
|
||||||
|
}
|
||||||
|
return encoding;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -0,0 +1,69 @@
|
|||||||
|
package org.alfresco.repo.content.transform;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.net.URLConnection;
|
||||||
|
|
||||||
|
import org.htmlparser.Parser;
|
||||||
|
import org.htmlparser.beans.StringBean;
|
||||||
|
import org.htmlparser.util.ParserException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A version of {@link StringBean} which allows control of the
|
||||||
|
* encoding in the underlying HTML Parser.
|
||||||
|
* Unfortunately, StringBean doesn't allow easy over-riding of
|
||||||
|
* this, so we have to duplicate some code to control this.
|
||||||
|
* This allows us to correctly handle HTML files where the encoding
|
||||||
|
* is specified against the content property (rather than in the
|
||||||
|
* HTML Head Meta), see ALF-10466 for details.
|
||||||
|
*/
|
||||||
|
class EncodingAwareStringBean extends StringBean
|
||||||
|
{
|
||||||
|
private static final long serialVersionUID = -9033414360428669553L;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the File to extract strings from, and the encoding
|
||||||
|
* it's in (if known to Alfresco)
|
||||||
|
*
|
||||||
|
* @param file The File that text should be fetched from.
|
||||||
|
* @param encoding The encoding of the input
|
||||||
|
*/
|
||||||
|
public void setURL(File file, String encoding)
|
||||||
|
{
|
||||||
|
String previousURL = getURL();
|
||||||
|
String newURL = file.getAbsolutePath();
|
||||||
|
|
||||||
|
if ( (previousURL == null) || (!newURL.equals(previousURL)) )
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
URLConnection conn = getConnection();
|
||||||
|
|
||||||
|
if (null == mParser)
|
||||||
|
{
|
||||||
|
mParser = new Parser(newURL);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
mParser.setURL(newURL);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (encoding != null)
|
||||||
|
{
|
||||||
|
mParser.setEncoding(encoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
mPropertySupport.firePropertyChange(PROP_URL_PROPERTY, previousURL, getURL());
|
||||||
|
mPropertySupport.firePropertyChange(PROP_CONNECTION_PROPERTY, conn, mParser.getConnection());
|
||||||
|
setStrings();
|
||||||
|
}
|
||||||
|
catch (ParserException pe)
|
||||||
|
{
|
||||||
|
updateStrings(pe.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getEncoding(){
|
||||||
|
return mParser.getEncoding();
|
||||||
|
}
|
||||||
|
}
|
@@ -19,8 +19,6 @@
|
|||||||
package org.alfresco.repo.content.transform;
|
package org.alfresco.repo.content.transform;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.net.URLConnection;
|
|
||||||
import java.util.Arrays;
|
|
||||||
|
|
||||||
import org.alfresco.repo.content.MimetypeMap;
|
import org.alfresco.repo.content.MimetypeMap;
|
||||||
import org.alfresco.service.cmr.repository.ContentReader;
|
import org.alfresco.service.cmr.repository.ContentReader;
|
||||||
@@ -29,9 +27,7 @@ import org.alfresco.service.cmr.repository.TransformationOptions;
|
|||||||
import org.alfresco.util.TempFileProvider;
|
import org.alfresco.util.TempFileProvider;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.htmlparser.Parser;
|
|
||||||
import org.htmlparser.beans.StringBean;
|
|
||||||
import org.htmlparser.util.ParserException;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Content transformer which wraps the HTML Parser library for
|
* Content transformer which wraps the HTML Parser library for
|
||||||
@@ -110,61 +106,4 @@ public class HtmlParserContentTransformer extends AbstractContentTransformer2
|
|||||||
// Tidy up
|
// Tidy up
|
||||||
htmlFile.delete();
|
htmlFile.delete();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* A version of {@link StringBean} which allows control of the
|
|
||||||
* encoding in the underlying HTML Parser.
|
|
||||||
* Unfortunately, StringBean doesn't allow easy over-riding of
|
|
||||||
* this, so we have to duplicate some code to control this.
|
|
||||||
* This allows us to correctly handle HTML files where the encoding
|
|
||||||
* is specified against the content property (rather than in the
|
|
||||||
* HTML Head Meta), see ALF-10466 for details.
|
|
||||||
*/
|
|
||||||
class EncodingAwareStringBean extends StringBean
|
|
||||||
{
|
|
||||||
private static final long serialVersionUID = -9033414360428669553L;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Sets the File to extract strings from, and the encoding
|
|
||||||
* it's in (if known to Alfresco)
|
|
||||||
*
|
|
||||||
* @param file The File that text should be fetched from.
|
|
||||||
* @param encoding The encoding of the input
|
|
||||||
*/
|
|
||||||
public void setURL(File file, String encoding)
|
|
||||||
{
|
|
||||||
String previousURL = getURL();
|
|
||||||
String newURL = file.getAbsolutePath();
|
|
||||||
|
|
||||||
if ( (previousURL == null) || (!newURL.equals(previousURL)) )
|
|
||||||
{
|
|
||||||
try
|
|
||||||
{
|
|
||||||
URLConnection conn = getConnection();
|
|
||||||
|
|
||||||
if (null == mParser)
|
|
||||||
{
|
|
||||||
mParser = new Parser(newURL);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
mParser.setURL(newURL);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (encoding != null)
|
|
||||||
{
|
|
||||||
mParser.setEncoding(encoding);
|
|
||||||
}
|
|
||||||
|
|
||||||
mPropertySupport.firePropertyChange(PROP_URL_PROPERTY, previousURL, getURL());
|
|
||||||
mPropertySupport.firePropertyChange(PROP_CONNECTION_PROPERTY, conn, mParser.getConnection());
|
|
||||||
setStrings();
|
|
||||||
}
|
|
||||||
catch (ParserException pe)
|
|
||||||
{
|
|
||||||
updateStrings(pe.toString());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@@ -44,7 +44,9 @@ public class EMLTransformerTest extends AbstractContentTransformerTest
|
|||||||
|
|
||||||
private static final String QUICK_EML_ATTACHMENT_CONTENT = "File attachment content";
|
private static final String QUICK_EML_ATTACHMENT_CONTENT = "File attachment content";
|
||||||
|
|
||||||
private static final String QUICK_EML_ALTERNATIVE_CONTENT = "alternative html text";
|
private static final String QUICK_EML_ALTERNATIVE_CONTENT = "alternative plain text";
|
||||||
|
|
||||||
|
// HTML entity for a non-breaking space; the test asserts it is absent from the transformed text
private static final String HTML_SPACE_SPECIAL_CHAR = "&nbsp;";
|
||||||
|
|
||||||
private EMLTransformer transformer;
|
private EMLTransformer transformer;
|
||||||
|
|
||||||
@@ -113,7 +115,7 @@ public class EMLTransformerTest extends AbstractContentTransformerTest
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test transforming a valid eml with an attachment to text; attachment should be ingnored
|
* Test transforming a valid eml with an attachment to text; attachment should be ignored
|
||||||
*/
|
*/
|
||||||
public void testRFC822WithAttachmentToText() throws Exception
|
public void testRFC822WithAttachmentToText() throws Exception
|
||||||
{
|
{
|
||||||
@@ -152,4 +154,24 @@ public class EMLTransformerTest extends AbstractContentTransformerTest
|
|||||||
String contentStr = reader2.getContentString();
|
String contentStr = reader2.getContentString();
|
||||||
assertTrue(contentStr.contains(QUICK_EML_ALTERNATIVE_CONTENT));
|
assertTrue(contentStr.contains(QUICK_EML_ALTERNATIVE_CONTENT));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test transforming a valid eml with a html part containing html special characters to text
|
||||||
|
*/
|
||||||
|
public void testHtmlSpecialCharsToText() throws Exception
|
||||||
|
{
|
||||||
|
File emlSourceFile = loadQuickTestFile("htmlChars.eml");
|
||||||
|
File txtTargetFile = TempFileProvider.createTempFile("test5", ".txt");
|
||||||
|
ContentReader reader = new FileContentReader(emlSourceFile);
|
||||||
|
reader.setMimetype(MimetypeMap.MIMETYPE_RFC822);
|
||||||
|
ContentWriter writer = new FileContentWriter(txtTargetFile);
|
||||||
|
writer.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
|
||||||
|
|
||||||
|
transformer.transform(reader, writer);
|
||||||
|
|
||||||
|
ContentReader reader2 = new FileContentReader(txtTargetFile);
|
||||||
|
reader2.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
|
||||||
|
String contentStr = reader2.getContentString();
|
||||||
|
assertTrue(!contentStr.contains(HTML_SPACE_SPECIAL_CHAR));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user