Merged HEAD-BUG-FIX (5.1/Cloud) to HEAD (5.1/Cloud)

103441: Merged 5.0.N (5.0.2) to HEAD-BUG-FIX (5.1/Cloud)
      103246: Merged V4.2-BUG-FIX (4.2.5) to 5.0.N (5.0.2)
         103035: Merged NESS/4.2.N-2015_03_12 (4.2.5) to V4.2-BUG-FIX (4.2.5)
            102240: MNT-13531: EMLTransformer ignoring multipart emails
               - used htmlparser to extract text from html mail part
               - added test to check if html special chars appear in transformation result
            102375: MNT-13531: EMLTransformer ignoring multipart emails
               - use the text/plain representation, if present, in preference to the html representation on multipart/alternative parts


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@103625 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Alan Davis
2015-05-02 07:44:41 +00:00
parent c9d5efcef6
commit fba0e07a62
5 changed files with 160 additions and 79 deletions

View File

@@ -19,11 +19,11 @@
package org.alfresco.repo.content.transform;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.mail.MessagingException;
import javax.mail.Multipart;
@@ -32,9 +32,11 @@ import javax.mail.Session;
import javax.mail.internet.MimeMessage;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.filestore.FileContentWriter;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.alfresco.util.TempFileProvider;
/**
@@ -48,9 +50,8 @@ import org.alfresco.service.cmr.repository.TransformationOptions;
public class EMLTransformer extends AbstractContentTransformer2
{
private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");
private static final String BR_TAG_PATTERN = "<[bB][rR].?\\/?>";
private static final String NEW_LINE_PATTERN = "\n";
private static final String CHARSET = "charset";
private static final String DEFAULT_ENCODING = "UTF-8";
@Override
public boolean isTransformableMimetype(String sourceMimetype, String targetMimetype, TransformationOptions options)
@@ -144,7 +145,6 @@ public class EMLTransformer extends AbstractContentTransformer2
/**
* Finds the suitable part from a multipart/alternative and appends its text content to StringBuilder sb
* Html parts have higher priority than text parts
*
* @param multipart
* @param sb
@@ -160,10 +160,10 @@ public class EMLTransformer extends AbstractContentTransformer2
if (part.getContentType().contains(MimetypeMap.MIMETYPE_TEXT_PLAIN))
{
partToUse = part;
break;
}
else if (part.getContentType().contains(MimetypeMap.MIMETYPE_HTML)){
partToUse = part;
break;
}
}
if (partToUse != null)
@@ -186,7 +186,7 @@ public class EMLTransformer extends AbstractContentTransformer2
boolean isAttachment = Part.ATTACHMENT.equalsIgnoreCase(part.getDisposition());
if (isAttachment)
{
return;
return;
}
if (part.getContentType().contains(MimetypeMap.MIMETYPE_TEXT_PLAIN))
{
@@ -194,14 +194,37 @@ public class EMLTransformer extends AbstractContentTransformer2
}
else if (part.getContentType().contains(MimetypeMap.MIMETYPE_HTML))
{
String content = part.getContent().toString();
//replace line breaks with new lines
content = content.replaceAll(BR_TAG_PATTERN, NEW_LINE_PATTERN);
Matcher tagMatcher = TAG_PATTERN.matcher(content);
//remove html tags
content = tagMatcher.replaceAll("");
sb.append(content);
String mailPartContent = part.getContent().toString();
//create a temporary html file with same mail part content and encoding
File tempHtmlFile = TempFileProvider.createTempFile("EMLTransformer_", ".html");
ContentWriter contentWriter = new FileContentWriter(tempHtmlFile);
contentWriter.setEncoding(getMailPartContentEncoding(part));
contentWriter.setMimetype(MimetypeMap.MIMETYPE_HTML);
contentWriter.putContent(mailPartContent);
//transform html file's content to plain text
EncodingAwareStringBean extractor = new EncodingAwareStringBean();
extractor.setCollapse(false);
extractor.setLinks(false);
extractor.setReplaceNonBreakingSpaces(false);
extractor.setURL(tempHtmlFile, contentWriter.getEncoding());
sb.append(extractor.getStrings());
tempHtmlFile.delete();
}
}
/**
 * Matches the <code>charset</code> parameter inside a Content-Type header value.
 * Group 1 holds a quoted value (without the quotes), group 2 an unquoted token.
 * The match is case-insensitive and tolerates whitespace around the equals sign.
 */
private static final java.util.regex.Pattern CHARSET_PARAM_PATTERN =
        java.util.regex.Pattern.compile("(?i)\\bcharset\\s*=\\s*(?:\"([^\"]*)\"|([^;\\s\"]+))");

/**
 * Determines the character encoding declared in the Content-Type of a mail part.
 * <p>
 * Only the value of the <code>charset</code> parameter is returned; any parameters that
 * follow it (for example <code>; format=flowed</code>) are not included in the result.
 *
 * @param part the mail part whose Content-Type is inspected
 * @return the declared charset, or {@link #DEFAULT_ENCODING} when the Content-Type is
 *         absent, carries no charset parameter, or the parameter value is empty
 * @throws MessagingException if the Content-Type of the part cannot be read
 */
private String getMailPartContentEncoding(Part part) throws MessagingException
{
    String contentType = part.getContentType();
    if (contentType == null)
    {
        return DEFAULT_ENCODING;
    }

    java.util.regex.Matcher matcher = CHARSET_PARAM_PATTERN.matcher(contentType);
    if (!matcher.find())
    {
        return DEFAULT_ENCODING;
    }

    // exactly one of the two groups is set: quoted value or bare token
    String encoding = matcher.group(1) != null ? matcher.group(1) : matcher.group(2);
    encoding = encoding.trim();

    // an empty declaration (charset="") is treated the same as a missing one
    return encoding.isEmpty() ? DEFAULT_ENCODING : encoding;
}
}

View File

@@ -0,0 +1,69 @@
package org.alfresco.repo.content.transform;
import java.io.File;
import java.net.URLConnection;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.util.ParserException;
/**
 * A {@link StringBean} variant that lets the caller choose the encoding
 * applied to the underlying HTML Parser.
 * StringBean offers no simple hook for overriding this, so part of its
 * URL handling is repeated here.
 * This is needed for HTML files whose encoding is recorded against the
 * content property rather than in the HTML Head Meta; see ALF-10466.
 */
class EncodingAwareStringBean extends StringBean
{
    private static final long serialVersionUID = -9033414360428669553L;

    /**
     * Points the bean at a file and, when supplied, forces the encoding
     * the parser uses, then triggers string extraction.
     * Nothing happens if the file's absolute path equals the current URL.
     *
     * @param file The File that text should be fetched from.
     * @param encoding The encoding of the input; ignored when <code>null</code>
     */
    public void setURL(File file, String encoding)
    {
        final String oldUrl = getURL();
        final String path = file.getAbsolutePath();

        // same location as before: nothing to reload
        if (oldUrl != null && path.equals(oldUrl))
        {
            return;
        }

        try
        {
            // capture the connection before the parser is re-targeted,
            // so it can be reported as the old value in the change event
            final URLConnection oldConnection = getConnection();

            if (mParser != null)
            {
                mParser.setURL(path);
            }
            else
            {
                mParser = new Parser(path);
            }

            if (encoding != null)
            {
                mParser.setEncoding(encoding);
            }

            mPropertySupport.firePropertyChange(PROP_URL_PROPERTY, oldUrl, getURL());
            mPropertySupport.firePropertyChange(PROP_CONNECTION_PROPERTY, oldConnection, mParser.getConnection());
            setStrings();
        }
        catch (ParserException parseFailure)
        {
            // the failure description is passed to updateStrings instead of being rethrown
            updateStrings(parseFailure.toString());
        }
    }

    /**
     * @return the encoding currently reported by the underlying parser
     */
    public String getEncoding()
    {
        return mParser.getEncoding();
    }
}

View File

@@ -19,8 +19,6 @@
package org.alfresco.repo.content.transform;
import java.io.File;
import java.net.URLConnection;
import java.util.Arrays;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
@@ -29,9 +27,7 @@ import org.alfresco.service.cmr.repository.TransformationOptions;
import org.alfresco.util.TempFileProvider;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.util.ParserException;
/**
* Content transformer which wraps the HTML Parser library for
@@ -110,61 +106,4 @@ public class HtmlParserContentTransformer extends AbstractContentTransformer2
// Tidy up
htmlFile.delete();
}
/**
 * A {@link StringBean} variant that lets the caller choose the encoding
 * applied to the underlying HTML Parser.
 * StringBean offers no simple hook for overriding this, so part of its
 * URL handling is repeated here.
 * This is needed for HTML files whose encoding is recorded against the
 * content property rather than in the HTML Head Meta; see ALF-10466.
 */
class EncodingAwareStringBean extends StringBean
{
    private static final long serialVersionUID = -9033414360428669553L;

    /**
     * Points the bean at a file and, when supplied, forces the encoding
     * the parser uses, then triggers string extraction.
     * Nothing happens if the file's absolute path equals the current URL.
     *
     * @param file The File that text should be fetched from.
     * @param encoding The encoding of the input; ignored when <code>null</code>
     */
    public void setURL(File file, String encoding)
    {
        final String oldUrl = getURL();
        final String path = file.getAbsolutePath();

        // same location as before: nothing to reload
        if (oldUrl != null && path.equals(oldUrl))
        {
            return;
        }

        try
        {
            // capture the connection before the parser is re-targeted,
            // so it can be reported as the old value in the change event
            final URLConnection oldConnection = getConnection();

            if (mParser != null)
            {
                mParser.setURL(path);
            }
            else
            {
                mParser = new Parser(path);
            }

            if (encoding != null)
            {
                mParser.setEncoding(encoding);
            }

            mPropertySupport.firePropertyChange(PROP_URL_PROPERTY, oldUrl, getURL());
            mPropertySupport.firePropertyChange(PROP_CONNECTION_PROPERTY, oldConnection, mParser.getConnection());
            setStrings();
        }
        catch (ParserException parseFailure)
        {
            // the failure description is passed to updateStrings instead of being rethrown
            updateStrings(parseFailure.toString());
        }
    }
}
}