mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-10-15 15:02:20 +00:00
Merged HEAD-BUG-FIX (5.1/Cloud) to HEAD (5.1/Cloud)
103441: Merged 5.0.N (5.0.2) to HEAD-BUG-FIX (5.1/Cloud) 103246: Merged V4.2-BUG-FIX (4.2.5) to 5.0.N (5.0.2) 103035: Merged NESS/4.2.N-2015_03_12 (4.2.5) to V4.2-BUG-FIX (4.2.5) 102240: MNT-13531: EMLTransformer ignoring multipart emails - used htmlparser to extract text from html mail part - added test to check if html special chars appear in transformation result 102375: MNT-13531: EMLTransformer ignoring multipart emails - use plain/text representation if present, prior to html representation on multipart/alternative parts git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@103625 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -19,11 +19,11 @@
|
||||
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Properties;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import javax.mail.MessagingException;
|
||||
import javax.mail.Multipart;
|
||||
@@ -32,9 +32,11 @@ import javax.mail.Session;
|
||||
import javax.mail.internet.MimeMessage;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.repo.content.filestore.FileContentWriter;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
import org.alfresco.util.TempFileProvider;
|
||||
|
||||
|
||||
/**
|
||||
@@ -48,9 +50,8 @@ import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
public class EMLTransformer extends AbstractContentTransformer2
|
||||
|
||||
{
|
||||
private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");
|
||||
private static final String BR_TAG_PATTERN = "<[bB][rR].?\\/?>";
|
||||
private static final String NEW_LINE_PATTERN = "\n";
|
||||
private static final String CHARSET = "charset";
|
||||
private static final String DEFAULT_ENCODING = "UTF-8";
|
||||
|
||||
@Override
|
||||
public boolean isTransformableMimetype(String sourceMimetype, String targetMimetype, TransformationOptions options)
|
||||
@@ -144,7 +145,6 @@ public class EMLTransformer extends AbstractContentTransformer2
|
||||
|
||||
/**
|
||||
* Finds the suitable part from a multipart/alternative and appends its text content to StringBuilder sb
|
||||
* Html parts have higher priority than text parts
|
||||
*
|
||||
* @param multipart
|
||||
* @param sb
|
||||
@@ -160,10 +160,10 @@ public class EMLTransformer extends AbstractContentTransformer2
|
||||
if (part.getContentType().contains(MimetypeMap.MIMETYPE_TEXT_PLAIN))
|
||||
{
|
||||
partToUse = part;
|
||||
break;
|
||||
}
|
||||
else if (part.getContentType().contains(MimetypeMap.MIMETYPE_HTML)){
|
||||
partToUse = part;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (partToUse != null)
|
||||
@@ -186,7 +186,7 @@ public class EMLTransformer extends AbstractContentTransformer2
|
||||
boolean isAttachment = Part.ATTACHMENT.equalsIgnoreCase(part.getDisposition());
|
||||
if (isAttachment)
|
||||
{
|
||||
return;
|
||||
return;
|
||||
}
|
||||
if (part.getContentType().contains(MimetypeMap.MIMETYPE_TEXT_PLAIN))
|
||||
{
|
||||
@@ -194,14 +194,37 @@ public class EMLTransformer extends AbstractContentTransformer2
|
||||
}
|
||||
else if (part.getContentType().contains(MimetypeMap.MIMETYPE_HTML))
|
||||
{
|
||||
String content = part.getContent().toString();
|
||||
//replace line breaks with new lines
|
||||
content = content.replaceAll(BR_TAG_PATTERN, NEW_LINE_PATTERN);
|
||||
Matcher tagMatcher = TAG_PATTERN.matcher(content);
|
||||
//remove html tags
|
||||
content = tagMatcher.replaceAll("");
|
||||
sb.append(content);
|
||||
String mailPartContent = part.getContent().toString();
|
||||
|
||||
//create a temporary html file with same mail part content and encoding
|
||||
File tempHtmlFile = TempFileProvider.createTempFile("EMLTransformer_", ".html");
|
||||
ContentWriter contentWriter = new FileContentWriter(tempHtmlFile);
|
||||
contentWriter.setEncoding(getMailPartContentEncoding(part));
|
||||
contentWriter.setMimetype(MimetypeMap.MIMETYPE_HTML);
|
||||
contentWriter.putContent(mailPartContent);
|
||||
|
||||
//transform html file's content to plain text
|
||||
EncodingAwareStringBean extractor = new EncodingAwareStringBean();
|
||||
extractor.setCollapse(false);
|
||||
extractor.setLinks(false);
|
||||
extractor.setReplaceNonBreakingSpaces(false);
|
||||
extractor.setURL(tempHtmlFile, contentWriter.getEncoding());
|
||||
sb.append(extractor.getStrings());
|
||||
|
||||
tempHtmlFile.delete();
|
||||
}
|
||||
}
|
||||
|
||||
private String getMailPartContentEncoding(Part part) throws MessagingException
|
||||
{
|
||||
String encoding = DEFAULT_ENCODING;
|
||||
String contentType = part.getContentType();
|
||||
int startIndex = contentType.indexOf(CHARSET);
|
||||
if (startIndex > 0)
|
||||
{
|
||||
encoding = contentType.substring(startIndex + CHARSET.length() + 1).replaceAll("\"", "");
|
||||
}
|
||||
return encoding;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -0,0 +1,69 @@
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.io.File;
|
||||
import java.net.URLConnection;
|
||||
|
||||
import org.htmlparser.Parser;
|
||||
import org.htmlparser.beans.StringBean;
|
||||
import org.htmlparser.util.ParserException;
|
||||
|
||||
/**
|
||||
* A version of {@link StringBean} which allows control of the
|
||||
* encoding in the underlying HTML Parser.
|
||||
* Unfortunately, StringBean doesn't allow easy over-riding of
|
||||
* this, so we have to duplicate some code to control this.
|
||||
* This allows us to correctly handle HTML files where the encoding
|
||||
* is specified against the content property (rather than in the
|
||||
* HTML Head Meta), see ALF-10466 for details.
|
||||
*/
|
||||
class EncodingAwareStringBean extends StringBean
|
||||
{
|
||||
private static final long serialVersionUID = -9033414360428669553L;
|
||||
|
||||
/**
|
||||
* Sets the File to extract strings from, and the encoding
|
||||
* it's in (if known to Alfresco)
|
||||
*
|
||||
* @param file The File that text should be fetched from.
|
||||
* @param encoding The encoding of the input
|
||||
*/
|
||||
public void setURL(File file, String encoding)
|
||||
{
|
||||
String previousURL = getURL();
|
||||
String newURL = file.getAbsolutePath();
|
||||
|
||||
if ( (previousURL == null) || (!newURL.equals(previousURL)) )
|
||||
{
|
||||
try
|
||||
{
|
||||
URLConnection conn = getConnection();
|
||||
|
||||
if (null == mParser)
|
||||
{
|
||||
mParser = new Parser(newURL);
|
||||
}
|
||||
else
|
||||
{
|
||||
mParser.setURL(newURL);
|
||||
}
|
||||
|
||||
if (encoding != null)
|
||||
{
|
||||
mParser.setEncoding(encoding);
|
||||
}
|
||||
|
||||
mPropertySupport.firePropertyChange(PROP_URL_PROPERTY, previousURL, getURL());
|
||||
mPropertySupport.firePropertyChange(PROP_CONNECTION_PROPERTY, conn, mParser.getConnection());
|
||||
setStrings();
|
||||
}
|
||||
catch (ParserException pe)
|
||||
{
|
||||
updateStrings(pe.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String getEncoding(){
|
||||
return mParser.getEncoding();
|
||||
}
|
||||
}
|
@@ -19,8 +19,6 @@
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.io.File;
|
||||
import java.net.URLConnection;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
@@ -29,9 +27,7 @@ import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
import org.alfresco.util.TempFileProvider;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.htmlparser.Parser;
|
||||
import org.htmlparser.beans.StringBean;
|
||||
import org.htmlparser.util.ParserException;
|
||||
|
||||
|
||||
/**
|
||||
* Content transformer which wraps the HTML Parser library for
|
||||
@@ -110,61 +106,4 @@ public class HtmlParserContentTransformer extends AbstractContentTransformer2
|
||||
// Tidy up
|
||||
htmlFile.delete();
|
||||
}
|
||||
|
||||
/**
|
||||
* A version of {@link StringBean} which allows control of the
|
||||
* encoding in the underlying HTML Parser.
|
||||
* Unfortunately, StringBean doesn't allow easy over-riding of
|
||||
* this, so we have to duplicate some code to control this.
|
||||
* This allows us to correctly handle HTML files where the encoding
|
||||
* is specified against the content property (rather than in the
|
||||
* HTML Head Meta), see ALF-10466 for details.
|
||||
*/
|
||||
class EncodingAwareStringBean extends StringBean
|
||||
{
|
||||
private static final long serialVersionUID = -9033414360428669553L;
|
||||
|
||||
/**
|
||||
* Sets the File to extract strings from, and the encoding
|
||||
* it's in (if known to Alfresco)
|
||||
*
|
||||
* @param file The File that text should be fetched from.
|
||||
* @param encoding The encoding of the input
|
||||
*/
|
||||
public void setURL(File file, String encoding)
|
||||
{
|
||||
String previousURL = getURL();
|
||||
String newURL = file.getAbsolutePath();
|
||||
|
||||
if ( (previousURL == null) || (!newURL.equals(previousURL)) )
|
||||
{
|
||||
try
|
||||
{
|
||||
URLConnection conn = getConnection();
|
||||
|
||||
if (null == mParser)
|
||||
{
|
||||
mParser = new Parser(newURL);
|
||||
}
|
||||
else
|
||||
{
|
||||
mParser.setURL(newURL);
|
||||
}
|
||||
|
||||
if (encoding != null)
|
||||
{
|
||||
mParser.setEncoding(encoding);
|
||||
}
|
||||
|
||||
mPropertySupport.firePropertyChange(PROP_URL_PROPERTY, previousURL, getURL());
|
||||
mPropertySupport.firePropertyChange(PROP_CONNECTION_PROPERTY, conn, mParser.getConnection());
|
||||
setStrings();
|
||||
}
|
||||
catch (ParserException pe)
|
||||
{
|
||||
updateStrings(pe.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user