Merged HEAD-BUG-FIX (5.1/Cloud) to HEAD (5.1/Cloud)

103441: Merged 5.0.N (5.0.2) to HEAD-BUG-FIX (5.1/Cloud) 103246: Merged V4.2-BUG-FIX (4.2.5) to 5.0.N (5.0.2) 103035: Merged NESS/4.2.N-2015_03_12 (4.2.5) to V4.2-BUG-FIX (4.2.5) 102240: MNT-13531: EMLTransformer ignoring multipart emails - used htmlparser to extract text from html mail part - added test to check if html special chars appear in transformation result 102375: MNT-13531: EMLTransformer ignoring multipart emails - use plain/text representation if present, prior to html representation on multipart/alternative parts git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@103625 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
2025-10-15 15:02:20 +00:00 · 2015-05-02 07:44:41 +00:00
parent c9d5efcef6
commit fba0e07a62
5 changed files with 160 additions and 79 deletions
--- a/source/java/org/alfresco/repo/content/transform/EMLTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/EMLTransformer.java
@@ -19,11 +19,11 @@

 package org.alfresco.repo.content.transform;

+import java.io.File;
+
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Properties;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;

 import javax.mail.MessagingException;
 import javax.mail.Multipart;
@@ -32,9 +32,11 @@ import javax.mail.Session;
 import javax.mail.internet.MimeMessage;

 import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.repo.content.filestore.FileContentWriter;
 import org.alfresco.service.cmr.repository.ContentReader;
 import org.alfresco.service.cmr.repository.ContentWriter;
 import org.alfresco.service.cmr.repository.TransformationOptions;
+import org.alfresco.util.TempFileProvider;


 /**
@@ -48,9 +50,8 @@ import org.alfresco.service.cmr.repository.TransformationOptions;
 public class EMLTransformer extends AbstractContentTransformer2

 {
-    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");
-    private static final String BR_TAG_PATTERN = "<[bB][rR].?\\/?>";
-    private static final String NEW_LINE_PATTERN = "\n";
+    private static final String CHARSET = "charset";
+    private static final String DEFAULT_ENCODING = "UTF-8";

    @Override
    public boolean isTransformableMimetype(String sourceMimetype, String targetMimetype, TransformationOptions options)
@@ -144,7 +145,6 @@ public class EMLTransformer extends AbstractContentTransformer2
    
    /**
     * Finds the suitable part from a multipart/alternative and appends its text content to StringBuilder sb
-     * Html parts have higher priority than text parts
     * 
     * @param multipart
     * @param sb
@@ -160,10 +160,10 @@ public class EMLTransformer extends AbstractContentTransformer2
            if (part.getContentType().contains(MimetypeMap.MIMETYPE_TEXT_PLAIN))
            {
                partToUse = part;
+                break;
            }
            else if  (part.getContentType().contains(MimetypeMap.MIMETYPE_HTML)){
                partToUse = part;
-                break;
            }
        }
        if (partToUse != null)
@@ -186,7 +186,7 @@ public class EMLTransformer extends AbstractContentTransformer2
        boolean isAttachment = Part.ATTACHMENT.equalsIgnoreCase(part.getDisposition());
        if (isAttachment)
        {
-        	return;
+            return;
        }
        if (part.getContentType().contains(MimetypeMap.MIMETYPE_TEXT_PLAIN))
        {
@@ -194,14 +194,37 @@ public class EMLTransformer extends AbstractContentTransformer2
        }
        else if (part.getContentType().contains(MimetypeMap.MIMETYPE_HTML))
        {
-            String content = part.getContent().toString();
-            //replace line breaks with new lines
-            content = content.replaceAll(BR_TAG_PATTERN, NEW_LINE_PATTERN);
-            Matcher tagMatcher = TAG_PATTERN.matcher(content);
-            //remove html tags
-            content = tagMatcher.replaceAll("");
-            sb.append(content);
+            String mailPartContent = part.getContent().toString();
+            
+            //create a temporary html file with same mail part content and encoding
+            File tempHtmlFile = TempFileProvider.createTempFile("EMLTransformer_", ".html");
+            ContentWriter contentWriter = new FileContentWriter(tempHtmlFile);
+            contentWriter.setEncoding(getMailPartContentEncoding(part));
+            contentWriter.setMimetype(MimetypeMap.MIMETYPE_HTML);
+            contentWriter.putContent(mailPartContent);
+            
+            //transform html file's content to plain text
+            EncodingAwareStringBean extractor = new EncodingAwareStringBean();
+            extractor.setCollapse(false);
+            extractor.setLinks(false);
+            extractor.setReplaceNonBreakingSpaces(false);
+            extractor.setURL(tempHtmlFile, contentWriter.getEncoding());
+            sb.append(extractor.getStrings());
+            
+            tempHtmlFile.delete();
        }
    }
+    
+    /**
+     * Determines the character encoding declared by a mail part.
+     * <p>
+     * The <code>charset</code> parameter is read from the part's Content-Type value with the
+     * JavaMail header parser rather than by substring arithmetic, so the parameter name is
+     * matched case-insensitively, surrounding quotes are removed and parameters that follow it
+     * (e.g. <code>text/html; charset="utf-8"; format=flowed</code>) are not mistaken for part
+     * of the encoding name.
+     * 
+     * @param part the mail part to inspect
+     * @return the declared charset, or the default encoding (UTF-8) when the content type is
+     *         missing, cannot be parsed or declares no charset
+     * @throws MessagingException if the content type cannot be read from the part
+     */
+    private String getMailPartContentEncoding(Part part) throws MessagingException
+    {
+        String contentType = part.getContentType();
+        if (contentType == null)
+        {
+            return DEFAULT_ENCODING;
+        }
+        try
+        {
+            String charset = new javax.mail.internet.ContentType(contentType).getParameter(CHARSET);
+            if (charset != null && charset.trim().length() > 0)
+            {
+                return charset.trim();
+            }
+        }
+        catch (javax.mail.internet.ParseException e)
+        {
+            // Malformed header: use the default encoding instead of failing the whole transformation
+        }
+        return DEFAULT_ENCODING;
+    }

 }
--- a/source/java/org/alfresco/repo/content/transform/EncodingAwareStringBean.java
+++ b/source/java/org/alfresco/repo/content/transform/EncodingAwareStringBean.java
@@ -0,0 +1,69 @@
+package org.alfresco.repo.content.transform;
+
+import java.io.File;
+import java.net.URLConnection;
+
+import org.htmlparser.Parser;
+import org.htmlparser.beans.StringBean;
+import org.htmlparser.util.ParserException;
+
+/**
+ * {@link StringBean} variant whose {@link #setURL(File, String)} lets the
+ *  caller choose the encoding used by the underlying HTML Parser.
+ * StringBean offers no convenient hook for this, so part of its
+ *  URL-setting logic is repeated here.
+ * This is needed for HTML whose encoding is recorded against the content
+ *  property instead of in the HTML Head Meta; see ALF-10466 for details.
+ */
+class EncodingAwareStringBean extends StringBean
+{
+    private static final long serialVersionUID = -9033414360428669553L;
+
+    /**
+     * Points the bean at a file and, when an encoding is supplied, applies
+     *  it to the parser before the strings are set.
+     * Nothing happens if the file's absolute path equals the current URL.
+     *   
+     * @param file The File that text should be fetched from.
+     * @param encoding The encoding of the input; when null the parser's encoding is not set here
+     */
+    public void setURL(File file, String encoding)
+    {
+        final String oldUrl = getURL();
+        final String targetUrl = file.getAbsolutePath();
+
+        // String.equals(null) is false, so an unset URL always falls through to the update
+        if (targetUrl.equals(oldUrl))
+        {
+            return;
+        }
+
+        try
+        {
+            final URLConnection oldConnection = getConnection();
+
+            if (mParser != null)
+            {
+                mParser.setURL(targetUrl);
+            }
+            else
+            {
+                mParser = new Parser(targetUrl);
+            }
+
+            if (encoding != null)
+            {
+                mParser.setEncoding(encoding);
+            }
+
+            mPropertySupport.firePropertyChange(PROP_URL_PROPERTY, oldUrl, getURL());
+            mPropertySupport.firePropertyChange(PROP_CONNECTION_PROPERTY, oldConnection, mParser.getConnection());
+            setStrings();
+        }
+        catch (ParserException parserFailure)
+        {
+            // The exception text is handed to updateStrings instead of being rethrown
+            updateStrings(parserFailure.toString());
+        }
+    }
+
+    /**
+     * @return the encoding reported by the underlying parser
+     */
+    public String getEncoding()
+    {
+        return mParser.getEncoding();
+    }
+}
--- a/source/java/org/alfresco/repo/content/transform/HtmlParserContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/HtmlParserContentTransformer.java
@@ -19,8 +19,6 @@
 package org.alfresco.repo.content.transform;

 import java.io.File;
-import java.net.URLConnection;
-import java.util.Arrays;

 import org.alfresco.repo.content.MimetypeMap;
 import org.alfresco.service.cmr.repository.ContentReader;
@@ -29,9 +27,7 @@ import org.alfresco.service.cmr.repository.TransformationOptions;
 import org.alfresco.util.TempFileProvider;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.htmlparser.Parser;
-import org.htmlparser.beans.StringBean;
-import org.htmlparser.util.ParserException;
+

 /**
 * Content transformer which wraps the HTML Parser library for 
@@ -110,61 +106,4 @@ public class HtmlParserContentTransformer extends AbstractContentTransformer2
        // Tidy up
        htmlFile.delete();
    }
-    
-    /**
-     * A version of {@link StringBean} which allows control of the
-     *  encoding in the underlying HTML Parser.
-     * Unfortunately, StringBean doesn't allow easy over-riding of
-     *  this, so we have to duplicate some code to control this.
-     * This allows us to correctly handle HTML files where the encoding
-     *  is specified against the content property (rather than in the 
-     *  HTML Head Meta), see ALF-10466 for details.
-     */
-    class EncodingAwareStringBean extends StringBean
-    {
-        private static final long serialVersionUID = -9033414360428669553L;
-
-        /**
-         * Sets the File to extract strings from, and the encoding
-         *  it's in (if known to Alfresco)
-         *   
-         * @param file The File that text should be fetched from.
-         * @param encoding The encoding of the input
-         */
-        public void setURL(File file, String encoding)
-        {
-            String previousURL = getURL();
-            String newURL = file.getAbsolutePath();
-            
-            if ( (previousURL == null) || (!newURL.equals(previousURL)) )
-            {
-                try
-                {
-                    URLConnection conn = getConnection();
-
-                    if (null == mParser)
-                    {
-                        mParser = new Parser(newURL);
-                    }
-                    else
-                    {
-                        mParser.setURL(newURL);
-                    }
-                    
-                    if (encoding != null)
-                    {
-                        mParser.setEncoding(encoding);
-                    }
-                    
-                    mPropertySupport.firePropertyChange(PROP_URL_PROPERTY, previousURL, getURL());
-                    mPropertySupport.firePropertyChange(PROP_CONNECTION_PROPERTY, conn, mParser.getConnection());
-                    setStrings();
-                }
-                catch (ParserException pe)
-                {
-                    updateStrings(pe.toString());
-                }
-            }
-        }
-    }
 }