diff --git a/config/alfresco/swf-transform-context.xml b/config/alfresco/swf-transform-context.xml index 7974682e35..47557ca148 100644 --- a/config/alfresco/swf-transform-context.xml +++ b/config/alfresco/swf-transform-context.xml @@ -198,5 +198,24 @@ + + + + + + + + + + + + + text/plain + application/pdf + + + diff --git a/source/java/org/alfresco/repo/content/transform/EMLTransformer.java b/source/java/org/alfresco/repo/content/transform/EMLTransformer.java index d10ae47912..8866123dd8 100644 --- a/source/java/org/alfresco/repo/content/transform/EMLTransformer.java +++ b/source/java/org/alfresco/repo/content/transform/EMLTransformer.java @@ -16,10 +16,11 @@ * You should have received a copy of the GNU Lesser General Public License * along with Alfresco. If not, see . */ + package org.alfresco.repo.content.transform; import java.io.IOException; -import java.io.InputStream; +import java.nio.charset.Charset; import java.util.Properties; import javax.mail.MessagingException; @@ -32,24 +33,25 @@ import org.alfresco.repo.content.MimetypeMap; import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.TransformationOptions; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.txt.Icu4jEncodingDetector; /** - * Uses javax.mail.MimeMessage to generate plain text versions of - * RFC822 email messages. - * Searches for all text content parts, and returns them. Any - * attachments are ignored. - * - * TIKA Note - could be replaced with the Tika email parser. Would - * require a recursing parser to be specified, but not the full - * Auto one (we don't want attachments), just one containing - * text and html related parsers. + * Uses javax.mail.MimeMessage to generate plain text versions of RFC822 email + * messages. Searches for all text content parts, and returns them. Any + * attachments are ignored. 
TIKA Note - could be replaced with the Tika email + * parser. Would require a recursing parser to be specified, but not the full + * Auto one (we don't want attachments), just one containing text and html + * related parsers. */ public class EMLTransformer extends AbstractContentTransformer2 { @Override public boolean isTransformableMimetype(String sourceMimetype, String targetMimetype, TransformationOptions options) { - if (!MimetypeMap.MIMETYPE_RFC822.equals(sourceMimetype) || !MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype)) + if (!MimetypeMap.MIMETYPE_RFC822.equals(sourceMimetype) + || !MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype)) { // only support RFC822 -> TEXT return false; @@ -61,15 +63,24 @@ public class EMLTransformer extends AbstractContentTransformer2 } @Override - protected void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options) throws Exception + protected void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options) + throws Exception { - InputStream is = null; + TikaInputStream tikaInputStream = null; try { - is = reader.getContentInputStream(); + // wrap the given stream to a TikaInputStream instance + tikaInputStream = TikaInputStream.get(reader.getContentInputStream()); - MimeMessage mimeMessage = new MimeMessage(Session.getDefaultInstance(new Properties()), is); + final Icu4jEncodingDetector encodingDetector = new Icu4jEncodingDetector(); + final Charset charset = encodingDetector.detect(tikaInputStream, new Metadata()); + MimeMessage mimeMessage = new MimeMessage(Session.getDefaultInstance(new Properties()), tikaInputStream); + if (charset != null) + { + mimeMessage.setHeader("Content-Type", "text/plain; charset=" + charset.name()); + mimeMessage.setHeader("Content-Transfer-Encoding", "quoted-printable"); + } final StringBuilder sb = new StringBuilder(); Object content = mimeMessage.getContent(); if (content instanceof Multipart) @@ -80,16 +91,16 @@ 
public class EMLTransformer extends AbstractContentTransformer2 { sb.append(content.toString()); } - writer.putContent(sb.toString()); } finally { - if (is != null) + if (tikaInputStream != null) { try { - is.close(); + // it closes any other resources associated with it + tikaInputStream.close(); } catch (IOException e) { diff --git a/source/java/org/alfresco/repo/content/transform/EMLTransformerTest.java b/source/java/org/alfresco/repo/content/transform/EMLTransformerTest.java new file mode 100644 index 0000000000..2eafdb7b67 --- /dev/null +++ b/source/java/org/alfresco/repo/content/transform/EMLTransformerTest.java @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2005-2012 Alfresco Software Limited. + * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see . 
+ */
+
+package org.alfresco.repo.content.transform;
+
+import java.io.File;
+
+import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.repo.content.filestore.FileContentReader;
+import org.alfresco.repo.content.filestore.FileContentWriter;
+import org.alfresco.service.cmr.repository.ContentReader;
+import org.alfresco.service.cmr.repository.ContentWriter;
+import org.alfresco.service.cmr.repository.TransformationOptions;
+import org.alfresco.util.TempFileProvider;
+
+/**
+ * Tests {@link org.alfresco.repo.content.transform.EMLTransformer} (RFC822 email to plain text).
+ *
+ * @author Jamal Kaabi-Mofrad
+ */
+public class EMLTransformerTest extends AbstractContentTransformerTest
+{
+    private static final String QUICK_EML_CONTENT = "Gym class featuring a brown fox and lazy dog";
+
+    private static final String QUICK_EML_CONTENT_SPANISH_UNICODE = "El r\u00E1pido zorro marr\u00F3n salta sobre el perro perezoso";
+
+    private EMLTransformer transformer;
+
+    @Override
+    public void setUp() throws Exception
+    {
+        super.setUp();
+
+        transformer = new EMLTransformer();
+        transformer.setMimetypeService(mimetypeService);
+        transformer.setTransformerDebug(transformerDebug);
+    }
+
+    @Override
+    protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
+    {
+        return transformer;
+    }
+
+    public void testIsTransformable() throws Exception
+    {
+        assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, -1, MimetypeMap.MIMETYPE_RFC822,
+                new TransformationOptions()));
+        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_RFC822, -1, MimetypeMap.MIMETYPE_TEXT_PLAIN,
+                new TransformationOptions()));
+    }
+
+    /**
+     * Test transforming a valid eml file to text: the output must contain the body line of the quick test message
+     */
+    public void testRFC822ToText() throws Exception
+    {
+        File emlSourceFile = loadQuickTestFile("eml");
+        File txtTargetFile = TempFileProvider.createTempFile("test", ".txt");
+        ContentReader reader = new FileContentReader(emlSourceFile);
+        reader.setMimetype(MimetypeMap.MIMETYPE_RFC822);
+        ContentWriter writer = new FileContentWriter(txtTargetFile);
+        writer.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
+
+        transformer.transform(reader, writer);
+
+        ContentReader reader2 = new FileContentReader(txtTargetFile);
+        reader2.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
+        assertTrue(reader2.getContentString().contains(QUICK_EML_CONTENT));
+    }
+
+    /**
+     * Test transforming a non-ascii eml file to text; the expected text is compared as a Java string, never re-decoded with the platform default charset
+     */
+    public void testNonAsciiRFC822ToText() throws Exception
+    {
+        File emlSourceFile = loadQuickTestFile("spanish.eml");
+        File txtTargetFile = TempFileProvider.createTempFile("test2", ".txt");
+        ContentReader reader = new FileContentReader(emlSourceFile);
+        reader.setMimetype(MimetypeMap.MIMETYPE_RFC822);
+        ContentWriter writer = new FileContentWriter(txtTargetFile);
+        writer.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
+
+        transformer.transform(reader, writer);
+
+        ContentReader reader2 = new FileContentReader(txtTargetFile);
+        reader2.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
+        assertTrue(reader2.getContentString().contains(QUICK_EML_CONTENT_SPANISH_UNICODE));
+    }
+}
diff --git a/source/test-resources/quick/quick.eml b/source/test-resources/quick/quick.eml
index 8a25c8e33e..934d80d884 100644
--- a/source/test-resources/quick/quick.eml
+++ b/source/test-resources/quick/quick.eml
@@ -6,3 +6,5 @@ Date: Fri, 4 Jun 2004 14:23:22 +0200
 Subject: The quick brown fox jumps over the lazy dog
 
 Gym class featuring a brown fox and lazy dog
+
+The quick brown fox jumps over the lazy dog
diff --git a/source/test-resources/quick/quick.spanish.eml b/source/test-resources/quick/quick.spanish.eml
new file mode 100644
index 0000000000..0cc1a72707
--- /dev/null
+++ b/source/test-resources/quick/quick.spanish.eml
@@ -0,0 +1,12 @@
+MIME-Version: 1.0
+Received: by 10.000.0.000 with HTTP; Thu, 16 Aug 2012 08:13:29 -0700 (PDT)
+Date: Thu, 16 Aug 2012 16:13:29 +0100
+Delivered-To: jane.doe@alfresco.com
+Message-ID: 
+Subject: quick test
+From: 
john.doe@alfresco.com> +To: jane.doe@alfresco.com> +Content-Type: multipart/alternative; + +El rápido zorro marrón salta sobre el perro perezoso +