Outlook email messages (in OLE2 .msg format) now converted to text for full-text indexing.

JUnit test for new transformer class.
Added new test to ContentTestSuite.

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@5951 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Kevin Roast 2007-06-14 10:51:51 +00:00
parent ede9345a5b
commit 04a78f17d2
11 changed files with 316 additions and 5 deletions

View File

@ -242,6 +242,10 @@
</list>
</property>
</bean>
<bean id="transformer.OutlookMsg"
class="org.alfresco.repo.content.transform.MailContentTransformer"
parent="baseContentTransformer" />
<!--
<bean id="transformer.JMagick" class="org.alfresco.repo.content.transform.magick.JMagickContentTransformer" init-method="init" />

View File

@ -41,6 +41,7 @@ import org.alfresco.repo.content.transform.ComplexContentTransformerTest;
import org.alfresco.repo.content.transform.ContentTransformerRegistryTest;
import org.alfresco.repo.content.transform.HtmlParserContentTransformerTest;
import org.alfresco.repo.content.transform.OpenOfficeContentTransformerTest;
import org.alfresco.repo.content.transform.MailContentTransformerTest;
import org.alfresco.repo.content.transform.PdfBoxContentTransformerTest;
import org.alfresco.repo.content.transform.PoiHssfContentTransformerTest;
import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTest;
@ -83,6 +84,7 @@ public class ContentTestSuite extends TestSuite
suite.addTestSuite(RuntimeExecutableContentTransformerTest.class);
suite.addTestSuite(StringExtractingContentTransformerTest.class);
suite.addTestSuite(TextMiningContentTransformerTest.class);
suite.addTestSuite(MailContentTransformerTest.class);
suite.addTestSuite(ContentDataTest.class);
suite.addTestSuite(MimetypeMapTest.class);
suite.addTestSuite(RoutingContentServiceTest.class);

View File

@ -72,6 +72,7 @@ public class MimetypeMap implements MimetypeService
public static final String MIMETYPE_OPENSEARCH_DESCRIPTION = "application/opensearchdescription+xml";
public static final String MIMETYPE_ATOM = "application/atom+xml";
public static final String MIMETYPE_RSS = "application/rss+xml";
public static final String MIMETYPE_RFC822 = "message/rfc822";
// Open Document
public static final String MIMETYPE_OPENDOCUMENT_TEXT = "application/vnd.oasis.opendocument.text";
public static final String MIMETYPE_OPENDOCUMENT_TEXT_TEMPLATE = "application/vnd.oasis.opendocument.text-template";

View File

@ -716,7 +716,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
* strings are
* <ul>
* <li><b>Null:</b> Removed</li>
* <li><b>Emtpty String:</b> Passed to the {@link OverwritePolicy}</li>
* <li><b>Empty String:</b> Passed to the {@link OverwritePolicy}</li>
* <li><b>Non Serializable:</b> Converted to String or fails if that is not possible</li>
* </ul>
* <p>

View File

@ -102,11 +102,12 @@ public class MailMetadataExtracter extends AbstractMetadataExtracter
catch (IOException err)
{
// probably not an Outlook format MSG - ignore for now
logger.warn("Unable to extract meta-data from message: " + err.getMessage());
if (logger.isWarnEnabled())
logger.warn("Unable to extract meta-data from message: " + err.getMessage());
}
// store multi-value extracted property
if (receipientEmails.get().size() != 0)
if (this.receipientEmails.get().size() != 0)
{
destination.put(ContentModel.PROP_ADDRESSEES, (Serializable)receipientEmails.get());
}

View File

@ -56,7 +56,7 @@ import org.springframework.context.ApplicationContext;
*/
public abstract class AbstractContentTransformerTest extends TestCase
{
private static String QUICK_CONTENT = "The quick brown fox jumps over the lazy dog";
protected static String QUICK_CONTENT = "The quick brown fox jumps over the lazy dog";
private static String[] QUICK_WORDS = new String[] {
"quick", "brown", "fox", "jumps", "lazy", "dog"};

View File

@ -0,0 +1,196 @@
/*
* Copyright (C) 2005-2007 Alfresco Software Limited.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* As a special exception to the terms and conditions of version 2.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* and Open Source Software ("FLOSS") applications as described in Alfresco's
* FLOSS exception. You should have recieved a copy of the text describing
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing
*/
package org.alfresco.repo.content.transform;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
/**
* Outlook email msg format to-text transformer.
*
* @author Kevin Roast
*/
public class MailContentTransformer extends AbstractContentTransformer
{
private static final Log logger = LogFactory.getLog(MailContentTransformer.class);
private static final String STREAM_PREFIX = "__substg1.0_";
private static final int STREAM_PREFIX_LENGTH = STREAM_PREFIX.length();
/**
* Only support MSG to text
*/
public double getReliability(String sourceMimetype, String targetMimetype)
{
if (!MimetypeMap.MIMETYPE_RFC822.equals(sourceMimetype) ||
!MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
{
// only support MSG -> TEXT
return 0.0;
}
else
{
return 1.0;
}
}
/**
* @see org.alfresco.repo.content.transform.AbstractContentTransformer#transformInternal(org.alfresco.service.cmr.repository.ContentReader, org.alfresco.service.cmr.repository.ContentWriter, java.util.Map)
*/
@Override
protected void transformInternal(final ContentReader reader, final ContentWriter writer, Map<String, Object> options)
throws Exception
{
POIFSReaderListener readerListener = new POIFSReaderListener()
{
public void processPOIFSReaderEvent(final POIFSReaderEvent event)
{
try
{
if (event.getName().startsWith(STREAM_PREFIX))
{
StreamHandler handler = new StreamHandler(event.getName(), event.getStream());
String result = handler.process();
if (result != null)
{
writer.putContent(result);
}
}
}
catch (Exception ex)
{
throw new ContentIOException("Property set stream: " + event.getPath() + event.getName(), ex);
}
}
};
InputStream is = null;
try
{
is = reader.getContentInputStream();
POIFSReader poiFSReader = new POIFSReader();
poiFSReader.registerListener(readerListener);
try
{
poiFSReader.read(is);
}
catch (IOException err)
{
// probably not an Outlook format MSG - ignore for now
if (logger.isWarnEnabled())
logger.warn("Unable to extract meta-data from message: " + err.getMessage());
}
}
finally
{
if (is != null)
{
try { is.close(); } catch (IOException e) {}
}
}
}
private static final String ENCODING_TEXT = "001E";
private static final String ENCODING_BINARY = "0102";
private static final String ENCODING_UNICODE = "001F";
private static final String SUBSTG_MESSAGEBODY = "1000";
/**
* Class to handle stream types. Can process and extract specific streams.
*/
private class StreamHandler
{
StreamHandler(String name, DocumentInputStream stream)
{
this.type = name.substring(STREAM_PREFIX_LENGTH, STREAM_PREFIX_LENGTH + 4);
this.encoding = name.substring(STREAM_PREFIX_LENGTH + 4, STREAM_PREFIX_LENGTH + 8);
this.stream = stream;
}
String process()
throws IOException
{
String result = null;
if (SUBSTG_MESSAGEBODY.equals(this.type))
{
result = extractText(this.encoding);
}
return result;
}
/**
* Extract the text from the stream based on the encoding
*
* @return String
*
* @throws IOException
*/
private String extractText(String encoding)
throws IOException
{
byte[] data = new byte[this.stream.available()];
this.stream.read(data);
if (encoding.equals(ENCODING_TEXT) || encoding.equals(ENCODING_BINARY))
{
return new String(data);
}
else if (encoding.equals(ENCODING_UNICODE))
{
// convert double-byte encoding to single byte for String conversion
byte[] b = new byte[data.length >> 1];
for (int i=0; i<b.length; i++)
{
b[i] = data[i << 1];
}
return new String(b);
}
else
{
return new String(data);
}
}
private String type;
private String encoding;
private DocumentInputStream stream;
}
}

View File

@ -0,0 +1,107 @@
/*
* Copyright (C) 2005-2007 Alfresco Software Limited.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* As a special exception to the terms and conditions of version 2.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* and Open Source Software ("FLOSS") applications as described in Alfresco's
* FLOSS exception. You should have recieved a copy of the text describing
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing"
*/
package org.alfresco.repo.content.transform;
import java.io.File;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.filestore.FileContentReader;
import org.alfresco.repo.content.filestore.FileContentWriter;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.util.TempFileProvider;
/**
* @see org.alfresco.repo.content.transform.MailContentTransformer
*
* @author Kevin Roast
*/
public class MailContentTransformerTest extends AbstractContentTransformerTest
{
private ContentTransformer transformer;
@Override
public void setUp() throws Exception
{
super.setUp();
transformer = new MailContentTransformer();
}
/**
* @return Returns the same transformer regardless - it is allowed
*/
protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
{
return transformer;
}
public void testReliability() throws Exception
{
double reliability = 0.0;
reliability = transformer.getReliability(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_RFC822);
assertEquals("Mimetype should not be supported", 0.0, reliability);
reliability = transformer.getReliability(MimetypeMap.MIMETYPE_RFC822, MimetypeMap.MIMETYPE_TEXT_PLAIN);
assertEquals("Mimetype should be supported", 1.0, reliability);
}
/**
* Test transforming a valid msg file to text
*/
public void testMsgToText() throws Exception
{
File msgSourceFile = loadQuickTestFile("msg");
File txtTargetFile = TempFileProvider.createTempFile(getName() + "-target-1", ".txt");
ContentReader reader = new FileContentReader(msgSourceFile);
reader.setMimetype(MimetypeMap.MIMETYPE_RFC822);
ContentWriter writer = new FileContentWriter(txtTargetFile);
writer.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
transformer.transform(reader, writer);
ContentReader reader2 = new FileContentReader(txtTargetFile);
reader2.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
assertTrue(reader2.getContentString().contains(QUICK_CONTENT));
}
/**
* Test transforming a valid unicode msg file to text
*/
public void testUnicodeMsgToText() throws Exception
{
File msgSourceFile = loadQuickTestFile("unicode.msg");
File txtTargetFile = TempFileProvider.createTempFile(getName() + "-target-2", ".txt");
ContentReader reader = new FileContentReader(msgSourceFile);
reader.setMimetype(MimetypeMap.MIMETYPE_RFC822);
ContentWriter writer = new FileContentWriter(txtTargetFile);
writer.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
transformer.transform(reader, writer);
ContentReader reader2 = new FileContentReader(txtTargetFile);
reader2.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
assertTrue(reader2.getContentString().contains(QUICK_CONTENT));
}
}

View File

@ -49,7 +49,7 @@ public class PdfBoxContentTransformer extends AbstractContentTransformer
// TODO: Expand PDFBox usage to convert images to PDF and investigate other conversions
if (!MimetypeMap.MIMETYPE_PDF.equals(sourceMimetype) ||
!MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
!MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
{
// only support PDF -> Text
return 0.0;

Binary file not shown.

Binary file not shown.