mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-06-23 18:05:32 +00:00
Outlook email messages (in OLE2 .msg format) now converted to text for full-text indexing.
JUnit test for new transformer class. Added new test to ContentTestSuite. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@5951 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
parent
ede9345a5b
commit
04a78f17d2
@ -242,6 +242,10 @@
|
||||
</list>
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<bean id="transformer.OutlookMsg"
|
||||
class="org.alfresco.repo.content.transform.MailContentTransformer"
|
||||
parent="baseContentTransformer" />
|
||||
|
||||
<!--
|
||||
<bean id="transformer.JMagick" class="org.alfresco.repo.content.transform.magick.JMagickContentTransformer" init-method="init" />
|
||||
|
@ -41,6 +41,7 @@ import org.alfresco.repo.content.transform.ComplexContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.ContentTransformerRegistryTest;
|
||||
import org.alfresco.repo.content.transform.HtmlParserContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.OpenOfficeContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.MailContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.PdfBoxContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.PoiHssfContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerTest;
|
||||
@ -83,6 +84,7 @@ public class ContentTestSuite extends TestSuite
|
||||
suite.addTestSuite(RuntimeExecutableContentTransformerTest.class);
|
||||
suite.addTestSuite(StringExtractingContentTransformerTest.class);
|
||||
suite.addTestSuite(TextMiningContentTransformerTest.class);
|
||||
suite.addTestSuite(MailContentTransformerTest.class);
|
||||
suite.addTestSuite(ContentDataTest.class);
|
||||
suite.addTestSuite(MimetypeMapTest.class);
|
||||
suite.addTestSuite(RoutingContentServiceTest.class);
|
||||
|
@ -72,6 +72,7 @@ public class MimetypeMap implements MimetypeService
|
||||
public static final String MIMETYPE_OPENSEARCH_DESCRIPTION = "application/opensearchdescription+xml";
|
||||
public static final String MIMETYPE_ATOM = "application/atom+xml";
|
||||
public static final String MIMETYPE_RSS = "application/rss+xml";
|
||||
public static final String MIMETYPE_RFC822 = "message/rfc822";
|
||||
// Open Document
|
||||
public static final String MIMETYPE_OPENDOCUMENT_TEXT = "application/vnd.oasis.opendocument.text";
|
||||
public static final String MIMETYPE_OPENDOCUMENT_TEXT_TEMPLATE = "application/vnd.oasis.opendocument.text-template";
|
||||
|
@ -716,7 +716,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
||||
* strings are
|
||||
* <ul>
|
||||
* <li><b>Null:</b> Removed</li>
|
||||
* <li><b>Emtpty String:</b> Passed to the {@link OverwritePolicy}</li>
|
||||
* <li><b>Empty String:</b> Passed to the {@link OverwritePolicy}</li>
|
||||
* <li><b>Non Serializable:</b> Converted to String or fails if that is not possible</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
|
@ -102,11 +102,12 @@ public class MailMetadataExtracter extends AbstractMetadataExtracter
|
||||
catch (IOException err)
|
||||
{
|
||||
// probably not an Outlook format MSG - ignore for now
|
||||
logger.warn("Unable to extract meta-data from message: " + err.getMessage());
|
||||
if (logger.isWarnEnabled())
|
||||
logger.warn("Unable to extract meta-data from message: " + err.getMessage());
|
||||
}
|
||||
|
||||
// store multi-value extracted property
|
||||
if (receipientEmails.get().size() != 0)
|
||||
if (this.receipientEmails.get().size() != 0)
|
||||
{
|
||||
destination.put(ContentModel.PROP_ADDRESSEES, (Serializable)receipientEmails.get());
|
||||
}
|
||||
|
@ -56,7 +56,7 @@ import org.springframework.context.ApplicationContext;
|
||||
*/
|
||||
public abstract class AbstractContentTransformerTest extends TestCase
|
||||
{
|
||||
private static String QUICK_CONTENT = "The quick brown fox jumps over the lazy dog";
|
||||
protected static String QUICK_CONTENT = "The quick brown fox jumps over the lazy dog";
|
||||
private static String[] QUICK_WORDS = new String[] {
|
||||
"quick", "brown", "fox", "jumps", "lazy", "dog"};
|
||||
|
||||
|
@ -0,0 +1,196 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2007 Alfresco Software Limited.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
* As a special exception to the terms and conditions of version 2.0 of
|
||||
* the GPL, you may redistribute this Program in connection with Free/Libre
|
||||
* and Open Source Software ("FLOSS") applications as described in Alfresco's
|
||||
* FLOSS exception. You should have received a copy of the text describing
|
||||
* the FLOSS exception, and it is also available here:
|
||||
* http://www.alfresco.com/legal/licensing
|
||||
*/
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
|
||||
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
|
||||
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
|
||||
import org.apache.poi.poifs.filesystem.DocumentInputStream;
|
||||
|
||||
/**
|
||||
* Outlook email msg format to-text transformer.
|
||||
*
|
||||
* @author Kevin Roast
|
||||
*/
|
||||
public class MailContentTransformer extends AbstractContentTransformer
|
||||
{
|
||||
private static final Log logger = LogFactory.getLog(MailContentTransformer.class);
|
||||
|
||||
private static final String STREAM_PREFIX = "__substg1.0_";
|
||||
private static final int STREAM_PREFIX_LENGTH = STREAM_PREFIX.length();
|
||||
|
||||
/**
|
||||
* Only support MSG to text
|
||||
*/
|
||||
public double getReliability(String sourceMimetype, String targetMimetype)
|
||||
{
|
||||
if (!MimetypeMap.MIMETYPE_RFC822.equals(sourceMimetype) ||
|
||||
!MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
|
||||
{
|
||||
// only support MSG -> TEXT
|
||||
return 0.0;
|
||||
}
|
||||
else
|
||||
{
|
||||
return 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @see org.alfresco.repo.content.transform.AbstractContentTransformer#transformInternal(org.alfresco.service.cmr.repository.ContentReader, org.alfresco.service.cmr.repository.ContentWriter, java.util.Map)
|
||||
*/
|
||||
@Override
|
||||
protected void transformInternal(final ContentReader reader, final ContentWriter writer, Map<String, Object> options)
|
||||
throws Exception
|
||||
{
|
||||
POIFSReaderListener readerListener = new POIFSReaderListener()
|
||||
{
|
||||
public void processPOIFSReaderEvent(final POIFSReaderEvent event)
|
||||
{
|
||||
try
|
||||
{
|
||||
if (event.getName().startsWith(STREAM_PREFIX))
|
||||
{
|
||||
StreamHandler handler = new StreamHandler(event.getName(), event.getStream());
|
||||
String result = handler.process();
|
||||
if (result != null)
|
||||
{
|
||||
writer.putContent(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
throw new ContentIOException("Property set stream: " + event.getPath() + event.getName(), ex);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
InputStream is = null;
|
||||
try
|
||||
{
|
||||
is = reader.getContentInputStream();
|
||||
POIFSReader poiFSReader = new POIFSReader();
|
||||
poiFSReader.registerListener(readerListener);
|
||||
|
||||
try
|
||||
{
|
||||
poiFSReader.read(is);
|
||||
}
|
||||
catch (IOException err)
|
||||
{
|
||||
// probably not an Outlook format MSG - ignore for now
|
||||
if (logger.isWarnEnabled())
|
||||
logger.warn("Unable to extract meta-data from message: " + err.getMessage());
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (is != null)
|
||||
{
|
||||
try { is.close(); } catch (IOException e) {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static final String ENCODING_TEXT = "001E";
|
||||
private static final String ENCODING_BINARY = "0102";
|
||||
private static final String ENCODING_UNICODE = "001F";
|
||||
|
||||
private static final String SUBSTG_MESSAGEBODY = "1000";
|
||||
|
||||
/**
|
||||
* Class to handle stream types. Can process and extract specific streams.
|
||||
*/
|
||||
private class StreamHandler
|
||||
{
|
||||
StreamHandler(String name, DocumentInputStream stream)
|
||||
{
|
||||
this.type = name.substring(STREAM_PREFIX_LENGTH, STREAM_PREFIX_LENGTH + 4);
|
||||
this.encoding = name.substring(STREAM_PREFIX_LENGTH + 4, STREAM_PREFIX_LENGTH + 8);
|
||||
this.stream = stream;
|
||||
}
|
||||
|
||||
String process()
|
||||
throws IOException
|
||||
{
|
||||
String result = null;
|
||||
|
||||
if (SUBSTG_MESSAGEBODY.equals(this.type))
|
||||
{
|
||||
result = extractText(this.encoding);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the text from the stream based on the encoding
|
||||
*
|
||||
* @return String
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
private String extractText(String encoding)
|
||||
throws IOException
|
||||
{
|
||||
byte[] data = new byte[this.stream.available()];
|
||||
this.stream.read(data);
|
||||
|
||||
if (encoding.equals(ENCODING_TEXT) || encoding.equals(ENCODING_BINARY))
|
||||
{
|
||||
return new String(data);
|
||||
}
|
||||
else if (encoding.equals(ENCODING_UNICODE))
|
||||
{
|
||||
// convert double-byte encoding to single byte for String conversion
|
||||
byte[] b = new byte[data.length >> 1];
|
||||
for (int i=0; i<b.length; i++)
|
||||
{
|
||||
b[i] = data[i << 1];
|
||||
}
|
||||
return new String(b);
|
||||
}
|
||||
else
|
||||
{
|
||||
return new String(data);
|
||||
}
|
||||
}
|
||||
|
||||
private String type;
|
||||
private String encoding;
|
||||
private DocumentInputStream stream;
|
||||
}
|
||||
}
|
@ -0,0 +1,107 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2007 Alfresco Software Limited.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
* As a special exception to the terms and conditions of version 2.0 of
|
||||
* the GPL, you may redistribute this Program in connection with Free/Libre
|
||||
* and Open Source Software ("FLOSS") applications as described in Alfresco's
|
||||
* FLOSS exception. You should have received a copy of the text describing
|
||||
* the FLOSS exception, and it is also available here:
|
||||
* http://www.alfresco.com/legal/licensing
|
||||
*/
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.repo.content.filestore.FileContentReader;
|
||||
import org.alfresco.repo.content.filestore.FileContentWriter;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||
import org.alfresco.util.TempFileProvider;
|
||||
|
||||
/**
|
||||
* @see org.alfresco.repo.content.transform.MailContentTransformer
|
||||
*
|
||||
* @author Kevin Roast
|
||||
*/
|
||||
public class MailContentTransformerTest extends AbstractContentTransformerTest
|
||||
{
|
||||
private ContentTransformer transformer;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception
|
||||
{
|
||||
super.setUp();
|
||||
|
||||
transformer = new MailContentTransformer();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the same transformer regardless - it is allowed
|
||||
*/
|
||||
protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
|
||||
{
|
||||
return transformer;
|
||||
}
|
||||
|
||||
public void testReliability() throws Exception
|
||||
{
|
||||
double reliability = 0.0;
|
||||
reliability = transformer.getReliability(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_RFC822);
|
||||
assertEquals("Mimetype should not be supported", 0.0, reliability);
|
||||
reliability = transformer.getReliability(MimetypeMap.MIMETYPE_RFC822, MimetypeMap.MIMETYPE_TEXT_PLAIN);
|
||||
assertEquals("Mimetype should be supported", 1.0, reliability);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test transforming a valid msg file to text
|
||||
*/
|
||||
public void testMsgToText() throws Exception
|
||||
{
|
||||
File msgSourceFile = loadQuickTestFile("msg");
|
||||
File txtTargetFile = TempFileProvider.createTempFile(getName() + "-target-1", ".txt");
|
||||
ContentReader reader = new FileContentReader(msgSourceFile);
|
||||
reader.setMimetype(MimetypeMap.MIMETYPE_RFC822);
|
||||
ContentWriter writer = new FileContentWriter(txtTargetFile);
|
||||
writer.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
|
||||
|
||||
transformer.transform(reader, writer);
|
||||
|
||||
ContentReader reader2 = new FileContentReader(txtTargetFile);
|
||||
reader2.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
|
||||
assertTrue(reader2.getContentString().contains(QUICK_CONTENT));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test transforming a valid unicode msg file to text
|
||||
*/
|
||||
public void testUnicodeMsgToText() throws Exception
|
||||
{
|
||||
File msgSourceFile = loadQuickTestFile("unicode.msg");
|
||||
File txtTargetFile = TempFileProvider.createTempFile(getName() + "-target-2", ".txt");
|
||||
ContentReader reader = new FileContentReader(msgSourceFile);
|
||||
reader.setMimetype(MimetypeMap.MIMETYPE_RFC822);
|
||||
ContentWriter writer = new FileContentWriter(txtTargetFile);
|
||||
writer.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
|
||||
|
||||
transformer.transform(reader, writer);
|
||||
|
||||
ContentReader reader2 = new FileContentReader(txtTargetFile);
|
||||
reader2.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
|
||||
assertTrue(reader2.getContentString().contains(QUICK_CONTENT));
|
||||
}
|
||||
}
|
@ -49,7 +49,7 @@ public class PdfBoxContentTransformer extends AbstractContentTransformer
|
||||
// TODO: Expand PDFBox usage to convert images to PDF and investigate other conversions
|
||||
|
||||
if (!MimetypeMap.MIMETYPE_PDF.equals(sourceMimetype) ||
|
||||
!MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
|
||||
!MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
|
||||
{
|
||||
// only support PDF -> Text
|
||||
return 0.0;
|
||||
|
BIN
source/test-resources/quick/quick.msg
Normal file
BIN
source/test-resources/quick/quick.msg
Normal file
Binary file not shown.
BIN
source/test-resources/quick/quick.unicode.msg
Normal file
BIN
source/test-resources/quick/quick.unicode.msg
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user