. Added support for additional encodings to Outlook email extractor

- fixes issue with some emails not being parsed
. Added email subject to list of extracted values - "cm:subjectline"

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@3418 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Kevin Roast
2006-07-26 20:21:16 +00:00
parent af2b0b5993
commit b78cb33649
2 changed files with 106 additions and 54 deletions

View File

@@ -660,6 +660,10 @@
<type>d:text</type>
<multiple>true</multiple>
</property>
<!-- Single-valued text property titled "Subject"; the name matches the "subjectline" QName
     that the mail metadata extracter fills in (presumably the cm prefix maps to that
     namespace; confirm against the model's namespace declarations). -->
<property name="cm:subjectline">
<title>Subject</title>
<type>d:text</type>
</property>
<property name="cm:sentdate">
<title>Sent Date</title>
<type>d:datetime</type>

View File

@@ -45,20 +45,19 @@ public class MailMetadataExtracter extends AbstractMetadataExtracter
public static String[] SUPPORTED_MIMETYPES = new String[] {
"message/rfc822"};
private static final String SUBSTG_MESSAGEBODY = "__substg1.0_1000001E";
private static final String SUBSTG_RECIPIENTEMAIL = "__substg1.0_39FE001E";
private static final String SUBSTG_RECEIVEDEMAIL = "__substg1.0_0076001E";
private static final String SUBSTG_SENDEREMAIL = "__substg1.0_0C1F001E";
private static final String SUBSTG_DATE = "__substg1.0_00470102";
private static final String STREAM_PREFIX = "__substg1.0_";
private static final int STREAM_PREFIX_LENGTH = STREAM_PREFIX.length();
private static final QName ASPECT_MAILED = QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, "emailed");
private static final QName PROP_SENTDATE = QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, "sentdate");
private static final QName PROP_ORIGINATOR = QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, "originator");
private static final QName PROP_ADDRESSEE = QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, "addressee");
private static final QName PROP_ADDRESSEES = QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, "addressees");
private static final QName PROP_SUBJECT = QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, "subjectline");
// the CC: email addresses
private ThreadLocal<List<String>> receipientEmails = new ThreadLocal<List<String>>();
//private StringBuilder debug = null;
public MailMetadataExtracter()
{
@@ -73,27 +72,112 @@ public class MailMetadataExtracter extends AbstractMetadataExtracter
{
try
{
String name = event.getName();
//String name = event.getName();
//String path = event.getPath().toString();
//debug.append(path).append(" - ").append(name).append(" (").append(event.getStream().available()).append(")\r\n");
if (name.equals(SUBSTG_RECIPIENTEMAIL)) // a recipient email address
{
String emailAddress = readPlainTextStream(event.getStream());
receipientEmails.get().add(convertExchangeAddress(emailAddress));
StreamHandler handler = new StreamHandler(event.getName(), event.getStream());
handler.process(destination);
}
else if (name.equals(SUBSTG_RECEIVEDEMAIL)) // receiver email address
catch (Exception ex)
{
String emailAddress = readPlainTextStream(event.getStream());
destination.put(PROP_ADDRESSEE, convertExchangeAddress(emailAddress));
throw new ContentIOException("Property set stream: " + event.getPath() + event.getName(), ex);
}
else if (name.equals(SUBSTG_SENDEREMAIL)) // sender email - NOTE either email OR full Exchange data e.g. : /O=HOSTEDSERVICE2/OU=FIRST ADMINISTRATIVE GROUP/CN=RECIPIENTS/CN=MIKE.FARMAN@BEN
}
};
InputStream is = null;
try
{
String emailAddress = readPlainTextStream(event.getStream());
destination.put(PROP_ORIGINATOR, convertExchangeAddress(emailAddress));
//debug = new StringBuilder(1024);
this.receipientEmails.set(new ArrayList<String>());
is = reader.getContentInputStream();
POIFSReader poiFSReader = new POIFSReader();
poiFSReader.registerListener(readerListener);
try
{
poiFSReader.read(is);
}
else if (name.equals(SUBSTG_DATE))
catch (IOException err)
{
// probably not an Outlook format MSG - ignore for now
logger.warn("Unable to extract meta-data from message: " + err.getMessage());
}
// store multi-value extracted property
if (receipientEmails.get().size() != 0)
{
destination.put(PROP_ADDRESSEES, (Serializable)receipientEmails.get());
}
//logger.warn(debug);
}
finally
{
if (is != null)
{
try { is.close(); } catch (IOException e) {}
}
}
}
/**
 * Reduces a full Exchange format address to the text after its last "/CN=" marker.
 *
 * @param email the address text read from the message
 * @return the text following the final "/CN=", or {@code email} unchanged when
 *         it contains no such marker
 */
private static String convertExchangeAddress(String email)
{
    final String marker = "/CN=";
    final int markerPos = email.lastIndexOf(marker);
    // a marker means this is a full Exchange format header; keep only the trailing common name
    return (markerPos >= 0) ? email.substring(markerPos + marker.length()) : email;
}
// Encoding codes: StreamHandler takes the four characters that follow the type code in a
// stream name and compares them against these values when extracting text.
// TEXT and BINARY streams are turned into a String from their bytes as-is.
private static final String ENCODING_TEXT = "001E";
private static final String ENCODING_BINARY = "0102";
// NOTE(review): presumably two bytes per character; confirm against the MSG format spec.
private static final String ENCODING_UNICODE = "001F";
// Type codes: StreamHandler takes the four characters immediately after STREAM_PREFIX in a
// stream name and compares them against these values to decide which property to populate.
private static final String SUBSTG_MESSAGEBODY = "1000";
// each matching stream contributes one entry to the multi-valued addressees list
private static final String SUBSTG_RECIPIENTEMAIL = "39FE";
// stored as the single addressee property
private static final String SUBSTG_RECEIVEDEMAIL = "0076";
// stored as the originator property
private static final String SUBSTG_SENDEREMAIL = "0C1F";
// parsed as text to obtain the sent date
private static final String SUBSTG_DATE = "0047";
// stored as the subject line property
private static final String SUBSTG_SUBJECT = "0037";
private class StreamHandler
{
StreamHandler(String name, DocumentInputStream stream)
{
this.type = name.substring(STREAM_PREFIX_LENGTH, STREAM_PREFIX_LENGTH + 4);
this.encoding = name.substring(STREAM_PREFIX_LENGTH + 4, STREAM_PREFIX_LENGTH + 8);
this.stream = stream;
}
void process(final Map<QName, Serializable> destination)
throws IOException
{
if (type.equals(SUBSTG_SENDEREMAIL))
{
destination.put(PROP_ORIGINATOR, convertExchangeAddress(extractText()));
}
else if (type.equals(SUBSTG_RECIPIENTEMAIL))
{
receipientEmails.get().add(convertExchangeAddress(extractText()));
}
else if (type.equals(SUBSTG_RECEIVEDEMAIL))
{
destination.put(PROP_ADDRESSEE, convertExchangeAddress(extractText()));
}
else if (type.equals(SUBSTG_SUBJECT))
{
destination.put(PROP_SUBJECT, extractText());
}
else if (type.equals(SUBSTG_DATE))
{
// the date is not really plain text - but it's easier to parse as such
String date = readPlainTextStream(event.getStream());
String date = extractText();
int valueIndex = date.indexOf("l=");
if (valueIndex != -1)
{
@@ -116,65 +200,29 @@ public class MailMetadataExtracter extends AbstractMetadataExtracter
}
}
}
catch (Exception ex)
{
throw new ContentIOException("Property set stream: " + event.getPath() + event.getName(), ex);
}
}
};
InputStream is = null;
try
{
this.receipientEmails.set(new ArrayList<String>());
is = reader.getContentInputStream();
POIFSReader poiFSReader = new POIFSReader();
poiFSReader.registerListener(readerListener);
try
{
poiFSReader.read(is);
}
catch (IOException err)
{
// probably not an Outlook format MSG - ignore for now
logger.warn("Unable to extract meta-data from message: " + err.getMessage());
}
// store multi-value extracted property
if (receipientEmails.get().size() != 0)
{
destination.put(PROP_ADDRESSEES, (Serializable)receipientEmails.get());
}
}
finally
{
if (is != null)
{
try { is.close(); } catch (IOException e) {}
}
}
}
private static String readPlainTextStream(DocumentInputStream stream)
private String extractText()
throws IOException
{
byte[] data = new byte[stream.available()];
int read = stream.read(data);
stream.read(data);
if (this.encoding.equals(ENCODING_TEXT) || this.encoding.equals(ENCODING_BINARY))
{
return new String(data);
}
private static String convertExchangeAddress(String email)
{
if (email.lastIndexOf("/CN=") == -1)
{
return email;
}
else
{
// found a full Exchange format To header
return email.substring(email.lastIndexOf("/CN=") + 4);
byte[] b = new byte[data.length >> 1];
for (int i=0; i<b.length; i++)
{
b[i] = data[i << 1];
}
return new String(b);
}
}
private String type;
private String encoding;
private DocumentInputStream stream;
}
}