mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Update metadata extractors - Outlook, MP3, Mail and PDF improvements, and increase test coverage
git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@18454 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -27,20 +27,13 @@ package org.alfresco.repo.content.metadata;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Calendar;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
|
||||
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
|
||||
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
|
||||
import org.apache.poi.poifs.filesystem.DocumentInputStream;
|
||||
import org.apache.poi.hsmf.MAPIMessage;
|
||||
|
||||
/**
|
||||
* Outlook format email meta-data extractor extracting the following values:
|
||||
@@ -52,6 +45,9 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream;
|
||||
* <b>subjectLine:</b> -- cm:subjectline, cm:description
|
||||
* </pre>
|
||||
*
|
||||
* TIKA note - to/cc/bcc go into the html part, not the metadata.
|
||||
* Also, email addresses not included as yet.
|
||||
*
|
||||
* @since 2.1
|
||||
* @author Kevin Roast
|
||||
*/
|
||||
@@ -65,12 +61,6 @@ public class MailMetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
|
||||
public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_OUTLOOK_MSG};
|
||||
|
||||
private static final String STREAM_PREFIX = "__substg1.0_";
|
||||
private static final int STREAM_PREFIX_LENGTH = STREAM_PREFIX.length();
|
||||
|
||||
// the CC: email addresses
|
||||
private ThreadLocal<List<String>> receipientEmails = new ThreadLocal<List<String>>();
|
||||
|
||||
public MailMetadataExtracter()
|
||||
{
|
||||
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
|
||||
@@ -81,37 +71,25 @@ public class MailMetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
{
|
||||
final Map<String, Serializable> rawProperties = newRawMap();
|
||||
|
||||
POIFSReaderListener readerListener = new POIFSReaderListener()
|
||||
{
|
||||
public void processPOIFSReaderEvent(final POIFSReaderEvent event)
|
||||
{
|
||||
try
|
||||
{
|
||||
if (event.getName().startsWith(STREAM_PREFIX))
|
||||
{
|
||||
StreamHandler handler = new StreamHandler(event.getName(), event.getStream());
|
||||
handler.process(rawProperties);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
throw new ContentIOException("Property set stream: " + event.getPath() + event.getName(), ex);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
InputStream is = null;
|
||||
try
|
||||
{
|
||||
this.receipientEmails.set(new ArrayList<String>());
|
||||
|
||||
is = reader.getContentInputStream();
|
||||
POIFSReader poiFSReader = new POIFSReader();
|
||||
poiFSReader.registerListener(readerListener);
|
||||
MAPIMessage msg;
|
||||
|
||||
try
|
||||
{
|
||||
poiFSReader.read(is);
|
||||
msg = new MAPIMessage(is);
|
||||
msg.setReturnNullOnMissingChunk(true);
|
||||
|
||||
putRawValue(KEY_ORIGINATOR, msg.getDisplayFrom(), rawProperties);
|
||||
putRawValue(KEY_SUBJECT, msg.getSubject(), rawProperties);
|
||||
putRawValue(KEY_SENT_DATE, msg.getMessageDate().getTime(), rawProperties);
|
||||
|
||||
// Store the TO, but not cc/bcc in the addressee field
|
||||
putRawValue(KEY_ADDRESSEE, msg.getDisplayTo(), rawProperties);
|
||||
// But store all email addresses (to/cc/bcc) in the addresses field
|
||||
putRawValue(KEY_ADDRESSEES, msg.getRecipientEmailAddressList(), rawProperties);
|
||||
}
|
||||
catch (IOException err)
|
||||
{
|
||||
@@ -119,12 +97,6 @@ public class MailMetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
if (logger.isWarnEnabled())
|
||||
logger.warn("Unable to extract meta-data from message: " + err.getMessage());
|
||||
}
|
||||
|
||||
// store multi-value extracted property
|
||||
if (this.receipientEmails.get().size() != 0)
|
||||
{
|
||||
putRawValue(KEY_ADDRESSEES, (Serializable)receipientEmails.get(), rawProperties);
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
@@ -136,162 +108,4 @@ public class MailMetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
// Done
|
||||
return rawProperties;
|
||||
}
|
||||
|
||||
private static String convertExchangeAddress(String email)
|
||||
{
|
||||
if (email.lastIndexOf("/CN=") == -1)
|
||||
{
|
||||
return email;
|
||||
}
|
||||
else
|
||||
{
|
||||
// found a full Exchange format To header
|
||||
return email.substring(email.lastIndexOf("/CN=") + 4);
|
||||
}
|
||||
}
|
||||
|
||||
private static final String ENCODING_TEXT = "001E";
|
||||
private static final String ENCODING_BINARY = "0102";
|
||||
private static final String ENCODING_UNICODE = "001F";
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
private static final String SUBSTG_MESSAGEBODY = "1000";
|
||||
private static final String SUBSTG_RECIPIENTEMAIL = "39FE"; // 7bit email address
|
||||
private static final String SUBSTG_RECIPIENTSEARCH = "300B"; // address 'search' variant
|
||||
private static final String SUBSTG_RECEIVEDEMAIL = "0076";
|
||||
private static final String SUBSTG_SENDEREMAIL = "0C1F";
|
||||
private static final String SUBSTG_DATE = "0047";
|
||||
private static final String SUBSTG_SUBJECT = "0037";
|
||||
|
||||
/**
|
||||
* Class to handle stream types. Can process and extract specific streams.
|
||||
*/
|
||||
private class StreamHandler
|
||||
{
|
||||
StreamHandler(String name, DocumentInputStream stream)
|
||||
{
|
||||
this.type = name.substring(STREAM_PREFIX_LENGTH, STREAM_PREFIX_LENGTH + 4);
|
||||
this.encoding = name.substring(STREAM_PREFIX_LENGTH + 4, STREAM_PREFIX_LENGTH + 8);
|
||||
this.stream = stream;
|
||||
}
|
||||
|
||||
void process(final Map<String, Serializable> destination)
|
||||
throws IOException
|
||||
{
|
||||
if (type.equals(SUBSTG_SENDEREMAIL))
|
||||
{
|
||||
putRawValue(KEY_ORIGINATOR, convertExchangeAddress(extractText()), destination);
|
||||
}
|
||||
else if (type.equals(SUBSTG_RECIPIENTEMAIL))
|
||||
{
|
||||
receipientEmails.get().add(convertExchangeAddress(extractText()));
|
||||
}
|
||||
else if (type.equals(SUBSTG_RECIPIENTSEARCH))
|
||||
{
|
||||
String email = extractText(ENCODING_TEXT);
|
||||
int smptIndex = email.indexOf("SMTP:");
|
||||
if (smptIndex != -1)
|
||||
{
|
||||
/* also may be used for SUBSTG_RECIPIENTTRANSPORT = "5FF7";
|
||||
with search for SMPT followed by a null char */
|
||||
|
||||
// this is a secondary mechanism for encoding a receipient email address
|
||||
// the 7 bit email address may not have been set by Outlook - so this is needed instead
|
||||
// handle null character at end of string
|
||||
int endIndex = email.length();
|
||||
if (email.codePointAt(email.length() - 1) == 0)
|
||||
{
|
||||
endIndex--;
|
||||
}
|
||||
email = email.substring(smptIndex + 5, endIndex);
|
||||
receipientEmails.get().add(email);
|
||||
}
|
||||
}
|
||||
else if (type.equals(SUBSTG_RECEIVEDEMAIL))
|
||||
{
|
||||
putRawValue(KEY_ADDRESSEE, convertExchangeAddress(extractText()), destination);
|
||||
}
|
||||
else if (type.equals(SUBSTG_SUBJECT))
|
||||
{
|
||||
putRawValue(KEY_SUBJECT, extractText(), destination);
|
||||
}
|
||||
else if (type.equals(SUBSTG_DATE))
|
||||
{
|
||||
// the date is not "really" plain text - but it's appropriate to parse as such
|
||||
String date = extractText(ENCODING_TEXT);
|
||||
int valueIndex = date.indexOf("l=");
|
||||
if (valueIndex != -1)
|
||||
{
|
||||
int dateIndex = date.indexOf('-', valueIndex);
|
||||
if (dateIndex != -1)
|
||||
{
|
||||
dateIndex++;
|
||||
final Calendar c = Calendar.getInstance();
|
||||
String strYear = date.substring(dateIndex, dateIndex + 2);
|
||||
c.set(Calendar.YEAR, Integer.parseInt(strYear) + (2000 - 1900));
|
||||
String strMonth = date.substring(dateIndex + 2, dateIndex + 4);
|
||||
c.set(Calendar.MONTH, Integer.parseInt(strMonth) - 1);
|
||||
String strDay = date.substring(dateIndex + 4, dateIndex + 6);
|
||||
c.set(Calendar.DAY_OF_MONTH, Integer.parseInt(strDay));
|
||||
String strHour = date.substring(dateIndex + 6, dateIndex + 8);
|
||||
c.set(Calendar.HOUR, Integer.parseInt(strHour));
|
||||
String strMinute = date.substring(dateIndex + 10, dateIndex + 12);
|
||||
c.set(Calendar.MINUTE, Integer.parseInt(strMinute));
|
||||
c.set(Calendar.SECOND, 0);
|
||||
putRawValue(KEY_SENT_DATE, c.getTime(), destination);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the text from the stream based on the encoding
|
||||
*
|
||||
* @return String
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
private String extractText()
|
||||
throws IOException
|
||||
{
|
||||
return extractText(this.encoding);
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the text from the stream based on the encoding
|
||||
*
|
||||
* @return String
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
private String extractText(String encoding)
|
||||
throws IOException
|
||||
{
|
||||
byte[] data = new byte[stream.available()];
|
||||
stream.read(data);
|
||||
|
||||
if (encoding.equals(ENCODING_TEXT) || encoding.equals(ENCODING_BINARY))
|
||||
{
|
||||
return new String(data);
|
||||
}
|
||||
else if (encoding.equals(ENCODING_UNICODE))
|
||||
{
|
||||
// convert double-byte encoding to single byte for String conversion
|
||||
byte[] b = new byte[data.length >> 1];
|
||||
for (int i=0; i<b.length; i++)
|
||||
{
|
||||
b[i] = data[i << 1];
|
||||
}
|
||||
return new String(b);
|
||||
}
|
||||
else
|
||||
{
|
||||
return new String(data);
|
||||
}
|
||||
}
|
||||
|
||||
private String type;
|
||||
private String encoding;
|
||||
private DocumentInputStream stream;
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user