mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-24 17:32:48 +00:00
. Outlook email format meta-data extractor
- expects .msg files in native Outlook format
- uses the POI library for parsing the horrid OLE2 compound document format
- extracts addressee(s), sent date and originator email address
- for the future: could be modified and used as a transformer to allow full-text indexing of Outlook format emails
. Add new aspect "emailed" to the content model to support the properties for the above extractor
git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@3387 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen M<>ller
|
||||
* Copyright (C) 2005 Jesper Steen M<>ller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
@@ -31,11 +31,11 @@ import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Jesper Steen M<EFBFBD>ller
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
abstract public class AbstractMetadataExtracter implements MetadataExtracter
|
||||
{
|
||||
private static Log logger = LogFactory.getLog(AbstractMetadataExtracter.class);
|
||||
protected static Log logger = LogFactory.getLog(AbstractMetadataExtracter.class);
|
||||
|
||||
private MimetypeService mimetypeService;
|
||||
private MetadataExtracterRegistry registry;
|
||||
|
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen M<>ller
|
||||
* Copyright (C) 2005 Jesper Steen M<>ller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
@@ -38,7 +38,7 @@ import org.springframework.context.ApplicationContext;
|
||||
* @see org.alfresco.repo.content.metadata.MetadataExtracter
|
||||
* @see org.alfresco.repo.content.metadata.AbstractMetadataExtracter
|
||||
*
|
||||
* @author Jesper Steen M<EFBFBD>ller
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
public abstract class AbstractMetadataExtracterTest extends TestCase
|
||||
{
|
||||
|
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen M<>ller
|
||||
* Copyright (C) 2005 Jesper Steen M<>ller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
@@ -38,7 +38,7 @@ import org.alfresco.service.namespace.QName;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Jesper Steen M<EFBFBD>ller
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
public class HtmlMetadataExtracter extends AbstractMetadataExtracter
|
||||
{
|
||||
|
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen M<>ller
|
||||
* Copyright (C) 2005 Jesper Steen M<>ller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
@@ -19,7 +19,7 @@ package org.alfresco.repo.content.metadata;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
|
||||
/**
|
||||
* @author Jesper Steen M<EFBFBD>ller
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
public class HtmlMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
|
@@ -0,0 +1,180 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen Møller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
* copy of the License at
|
||||
*
|
||||
* http://www.alfresco.org/legal/license.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific
|
||||
* language governing permissions and limitations under the
|
||||
* License.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.NamespaceService;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
|
||||
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
|
||||
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
|
||||
import org.apache.poi.poifs.filesystem.DocumentInputStream;
|
||||
|
||||
/**
|
||||
* Outlook format email meta-data extractor
|
||||
*
|
||||
* @author Kevin Roast
|
||||
*/
|
||||
public class MailMetadataExtracter extends AbstractMetadataExtracter
|
||||
{
|
||||
public static String[] SUPPORTED_MIMETYPES = new String[] {
|
||||
"message/rfc822"};
|
||||
|
||||
private static final String SUBSTG_MESSAGEBODY = "__substg1.0_1000001E";
|
||||
private static final String SUBSTG_RECIPIENTEMAIL = "__substg1.0_39FE001E";
|
||||
private static final String SUBSTG_RECEIVEDEMAIL = "__substg1.0_0076001E";
|
||||
private static final String SUBSTG_SENDEREMAIL = "__substg1.0_0C1F001E";
|
||||
private static final String SUBSTG_DATE = "__substg1.0_00470102";
|
||||
|
||||
private static final QName ASPECT_MAILED = QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, "emailed");
|
||||
private static final QName PROP_SENTDATE = QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, "sentdate");
|
||||
private static final QName PROP_ORIGINATOR = QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, "originator");
|
||||
private static final QName PROP_ADDRESSEE = QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, "addressee");
|
||||
private static final QName PROP_ADDRESSEES = QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, "addressees");
|
||||
|
||||
// the CC: email addresses
|
||||
private ThreadLocal<List<String>> receipientEmails = new ThreadLocal<List<String>>();
|
||||
|
||||
public MailMetadataExtracter()
|
||||
{
|
||||
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)), 1.0, 1000);
|
||||
}
|
||||
|
||||
public void extractInternal(ContentReader reader, final Map<QName, Serializable> destination) throws Throwable
|
||||
{
|
||||
POIFSReaderListener readerListener = new POIFSReaderListener()
|
||||
{
|
||||
public void processPOIFSReaderEvent(final POIFSReaderEvent event)
|
||||
{
|
||||
try
|
||||
{
|
||||
String name = event.getName();
|
||||
|
||||
if (name.equals(SUBSTG_RECIPIENTEMAIL)) // a recipient email address
|
||||
{
|
||||
String emailAddress = readPlainTextStream(event.getStream());
|
||||
receipientEmails.get().add(convertExchangeAddress(emailAddress));
|
||||
}
|
||||
else if (name.equals(SUBSTG_RECEIVEDEMAIL)) // receiver email address
|
||||
{
|
||||
String emailAddress = readPlainTextStream(event.getStream());
|
||||
destination.put(PROP_ADDRESSEE, convertExchangeAddress(emailAddress));
|
||||
}
|
||||
else if (name.equals(SUBSTG_SENDEREMAIL)) // sender email - NOTE either email OR full Exchange data e.g. : /O=HOSTEDSERVICE2/OU=FIRST ADMINISTRATIVE GROUP/CN=RECIPIENTS/CN=MIKE.FARMAN@BEN
|
||||
{
|
||||
String emailAddress = readPlainTextStream(event.getStream());
|
||||
destination.put(PROP_ORIGINATOR, convertExchangeAddress(emailAddress));
|
||||
}
|
||||
else if (name.equals(SUBSTG_DATE))
|
||||
{
|
||||
// the date is not really plain text - but it's easier to parse as such
|
||||
String date = readPlainTextStream(event.getStream());
|
||||
int valueIndex = date.indexOf("l=");
|
||||
if (valueIndex != -1)
|
||||
{
|
||||
int dateIndex = date.indexOf('-', valueIndex);
|
||||
if (dateIndex != -1)
|
||||
{
|
||||
dateIndex++;
|
||||
String strYear = date.substring(dateIndex, dateIndex + 2);
|
||||
int year = Integer.parseInt(strYear) + (2000 - 1900);
|
||||
String strMonth = date.substring(dateIndex + 2, dateIndex + 4);
|
||||
int month = Integer.parseInt(strMonth) - 1;
|
||||
String strDay = date.substring(dateIndex + 4, dateIndex + 6);
|
||||
int day = Integer.parseInt(strDay);
|
||||
String strHour = date.substring(dateIndex + 6, dateIndex + 8);
|
||||
int hour = Integer.parseInt(strHour);
|
||||
String strMinute = date.substring(dateIndex + 10, dateIndex + 12);
|
||||
int minute = Integer.parseInt(strMinute);
|
||||
destination.put(PROP_SENTDATE, new Date(year, month, day, hour, minute));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
throw new ContentIOException("Property set stream: " + event.getPath() + event.getName(), ex);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
InputStream is = null;
|
||||
try
|
||||
{
|
||||
this.receipientEmails.set(new ArrayList<String>());
|
||||
|
||||
is = reader.getContentInputStream();
|
||||
POIFSReader poiFSReader = new POIFSReader();
|
||||
poiFSReader.registerListener(readerListener);
|
||||
|
||||
try
|
||||
{
|
||||
poiFSReader.read(is);
|
||||
}
|
||||
catch (IOException err)
|
||||
{
|
||||
// probably not an Outlook format MSG - ignore for now
|
||||
logger.warn("Unable to extract meta-data from message: " + err.getMessage());
|
||||
}
|
||||
|
||||
// store multi-value extracted property
|
||||
if (receipientEmails.get().size() != 0)
|
||||
{
|
||||
destination.put(PROP_ADDRESSEES, (Serializable)receipientEmails.get());
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (is != null)
|
||||
{
|
||||
try { is.close(); } catch (IOException e) {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static String readPlainTextStream(DocumentInputStream stream)
|
||||
throws IOException
|
||||
{
|
||||
byte[] data = new byte[stream.available()];
|
||||
int read = stream.read(data);
|
||||
return new String(data);
|
||||
}
|
||||
|
||||
private static String convertExchangeAddress(String email)
|
||||
{
|
||||
if (email.lastIndexOf("/CN=") == -1)
|
||||
{
|
||||
return email;
|
||||
}
|
||||
else
|
||||
{
|
||||
// found a full Exchange format To header
|
||||
return email.substring(email.lastIndexOf("/CN=") + 4);
|
||||
}
|
||||
}
|
||||
}
|
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen M<>ller
|
||||
* Copyright (C) 2005 Jesper Steen M<>ller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
@@ -25,7 +25,7 @@ import org.alfresco.service.namespace.QName;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Jesper Steen M<EFBFBD>ller
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
public interface MetadataExtracter
|
||||
{
|
||||
|
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen M<>ller
|
||||
* Copyright (C) 2005 Jesper Steen M<>ller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
@@ -36,7 +36,7 @@ import org.apache.commons.logging.LogFactory;
|
||||
* The extracters themselves know how well they are able to extract metadata.
|
||||
*
|
||||
* @see org.alfresco.repo.content.metadata.MetadataExtracter
|
||||
* @author Jesper Steen M<EFBFBD>ller
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
public class MetadataExtracterRegistry
|
||||
{
|
||||
|
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen M<>ller
|
||||
* Copyright (C) 2005 Jesper Steen M<>ller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
@@ -28,7 +28,6 @@ import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.apache.poi.hpsf.DocumentSummaryInformation;
|
||||
import org.apache.poi.hpsf.PropertySet;
|
||||
import org.apache.poi.hpsf.PropertySetFactory;
|
||||
import org.apache.poi.hpsf.SummaryInformation;
|
||||
@@ -37,15 +36,16 @@ import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
|
||||
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
|
||||
|
||||
/**
|
||||
* Office file format Metadata Extracter
|
||||
*
|
||||
* @author Jesper Steen M<EFBFBD>ller
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
public class OfficeMetadataExtracter extends AbstractMetadataExtracter
|
||||
{
|
||||
public static String[] SUPPORTED_MIMETYPES = new String[] {
|
||||
MimetypeMap.MIMETYPE_WORD,
|
||||
MimetypeMap.MIMETYPE_EXCEL,
|
||||
MimetypeMap.MIMETYPE_PPT };
|
||||
MimetypeMap.MIMETYPE_PPT};
|
||||
|
||||
public OfficeMetadataExtracter()
|
||||
{
|
||||
@@ -64,6 +64,7 @@ public class OfficeMetadataExtracter extends AbstractMetadataExtracter
|
||||
if (ps instanceof SummaryInformation)
|
||||
{
|
||||
SummaryInformation si = (SummaryInformation) ps;
|
||||
|
||||
// Titled aspect
|
||||
trimPut(ContentModel.PROP_TITLE, si.getTitle(), destination);
|
||||
trimPut(ContentModel.PROP_DESCRIPTION, si.getSubject(), destination);
|
||||
@@ -73,16 +74,6 @@ public class OfficeMetadataExtracter extends AbstractMetadataExtracter
|
||||
trimPut(ContentModel.PROP_MODIFIED, si.getLastSaveDateTime(), destination);
|
||||
trimPut(ContentModel.PROP_AUTHOR, si.getAuthor(), destination);
|
||||
}
|
||||
else if (ps instanceof DocumentSummaryInformation)
|
||||
{
|
||||
// DocumentSummaryInformation dsi = (DocumentSummaryInformation) ps;
|
||||
|
||||
// These are not really interesting to any aspect:
|
||||
// trimPut(ContentModel.PROP_xxx, dsi.getCompany(),
|
||||
// destination);
|
||||
// trimPut(ContentModel.PROP_yyy, dsi.getManager(),
|
||||
// destination);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
@@ -90,6 +81,7 @@ public class OfficeMetadataExtracter extends AbstractMetadataExtracter
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
InputStream is = null;
|
||||
try
|
||||
{
|
||||
|
@@ -4,7 +4,7 @@ package org.alfresco.repo.content.metadata;
|
||||
/**
|
||||
* @see org.alfresco.repo.content.transform.OfficeMetadataExtracter
|
||||
*
|
||||
* @author Jesper Steen M<EFBFBD>ller
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
public class OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
|
@@ -42,7 +42,7 @@ import com.sun.star.ucb.XFileIdentifierConverter;
|
||||
import com.sun.star.uno.UnoRuntime;
|
||||
|
||||
/**
|
||||
* @author Jesper Steen M<EFBFBD>ller
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
public class OpenOfficeMetadataExtracter extends AbstractMetadataExtracter
|
||||
{
|
||||
|
@@ -21,7 +21,7 @@ import net.sf.jooreports.openoffice.connection.SocketOpenOfficeConnection;
|
||||
|
||||
|
||||
/**
|
||||
* @author Jesper Steen M<EFBFBD>ller
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
public class OpenOfficeMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
|
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen M<>ller
|
||||
* Copyright (C) 2005 Jesper Steen M<>ller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
@@ -31,7 +31,7 @@ import org.pdfbox.pdmodel.PDDocumentInformation;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Jesper Steen M<EFBFBD>ller
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
public class PdfBoxMetadataExtracter extends AbstractMetadataExtracter
|
||||
{
|
||||
|
@@ -5,7 +5,7 @@ import org.alfresco.repo.content.MimetypeMap;
|
||||
/**
|
||||
* @see org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter
|
||||
*
|
||||
* @author Jesper Steen M<EFBFBD>ller
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
public class PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
|
Reference in New Issue
Block a user