From bd1e3edf7645b8c546bd2da695f22aae06acc57a Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Thu, 4 Feb 2010 14:42:45 +0000 Subject: [PATCH] Update metadata extractors - Outlook, MP3, Mail and PDF improvements, and increase test coverage git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@18454 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261 --- .../AbstractMappingMetadataExtracter.java | 6 + .../AbstractMetadataExtracterTest.java | 55 ++++- .../metadata/HtmlMetadataExtracter.java | 4 + .../metadata/HtmlMetadataExtracterTest.java | 7 + .../metadata/MP3MetadataExtracter.java | 41 +++- .../metadata/MP3MetadataExtracterTest.java | 120 ++++++++++ .../metadata/MailMetadataExtracter.java | 218 ++---------------- .../metadata/MailMetadataExtracterTest.java | 46 +++- .../metadata/OfficeMetadataExtracter.java | 3 + .../metadata/OfficeMetadataExtracterTest.java | 106 +++++++++ .../OpenDocumentMetadataExtracter.java | 4 + .../OpenDocumentMetadataExtracterTest.java | 28 +++ .../metadata/OpenOfficeMetadataExtracter.java | 3 + .../OpenOfficeMetadataExtracterTest.java | 23 ++ .../metadata/PdfBoxMetadataExtracter.java | 7 + .../metadata/PdfBoxMetadataExtracterTest.java | 33 +++ .../metadata/RFC822MetadataExtracter.java | 22 +- .../RFC822MetadataExtracter.properties | 8 +- .../metadata/RFC822MetadataExtracterTest.java | 198 ++++++++++++++++ 19 files changed, 707 insertions(+), 225 deletions(-) create mode 100644 source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracterTest.java create mode 100644 source/java/org/alfresco/repo/content/metadata/RFC822MetadataExtracterTest.java diff --git a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java index e5092ba3bf..da29fe8f57 100644 --- a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java +++ 
b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java @@ -787,6 +787,12 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac propertyTypeDef, (Collection) propertyValue); } + else if (propertyValue instanceof Object[]) + { + convertedPropertyValue = (Serializable) DefaultTypeConverter.INSTANCE.convert( + propertyTypeDef, + (Object[]) propertyValue); + } else { convertedPropertyValue = (Serializable) DefaultTypeConverter.INSTANCE.convert( diff --git a/source/java/org/alfresco/repo/content/metadata/AbstractMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/AbstractMetadataExtracterTest.java index 130a8a785d..264026e9b3 100644 --- a/source/java/org/alfresco/repo/content/metadata/AbstractMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/AbstractMetadataExtracterTest.java @@ -56,11 +56,16 @@ import org.springframework.context.ApplicationContext; */ public abstract class AbstractMetadataExtracterTest extends TestCase { + static { + ApplicationContextHelper.setUseLazyLoading(false); + } protected static ApplicationContext ctx = ApplicationContextHelper.getApplicationContext(); protected static final String QUICK_TITLE = "The quick brown fox jumps over the lazy dog"; protected static final String QUICK_DESCRIPTION = "Gym class featuring a brown fox and lazy dog"; protected static final String QUICK_CREATOR = "Nevin Nollop"; + protected static final String QUICK_CREATOR_EMAIL = "nevin.nollop@alfresco.com"; + protected static final String QUICK_PREVIOUS_AUTHOR = "Derek Hulley"; protected MimetypeMap mimetypeMap; protected DictionaryService dictionaryService; @@ -97,12 +102,16 @@ public abstract class AbstractMetadataExtracterTest extends TestCase try { Map properties = extractFromMimetype(mimetype); - // check + // check common metadata testCommonMetadata(mimetype, properties); + // check file-type specific metadata + testFileSpecificMetadata(mimetype, 
properties); } catch (FileNotFoundException e) { // The test file is not there. We won't fail it. + System.err.println("No test file found for mime type " + mimetype + + ", skipping extraction test - " + e.getMessage()); } } @@ -127,17 +136,46 @@ public abstract class AbstractMetadataExtracterTest extends TestCase return properties; } + /** + * Tests that we can get the common metadata correctly + * from the file. + * You only need to override this if your test data file + * doesn't have the usual Nevin Nollop/quick brown fox + * data in it. + */ protected void testCommonMetadata(String mimetype, Map properties) { - assertEquals( + // One of Creator or Author + if(!skipAuthorCheck()) { + if(properties.containsKey(ContentModel.PROP_CREATOR)) { + assertEquals( + "Property " + ContentModel.PROP_CREATOR + " not found for mimetype " + mimetype, + QUICK_CREATOR, + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATOR))); + } else if(properties.containsKey(ContentModel.PROP_AUTHOR)) { + assertEquals( + "Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype, + QUICK_CREATOR, + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR))); + } else { + fail("Expected on Property out of " + ContentModel.PROP_CREATOR + " and " + + ContentModel.PROP_AUTHOR + " but found neither of them."); + } + } + + // Title and description + assertEquals( "Property " + ContentModel.PROP_TITLE + " not found for mimetype " + mimetype, QUICK_TITLE, DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_TITLE))); - assertEquals( + assertEquals( "Property " + ContentModel.PROP_DESCRIPTION + " not found for mimetype " + mimetype, QUICK_DESCRIPTION, DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION))); } + protected abstract void testFileSpecificMetadata(String mimetype, Map properties); + protected boolean skipAuthorCheck() { 
return false; } + public void testZeroLengthFile() throws Exception { @@ -163,4 +201,15 @@ public abstract class AbstractMetadataExtracterTest extends TestCase assertEquals("There should not be any new properties", 0, properties.size()); } } + + + protected void assertContains(String message, String needle, String haystack) { + if(haystack.indexOf(needle) > -1) { + return; + } + fail(message); + } + protected void assertContains(String needle, String haystack) { + assertContains("'" + needle + "' wasn't found in '" + haystack + "'", needle, haystack); + } } diff --git a/source/java/org/alfresco/repo/content/metadata/HtmlMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/HtmlMetadataExtracter.java index 769c90dd49..e6737c398d 100644 --- a/source/java/org/alfresco/repo/content/metadata/HtmlMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/HtmlMetadataExtracter.java @@ -49,6 +49,10 @@ import org.alfresco.service.cmr.repository.ContentReader; * description: -- cm:description * * + * TIKA note - all metadata will be present, but will need to + * search for the variant names ourselves as tika puts them + * in as-is. 
+ * * @author Jesper Steen Møller * @author Derek Hulley */ diff --git a/source/java/org/alfresco/repo/content/metadata/HtmlMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/HtmlMetadataExtracterTest.java index 4f043c8607..19ae763d37 100644 --- a/source/java/org/alfresco/repo/content/metadata/HtmlMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/HtmlMetadataExtracterTest.java @@ -24,7 +24,11 @@ */ package org.alfresco.repo.content.metadata; +import java.io.Serializable; +import java.util.Map; + import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.service.namespace.QName; /** * @author Jesper Steen Møller @@ -63,4 +67,7 @@ public class HtmlMetadataExtracterTest extends AbstractMetadataExtracterTest { testExtractFromMimetype(MimetypeMap.MIMETYPE_HTML); } + + /** Extractor only does the usual basic three properties */ + public void testFileSpecificMetadata(String mimetype, Map properties) {} } diff --git a/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java index 1100d581c9..7d72df8cde 100644 --- a/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java @@ -57,6 +57,10 @@ import org.farng.mp3.lyrics3.Lyrics3v2Field; * lyrics: -- {music}lyrics * * + * TIKA Note - title and author go in metadata, but much of the + * rest is only in the text. Some of the ID3v2 parts + * (composer, lyrics) are not yet implemented. 
+ * * @author Roy Wetherall */ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter @@ -91,7 +95,8 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter reader.getContent(tempFile); // Create the MP3 object from the file - MP3File mp3File = new MP3File(tempFile); + // Open it read only as we won't make any changes + MP3File mp3File = new MP3File(tempFile, false); ID3v1 id3v1 = mp3File.getID3v1Tag(); if (id3v1 != null) @@ -141,6 +146,24 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter } } + } + catch(Exception e) + { + if (logger.isDebugEnabled()) + { + logger.debug( + "MP3 Metadata extraction failed: \n" + + " Content: " + reader, + e); + } + else + { + logger.warn( + "MP3 Metadata extraction failed (turn on DEBUG for full error): \n" + + " Content: " + reader + "\n" + + " Failure: " + e.getMessage()); + } + } finally { @@ -167,16 +190,22 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter private String getDescription(Map props) { StringBuilder result = new StringBuilder(); - if (props.get(KEY_SONG_TITLE) != null && props.get(KEY_ARTIST) != null && props.get(KEY_ALBUM_TITLE) != null) + if (props.get(KEY_SONG_TITLE) != null) { - result - .append(props.get(KEY_SONG_TITLE)) + result.append(props.get(KEY_SONG_TITLE)); + if (props.get(KEY_ALBUM_TITLE) != null) + { + result .append(" - ") - .append(props.get(KEY_ALBUM_TITLE)) + .append(props.get(KEY_ALBUM_TITLE)); + } + if (props.get(KEY_ARTIST) != null) + { + result .append(" (") .append(props.get(KEY_ARTIST)) .append(")"); - + } } return result.toString(); diff --git a/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracterTest.java new file mode 100644 index 0000000000..b0314ed198 --- /dev/null +++ b/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracterTest.java @@ -0,0 +1,120 @@ +/* + * Copyright (C) 2005 Jesper 
Steen Møller + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + * As a special exception to the terms and conditions of version 2.0 of + * the GPL, you may redistribute this Program in connection with Free/Libre + * and Open Source Software ("FLOSS") applications as described in Alfresco's + * FLOSS exception. You should have recieved a copy of the text describing + * the FLOSS exception, and it is also available here: + * http://www.alfresco.com/legal/licensing" + */ +package org.alfresco.repo.content.metadata; + +import java.io.Serializable; +import java.util.Map; + +import org.alfresco.model.ContentModel; +import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; +import org.alfresco.service.namespace.QName; + +/** + * Test for the MP3 metadata extraction from id3 tags. 
+ */ +public class MP3MetadataExtracterTest extends AbstractMetadataExtracterTest +{ + private MP3MetadataExtracter extracter; + private static final String ARTIST = "Hauskaz"; + + @Override + public void setUp() throws Exception + { + super.setUp(); + extracter = new MP3MetadataExtracter(); + extracter.setDictionaryService(dictionaryService); + extracter.register(); + } + + /** + * @return Returns the same transformer regardless - it is allowed + */ + protected MetadataExtracter getExtracter() + { + return extracter; + } + + public void testSupports() throws Exception + { + for (String mimetype : MP3MetadataExtracter.SUPPORTED_MIMETYPES) + { + boolean supports = extracter.isSupported(mimetype); + assertTrue("Mimetype should be supported: " + mimetype, supports); + } + } + + public void testMP3Extraction() throws Exception + { + testExtractFromMimetype(MimetypeMap.MIMETYPE_MP3); + } + + /** + * We don't have quite the usual metadata. Tests the descriptions one. + * Other tests in {@link #testFileSpecificMetadata(String, Map)} + */ + protected void testCommonMetadata(String mimetype, Map properties) { + // Title is as normal + assertEquals( + "Property " + ContentModel.PROP_TITLE + " not found for mimetype " + mimetype, + QUICK_TITLE, + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_TITLE))); + // Has Author, not Creator, and is different + assertEquals( + "Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype, + "Hauskaz", + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR))); + + // Description is a composite + assertContains( + "Property " + ContentModel.PROP_DESCRIPTION + " didn't contain " + QUICK_TITLE + " for mimetype " + mimetype, + QUICK_TITLE, + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION))); + // Check rest of it later + } + + /** + * Tests for various MP3 specific bits of metadata + */ + public void 
testFileSpecificMetadata(String mimetype, Map properties) { + QName songTitle = QName.createQName("music","songTitle"); + assertEquals( + "Property " + songTitle + " not found for mimetype " + mimetype, + QUICK_TITLE, + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(songTitle))); + + QName songArtist = QName.createQName("music","artist"); + assertEquals( + "Property " + songArtist + " not found for mimetype " + mimetype, + ARTIST, + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(songArtist))); + + // Description is a composite - check the artist part + assertContains( + "Property " + ContentModel.PROP_DESCRIPTION + " didn't contain " + ARTIST + " for mimetype " + mimetype, + ARTIST, + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION))); + } +} diff --git a/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java index b0661970a8..290328bafa 100644 --- a/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java @@ -27,20 +27,13 @@ package org.alfresco.repo.content.metadata; import java.io.IOException; import java.io.InputStream; import java.io.Serializable; -import java.util.ArrayList; import java.util.Arrays; -import java.util.Calendar; import java.util.HashSet; -import java.util.List; import java.util.Map; import org.alfresco.repo.content.MimetypeMap; -import org.alfresco.service.cmr.repository.ContentIOException; import org.alfresco.service.cmr.repository.ContentReader; -import org.apache.poi.poifs.eventfilesystem.POIFSReader; -import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent; -import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener; -import org.apache.poi.poifs.filesystem.DocumentInputStream; +import org.apache.poi.hsmf.MAPIMessage; /** * Outlook format email meta-data 
extractor extracting the following values: @@ -52,6 +45,9 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream; * subjectLine: -- cm:subjectline, cm:description * * + * TIKA note - to/cc/bcc go into the html part, not the metadata. + * Also, email addresses not included as yet. + * * @since 2.1 * @author Kevin Roast */ @@ -65,12 +61,6 @@ public class MailMetadataExtracter extends AbstractMappingMetadataExtracter public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_OUTLOOK_MSG}; - private static final String STREAM_PREFIX = "__substg1.0_"; - private static final int STREAM_PREFIX_LENGTH = STREAM_PREFIX.length(); - - // the CC: email addresses - private ThreadLocal> receipientEmails = new ThreadLocal>(); - public MailMetadataExtracter() { super(new HashSet(Arrays.asList(SUPPORTED_MIMETYPES))); @@ -81,37 +71,25 @@ public class MailMetadataExtracter extends AbstractMappingMetadataExtracter { final Map rawProperties = newRawMap(); - POIFSReaderListener readerListener = new POIFSReaderListener() - { - public void processPOIFSReaderEvent(final POIFSReaderEvent event) - { - try - { - if (event.getName().startsWith(STREAM_PREFIX)) - { - StreamHandler handler = new StreamHandler(event.getName(), event.getStream()); - handler.process(rawProperties); - } - } - catch (Exception ex) - { - throw new ContentIOException("Property set stream: " + event.getPath() + event.getName(), ex); - } - } - }; - InputStream is = null; try { - this.receipientEmails.set(new ArrayList()); - is = reader.getContentInputStream(); - POIFSReader poiFSReader = new POIFSReader(); - poiFSReader.registerListener(readerListener); + MAPIMessage msg; try { - poiFSReader.read(is); + msg = new MAPIMessage(is); + msg.setReturnNullOnMissingChunk(true); + + putRawValue(KEY_ORIGINATOR, msg.getDisplayFrom(), rawProperties); + putRawValue(KEY_SUBJECT, msg.getSubject(), rawProperties); + putRawValue(KEY_SENT_DATE, msg.getMessageDate().getTime(), rawProperties); + + // Store the TO, 
but not cc/bcc in the addressee field + putRawValue(KEY_ADDRESSEE, msg.getDisplayTo(), rawProperties); + // But store all email addresses (to/cc/bcc) in the addresses field + putRawValue(KEY_ADDRESSEES, msg.getRecipientEmailAddressList(), rawProperties); } catch (IOException err) { @@ -119,12 +97,6 @@ public class MailMetadataExtracter extends AbstractMappingMetadataExtracter if (logger.isWarnEnabled()) logger.warn("Unable to extract meta-data from message: " + err.getMessage()); } - - // store multi-value extracted property - if (this.receipientEmails.get().size() != 0) - { - putRawValue(KEY_ADDRESSEES, (Serializable)receipientEmails.get(), rawProperties); - } } finally { @@ -136,162 +108,4 @@ public class MailMetadataExtracter extends AbstractMappingMetadataExtracter // Done return rawProperties; } - - private static String convertExchangeAddress(String email) - { - if (email.lastIndexOf("/CN=") == -1) - { - return email; - } - else - { - // found a full Exchange format To header - return email.substring(email.lastIndexOf("/CN=") + 4); - } - } - - private static final String ENCODING_TEXT = "001E"; - private static final String ENCODING_BINARY = "0102"; - private static final String ENCODING_UNICODE = "001F"; - - @SuppressWarnings("unused") - private static final String SUBSTG_MESSAGEBODY = "1000"; - private static final String SUBSTG_RECIPIENTEMAIL = "39FE"; // 7bit email address - private static final String SUBSTG_RECIPIENTSEARCH = "300B"; // address 'search' variant - private static final String SUBSTG_RECEIVEDEMAIL = "0076"; - private static final String SUBSTG_SENDEREMAIL = "0C1F"; - private static final String SUBSTG_DATE = "0047"; - private static final String SUBSTG_SUBJECT = "0037"; - - /** - * Class to handle stream types. Can process and extract specific streams. 
- */ - private class StreamHandler - { - StreamHandler(String name, DocumentInputStream stream) - { - this.type = name.substring(STREAM_PREFIX_LENGTH, STREAM_PREFIX_LENGTH + 4); - this.encoding = name.substring(STREAM_PREFIX_LENGTH + 4, STREAM_PREFIX_LENGTH + 8); - this.stream = stream; - } - - void process(final Map destination) - throws IOException - { - if (type.equals(SUBSTG_SENDEREMAIL)) - { - putRawValue(KEY_ORIGINATOR, convertExchangeAddress(extractText()), destination); - } - else if (type.equals(SUBSTG_RECIPIENTEMAIL)) - { - receipientEmails.get().add(convertExchangeAddress(extractText())); - } - else if (type.equals(SUBSTG_RECIPIENTSEARCH)) - { - String email = extractText(ENCODING_TEXT); - int smptIndex = email.indexOf("SMTP:"); - if (smptIndex != -1) - { - /* also may be used for SUBSTG_RECIPIENTTRANSPORT = "5FF7"; - with search for SMPT followed by a null char */ - - // this is a secondary mechanism for encoding a receipient email address - // the 7 bit email address may not have been set by Outlook - so this is needed instead - // handle null character at end of string - int endIndex = email.length(); - if (email.codePointAt(email.length() - 1) == 0) - { - endIndex--; - } - email = email.substring(smptIndex + 5, endIndex); - receipientEmails.get().add(email); - } - } - else if (type.equals(SUBSTG_RECEIVEDEMAIL)) - { - putRawValue(KEY_ADDRESSEE, convertExchangeAddress(extractText()), destination); - } - else if (type.equals(SUBSTG_SUBJECT)) - { - putRawValue(KEY_SUBJECT, extractText(), destination); - } - else if (type.equals(SUBSTG_DATE)) - { - // the date is not "really" plain text - but it's appropriate to parse as such - String date = extractText(ENCODING_TEXT); - int valueIndex = date.indexOf("l="); - if (valueIndex != -1) - { - int dateIndex = date.indexOf('-', valueIndex); - if (dateIndex != -1) - { - dateIndex++; - final Calendar c = Calendar.getInstance(); - String strYear = date.substring(dateIndex, dateIndex + 2); - c.set(Calendar.YEAR, 
Integer.parseInt(strYear) + (2000 - 1900)); - String strMonth = date.substring(dateIndex + 2, dateIndex + 4); - c.set(Calendar.MONTH, Integer.parseInt(strMonth) - 1); - String strDay = date.substring(dateIndex + 4, dateIndex + 6); - c.set(Calendar.DAY_OF_MONTH, Integer.parseInt(strDay)); - String strHour = date.substring(dateIndex + 6, dateIndex + 8); - c.set(Calendar.HOUR, Integer.parseInt(strHour)); - String strMinute = date.substring(dateIndex + 10, dateIndex + 12); - c.set(Calendar.MINUTE, Integer.parseInt(strMinute)); - c.set(Calendar.SECOND, 0); - putRawValue(KEY_SENT_DATE, c.getTime(), destination); - } - } - } - } - - /** - * Extract the text from the stream based on the encoding - * - * @return String - * - * @throws IOException - */ - private String extractText() - throws IOException - { - return extractText(this.encoding); - } - - /** - * Extract the text from the stream based on the encoding - * - * @return String - * - * @throws IOException - */ - private String extractText(String encoding) - throws IOException - { - byte[] data = new byte[stream.available()]; - stream.read(data); - - if (encoding.equals(ENCODING_TEXT) || encoding.equals(ENCODING_BINARY)) - { - return new String(data); - } - else if (encoding.equals(ENCODING_UNICODE)) - { - // convert double-byte encoding to single byte for String conversion - byte[] b = new byte[data.length >> 1]; - for (int i=0; i properties) { assertEquals( "Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype, - "KEVIN.ROAST@BEN", + "Kevin Roast", DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR))); assertEquals( "Property " + ContentModel.PROP_DESCRIPTION + " not found for mimetype " + mimetype, "Test the content transformer", DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION))); } + + /** + * Test the outlook specific bits + */ + protected void testFileSpecificMetadata(String mimetype, + Map 
properties) { + // Sent Date + assertEquals( + "Property " + ContentModel.PROP_SENTDATE + " not found for mimetype " + mimetype, + "2007-06-14T09:42:55.000+01:00", + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_SENTDATE))); + + // Addressee + assertEquals( + "Property " + ContentModel.PROP_ADDRESSEE + " not found for mimetype " + mimetype, + "Kevin Roast", + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_ADDRESSEE))); + + // Addressees + Collection addressees = (Collection)properties.get(ContentModel.PROP_ADDRESSEES); + assertTrue( + "Property " + ContentModel.PROP_ADDRESSEES + " not found for mimetype " + mimetype, + addressees != null + ); + assertEquals( + "Property " + ContentModel.PROP_ADDRESSEES + " wrong size for mimetype " + mimetype, + 1, + addressees.size()); + assertEquals( + "Property " + ContentModel.PROP_ADDRESSEES + " wrong content for mimetype " + mimetype, + "kevin.roast@alfresco.org", + DefaultTypeConverter.INSTANCE.convert(String.class, addressees.iterator().next())); + + // Subject Line + assertEquals( + "Property " + ContentModel.PROP_SUBJECT + " not found for mimetype " + mimetype, + "Test the content transformer", + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_SUBJECT))); + } } diff --git a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java index 9b878b7917..b3223ba2fe 100644 --- a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java @@ -62,6 +62,9 @@ import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener; * wordCount: * * + * TIKA Note - everything we currently have should be present + * in the metadata. 
+ * * @author Jesper Steen Møller * @author Derek Hulley */ diff --git a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracterTest.java index 2108e45e2f..f62ddc14de 100644 --- a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracterTest.java @@ -1,5 +1,16 @@ package org.alfresco.repo.content.metadata; +import java.io.Serializable; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.alfresco.model.ContentModel; +import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; +import org.alfresco.service.namespace.QName; + /** * @see OfficeMetadataExtracter @@ -9,6 +20,11 @@ package org.alfresco.repo.content.metadata; public class OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest { private OfficeMetadataExtracter extracter; + + private static final QName WORD_COUNT_TEST_PROPERTY = + QName.createQName("WordCountTest"); + private static final QName LAST_AUTHOR_TEST_PROPERTY = + QName.createQName("LastAuthorTest"); @Override public void setUp() throws Exception @@ -17,6 +33,22 @@ public class OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest extracter = new OfficeMetadataExtracter(); extracter.setDictionaryService(dictionaryService); extracter.register(); + + // Attach a couple of extra mappings + // These will be tested later + HashMap> newMap = new HashMap>( + extracter.getMapping() + ); + + Set wcSet = new HashSet(); + wcSet.add(WORD_COUNT_TEST_PROPERTY); + newMap.put( OfficeMetadataExtracter.KEY_WORD_COUNT, wcSet ); + + Set laSet = new HashSet(); + laSet.add(LAST_AUTHOR_TEST_PROPERTY); + newMap.put( OfficeMetadataExtracter.KEY_LAST_AUTHOR, laSet ); + + extracter.setMapping(newMap); } /** @@ -46,4 +78,78 @@ public class 
OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest testExtractFromMimetype(mimetype); } } + + /** + * We support all sorts of extra metadata. Check it all behaves. + */ + public void testFileSpecificMetadata(String mimetype, Map properties) { + // Test the ones with a core alfresco mapping + if(mimetype.equals(MimetypeMap.MIMETYPE_WORD)) { + assertEquals( + "Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype, + "2005-05-26T13:57:00.000+01:00", + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATED))); + assertEquals( + "Property " + ContentModel.PROP_MODIFIED + " not found for mimetype " + mimetype, + "2005-09-20T18:25:00.000+01:00", + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_MODIFIED))); + } else if(mimetype.equals(MimetypeMap.MIMETYPE_EXCEL)) { + assertEquals( + "Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype, + "1996-10-15T00:33:28.000+01:00", + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATED))); + assertEquals( + "Property " + ContentModel.PROP_MODIFIED + " not found for mimetype " + mimetype, + "2005-09-20T19:22:32.000+01:00", + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_MODIFIED))); + } else if(mimetype.equals(MimetypeMap.MIMETYPE_PPT)) { + assertEquals( + "Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype, + "1601-01-01T00:00:00.000Z", // Seriously, that's what the file says! 
+ DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATED))); + assertEquals( + "Property " + ContentModel.PROP_MODIFIED + " not found for mimetype " + mimetype, + "2005-09-20T19:23:41.000+01:00", + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_MODIFIED))); + } + + // Now check the non-standard ones we added in at test time + assertTrue( + "Test Property " + WORD_COUNT_TEST_PROPERTY + " not found for mimetype " + mimetype, + properties.containsKey(WORD_COUNT_TEST_PROPERTY) + ); + assertTrue( + "Test Property " + LAST_AUTHOR_TEST_PROPERTY + " not found for mimetype " + mimetype, + properties.containsKey(LAST_AUTHOR_TEST_PROPERTY) + ); + + if(mimetype.equals(MimetypeMap.MIMETYPE_WORD)) { + assertEquals( + "Test Property " + WORD_COUNT_TEST_PROPERTY + " incorrect for mimetype " + mimetype, + "9", + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(WORD_COUNT_TEST_PROPERTY))); + assertEquals( + "Test Property " + LAST_AUTHOR_TEST_PROPERTY + " incorrect for mimetype " + mimetype, + AbstractMetadataExtracterTest.QUICK_PREVIOUS_AUTHOR, + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(LAST_AUTHOR_TEST_PROPERTY))); + } else if(mimetype.equals(MimetypeMap.MIMETYPE_EXCEL)) { + assertEquals( + "Test Property " + WORD_COUNT_TEST_PROPERTY + " not found for mimetype " + mimetype, + "0", + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(WORD_COUNT_TEST_PROPERTY))); + assertEquals( + "Test Property " + LAST_AUTHOR_TEST_PROPERTY + " not found for mimetype " + mimetype, + AbstractMetadataExtracterTest.QUICK_PREVIOUS_AUTHOR, + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(LAST_AUTHOR_TEST_PROPERTY))); + } else if(mimetype.equals(MimetypeMap.MIMETYPE_PPT)) { + assertEquals( + "Test Property " + WORD_COUNT_TEST_PROPERTY + " not found for mimetype " + mimetype, + "9", + DefaultTypeConverter.INSTANCE.convert(String.class, 
properties.get(WORD_COUNT_TEST_PROPERTY))); + assertEquals( + "Test Property " + LAST_AUTHOR_TEST_PROPERTY + " not found for mimetype " + mimetype, + AbstractMetadataExtracterTest.QUICK_PREVIOUS_AUTHOR, + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(LAST_AUTHOR_TEST_PROPERTY))); + } + } } diff --git a/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java index bc6d78dd8a..29d23eabef 100644 --- a/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java @@ -60,6 +60,10 @@ import com.catcode.odf.OpenDocumentMetadata; * All user properties * * + * TIKA Note - not all of the metadata is currently + * extracted. Will probably need to add some more + * support to TIKA. + * * @author Antti Jokipii * @author Derek Hulley */ diff --git a/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracterTest.java index a2760b0845..a30cf1c678 100644 --- a/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracterTest.java @@ -1,5 +1,12 @@ package org.alfresco.repo.content.metadata; +import java.io.Serializable; +import java.util.Map; + +import org.alfresco.model.ContentModel; +import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; +import org.alfresco.service.namespace.QName; + /** * @see OpenDocumentMetadataExtracter @@ -46,4 +53,25 @@ public class OpenDocumentMetadataExtracterTest extends AbstractMetadataExtracter testExtractFromMimetype(mimetype); } } + protected boolean skipAuthorCheck() { return true; } + + /** + * We also provide the creation date - check that + */ + protected void 
testFileSpecificMetadata(String mimetype, + Map properties) { + // Check for two cases + if(mimetype.equals("application/vnd.oasis.opendocument.text")) { + assertEquals( + "Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype, + "2005-09-06T23:34:00.000+01:00", + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATED))); + } else if(mimetype.equals("application/vnd.oasis.opendocument.graphics")) { + assertEquals( + "Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype, + "2006-01-27T11:46:11.000Z", + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATED))); + } + } + } diff --git a/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracter.java index 3bd70b27e7..05313616e3 100644 --- a/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracter.java @@ -39,6 +39,9 @@ import org.springframework.extensions.surf.util.PropertyCheck; * description: -- cm:description * * + * TIKA Note - this probably won't be ported to TIKA. There's currently + * no support for these old formats in tika. 
+ * * @author Jesper Steen Møller */ public class OpenOfficeMetadataExtracter extends AbstractMappingMetadataExtracter implements OpenOfficeMetadataWorker diff --git a/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracterTest.java index cb48728bcc..bd3f33bab8 100644 --- a/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracterTest.java @@ -22,8 +22,17 @@ * http://www.alfresco.com/legal/licensing" */ package org.alfresco.repo.content.metadata; +import java.io.Serializable; +import java.util.Map; + +import org.alfresco.service.namespace.QName; + /** + * Note - this test can sometimes fail if run on its own, as there + * can be a race condition with the OO process. Try running it as + * part of a suite if so, that normally seems to fix it! + * * @author Jesper Steen Møller */ public class OpenOfficeMetadataExtracterTest extends AbstractMetadataExtracterTest @@ -77,4 +86,18 @@ public class OpenOfficeMetadataExtracterTest extends AbstractMetadataExtracterTe testExtractFromMimetype(mimetype); } } + + /** + * Only run the check if we have a connection + * to an OpenOffice instance + */ + protected void testCommonMetadata(String mimetype, + Map properties) { + if(extracter.isConnected()) { + super.testCommonMetadata(mimetype, properties); + } + } + + /** Extractor only does the usual basic three properties */ + public void testFileSpecificMetadata(String mimetype, Map properties) {} } diff --git a/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracter.java index 7bbecaee14..1483380b45 100644 --- a/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracter.java @@ -50,6 +50,9 
@@ import org.apache.pdfbox.pdmodel.PDDocumentInformation; * created: -- cm:created * * + * TIKA Note - all the fields (plus a few others) are present + * in the tika metadata. + * * @author Jesper Steen Møller * @author Derek Hulley */ @@ -95,6 +98,10 @@ public class PdfBoxMetadataExtracter extends AbstractMappingMetadataExtracter Calendar created = docInfo.getCreationDate(); if (created != null) { + // Work around https://issues.apache.org/jira/browse/PDFBOX-598 + created.set(Calendar.MILLISECOND, 0); + + // Save putRawValue(KEY_CREATED, created.getTime(), rawProperties); } } diff --git a/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracterTest.java index 0d1329c63c..9abdeffc27 100644 --- a/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracterTest.java @@ -1,6 +1,14 @@ package org.alfresco.repo.content.metadata; +import java.io.Serializable; +import java.util.Calendar; +import java.util.Map; + +import org.alfresco.model.ContentModel; import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; +import org.alfresco.service.namespace.QName; +import org.apache.pdfbox.util.DateConverter; /** * @see org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter @@ -41,4 +49,29 @@ public class PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest { testExtractFromMimetype(MimetypeMap.MIMETYPE_PDF); } + + /** + * We can also return a created date + */ + protected void testFileSpecificMetadata(String mimetype, + Map properties) { + assertEquals( + "Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype, + "2005-05-26T20:52:58.000+01:00", + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATED))); + } + + /** + * Test that will show when the 
workaround is in place. + */ + public void testDateConversion() throws Exception { + Calendar c = DateConverter.toCalendar("D:20050526205258+01'00'"); + assertEquals(2005, c.get(Calendar.YEAR)); + assertEquals(05-1, c.get(Calendar.MONTH)); + assertEquals(26, c.get(Calendar.DAY_OF_MONTH)); + assertEquals(20, c.get(Calendar.HOUR_OF_DAY)); + assertEquals(52, c.get(Calendar.MINUTE)); + assertEquals(58, c.get(Calendar.SECOND)); + //assertEquals(0, c.get(Calendar.MILLISECOND)); + } } diff --git a/source/java/org/alfresco/repo/content/metadata/RFC822MetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/RFC822MetadataExtracter.java index 36ed2b6909..5ec75083d4 100755 --- a/source/java/org/alfresco/repo/content/metadata/RFC822MetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/RFC822MetadataExtracter.java @@ -45,29 +45,33 @@ import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.namespace.QName; /** - * Metadata extractor for the PDF documents. + * Metadata extractor for RFC822 mime emails. *
- *   messageFrom:              --      imap:messageFrom
+ *   messageFrom:              --      imap:messageFrom, cm:originator
  *   messageTo:                --      imap:messageTo
  *   messageCc:                --      imap:messageCc
- *   messageSubject:           --      imap:messageSubject, cm:title, cm:description
- *   messageSent:              --      imap:dateSent
+ *   messageSubject:           --      imap:messageSubject, cm:title, cm:description, cm:subjectline
+ *   messageSent:              --      imap:dateSent, cm:sentdate
  *   All {@link Header#getName() header names}:
  *      Thread-Index:          --      imap:threadIndex
  *      Message-ID:            --      imap:messageId
  *      date:                  --      imap:dateReceived
  * 
+ * TIKA Note - Tika does not currently extract To or Cc, and its
+ *  date handling isn't great. Thread index is also missing, and
+ *  arbitrary headers don't seem to be supported.
+ * 
  * @author Derek Hulley
  * @since 3.2
  */
 public class RFC822MetadataExtracter extends AbstractMappingMetadataExtracter
 {
 
-    private static final String KEY_MESSAGE_FROM = "messageFrom";
-    private static final String KEY_MESSAGE_TO = "messageTo";
-    private static final String KEY_MESSAGE_CC = "messageCc";
-    private static final String KEY_MESSAGE_SUBJECT = "messageSubject";
-    private static final String KEY_MESSAGE_SENT = "messageSent";
+    protected static final String KEY_MESSAGE_FROM = "messageFrom";
+    protected static final String KEY_MESSAGE_TO = "messageTo";
+    protected static final String KEY_MESSAGE_CC = "messageCc";
+    protected static final String KEY_MESSAGE_SUBJECT = "messageSubject";
+    protected static final String KEY_MESSAGE_SENT = "messageSent";
 
     public static String[] SUPPORTED_MIMETYPES = new String[] { MimetypeMap.MIMETYPE_RFC822 };
 
diff --git a/source/java/org/alfresco/repo/content/metadata/RFC822MetadataExtracter.properties b/source/java/org/alfresco/repo/content/metadata/RFC822MetadataExtracter.properties
index e8e4e9b401..4291afc3e5 100755
--- a/source/java/org/alfresco/repo/content/metadata/RFC822MetadataExtracter.properties
+++ b/source/java/org/alfresco/repo/content/metadata/RFC822MetadataExtracter.properties
@@ -9,14 +9,14 @@ namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
 # Mappings
 
 #Default values that doesn't match to Header
-messageFrom=imap:messageFrom
+messageFrom=imap:messageFrom, cm:originator
 messageTo=imap:messageTo
 messageCc=imap:messageCc
-messageSubject=imap:messageSubject, cm:title, cm:description
-messageSent=imap:dateSent
+messageSubject=imap:messageSubject, cm:title, cm:description, cm:subjectline
+messageSent=imap:dateSent, cm:sentdate
 
 
 #Add here any values you want to extract. Use Header name for key.
 Thread-Index=imap:threadIndex
 Message-ID=imap:messageId
-Date=imap:dateReceived
\ No newline at end of file
+Date=imap:dateReceived
diff --git a/source/java/org/alfresco/repo/content/metadata/RFC822MetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/RFC822MetadataExtracterTest.java
new file mode 100644
index 0000000000..924bb5bbbc
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/metadata/RFC822MetadataExtracterTest.java
@@ -0,0 +1,198 @@
+/*
+ * Copyright (C) 2005 Jesper Steen Møller
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+ * As a special exception to the terms and conditions of version 2.0 of 
+ * the GPL, you may redistribute this Program in connection with Free/Libre 
+ * and Open Source Software ("FLOSS") applications as described in Alfresco's 
+ * FLOSS exception.  You should have recieved a copy of the text describing 
+ * the FLOSS exception, and it is also available here: 
+ * http://www.alfresco.com/legal/licensing"
+ */
+package org.alfresco.repo.content.metadata;
+
+import java.io.Serializable;
+import java.lang.reflect.Field;
+import java.text.DateFormat;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.alfresco.model.ContentModel;
+import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
+import org.alfresco.service.namespace.QName;
+
+/**
+ * Test for the RFC822 (imap/mbox) extractor.
+ */
+public class RFC822MetadataExtracterTest extends AbstractMetadataExtracterTest
+{
+    /**
+     * Address expected in the sample message. From must equal cm:originator (same
+     * raw key); To and Cc are assumed identical - TODO confirm against the test file.
+     */
+    private static final String QUICK_ADDRESS = QUICK_CREATOR + " <" + QUICK_CREATOR_EMAIL + ">";
+
+    // Test-only target properties, attached to the address keys in setUp()
+    private static final QName MESSAGE_FROM_TEST_PROPERTY =
+            QName.createQName("MessageFromTest");
+    private static final QName MESSAGE_TO_TEST_PROPERTY =
+            QName.createQName("MessageToTest");
+    private static final QName MESSAGE_CC_TEST_PROPERTY =
+            QName.createQName("MessageCCTest");
+
+    private RFC822MetadataExtracter extracter;
+
+    @Override
+    public void setUp() throws Exception
+    {
+        super.setUp();
+
+        // Ask Spring for the extractor, so it
+        //  gets its date formats populated
+        extracter = (RFC822MetadataExtracter)ctx.getBean("extracter.RFC822");
+
+        // Attach a couple of extra mappings
+        // These will be tested later
+        HashMap<String, Set<QName>> newMap = new HashMap<String, Set<QName>>(
+              extracter.getMapping()
+        );
+        addTestMapping(newMap, RFC822MetadataExtracter.KEY_MESSAGE_FROM, MESSAGE_FROM_TEST_PROPERTY);
+        addTestMapping(newMap, RFC822MetadataExtracter.KEY_MESSAGE_TO, MESSAGE_TO_TEST_PROPERTY);
+        addTestMapping(newMap, RFC822MetadataExtracter.KEY_MESSAGE_CC, MESSAGE_CC_TEST_PROPERTY);
+
+        extracter.setMapping(newMap);
+    }
+
+    /**
+     * Maps key to testProperty plus the properties the extracter currently maps it to.
+     */
+    private void addTestMapping(Map<String, Set<QName>> mapping, String key, QName testProperty)
+    {
+        Set<QName> targets = new HashSet<QName>();
+        targets.add(testProperty);
+        targets.addAll( extracter.getCurrentMapping().get(key) );
+        mapping.put(key, targets);
+    }
+
+    /**
+     * @return Returns the same extracter regardless - it is allowed
+     */
+    protected MetadataExtracter getExtracter()
+    {
+        return extracter;
+    }
+
+    /**
+     * RFC822 has a non-standard date format.
+     * Check that this was sprung-in - if not, then
+     *  other tests will fail!
+     */
+    public void testHasDateFormats() throws Exception {
+        // Read the field reflectively from the direct superclass of the extracter
+        Field sdf = RFC822MetadataExtracter.class.getSuperclass().
+                          getDeclaredField("supportedDateFormats");
+        sdf.setAccessible(true);
+        @SuppressWarnings("unchecked")
+        Set<DateFormat> supportedDateFormats = (Set<DateFormat>)sdf.get(extracter);
+
+        // A missing set is as broken as an empty one, so report both the same way
+        if(supportedDateFormats == null || supportedDateFormats.isEmpty()) {
+            fail("No supportedDateFormats injected into RFC822MetadataExtracter - " +
+                  "spring setup broken and date parsing will break all of the extraction process");
+        }
+    }
+
+    /** Every mimetype the extracter advertises must be reported as supported. */
+    public void testSupports() throws Exception
+    {
+        for (String mimetype : RFC822MetadataExtracter.SUPPORTED_MIMETYPES)
+        {
+            boolean supports = extracter.isSupported(mimetype);
+            assertTrue("Mimetype should be supported: " + mimetype, supports);
+        }
+    }
+
+    /** Runs the base class's extraction test for the RFC822 mimetype. */
+    public void testEmailExtraction() throws Exception
+    {
+        testExtractFromMimetype(MimetypeMap.MIMETYPE_RFC822);
+    }
+
+    /**
+     * We have no author, and have the same title and description
+     */
+    @Override
+    protected void testCommonMetadata(String mimetype,
+         Map<QName, Serializable> properties) {
+        assertEquals(
+              "Property " + ContentModel.PROP_TITLE + " not found for mimetype " + mimetype,
+              QUICK_TITLE,
+              DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_TITLE)));
+        assertEquals(
+              "Property " + ContentModel.PROP_DESCRIPTION + " not found for mimetype " + mimetype,
+              QUICK_TITLE,
+              DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION)));
+    }
+
+    /**
+     * Test the extra cm: properties, and the test-only properties mapped from
+     * the address keys in setUp(). The imap: properties are not asserted here.
+     */
+    @Override
+    public void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties) {
+        // Check the other cm: ones
+        assertEquals(
+              "Property " + ContentModel.PROP_ORIGINATOR + " not found for mimetype " + mimetype,
+              QUICK_ADDRESS,
+              DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_ORIGINATOR)));
+        // NOTE(review): the +01:00 offset presumably ties this to the JVM default time zone - confirm
+        assertEquals(
+              "Property " + ContentModel.PROP_SENTDATE + " not found for mimetype " + mimetype,
+              "2004-06-04T13:23:22.000+01:00",
+              DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_SENTDATE)));
+
+        // Check our non-standard ones we added in at test time are present ...
+        assertTrue(
+              "Test Property " + MESSAGE_FROM_TEST_PROPERTY + " not found for mimetype " + mimetype,
+              properties.containsKey(MESSAGE_FROM_TEST_PROPERTY)
+        );
+        assertTrue(
+              "Test Property " + MESSAGE_TO_TEST_PROPERTY + " not found for mimetype " + mimetype,
+              properties.containsKey(MESSAGE_TO_TEST_PROPERTY)
+        );
+        assertTrue(
+              "Test Property " + MESSAGE_CC_TEST_PROPERTY + " not found for mimetype " + mimetype,
+              properties.containsKey(MESSAGE_CC_TEST_PROPERTY)
+        );
+
+        // ... and carry the expected address
+        assertEquals(
+              "Test Property " + MESSAGE_FROM_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
+              QUICK_ADDRESS,
+              DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(MESSAGE_FROM_TEST_PROPERTY)));
+        assertEquals(
+              "Test Property " + MESSAGE_TO_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
+              QUICK_ADDRESS,
+              DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(MESSAGE_TO_TEST_PROPERTY)));
+        assertEquals(
+              "Test Property " + MESSAGE_CC_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
+              QUICK_ADDRESS,
+              DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(MESSAGE_CC_TEST_PROPERTY)));
+    }
+}