mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Update metadata extractors - Outlook, MP3, Mail and PDF improvements, and increase test coverage
git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@18454 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -787,6 +787,12 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
||||
propertyTypeDef,
|
||||
(Collection) propertyValue);
|
||||
}
|
||||
else if (propertyValue instanceof Object[])
|
||||
{
|
||||
convertedPropertyValue = (Serializable) DefaultTypeConverter.INSTANCE.convert(
|
||||
propertyTypeDef,
|
||||
(Object[]) propertyValue);
|
||||
}
|
||||
else
|
||||
{
|
||||
convertedPropertyValue = (Serializable) DefaultTypeConverter.INSTANCE.convert(
|
||||
|
@@ -56,11 +56,16 @@ import org.springframework.context.ApplicationContext;
|
||||
*/
|
||||
public abstract class AbstractMetadataExtracterTest extends TestCase
|
||||
{
|
||||
// Turn off lazy loading on the context helper. This initializer is declared
// ahead of the static ctx field, so it runs before the application context is fetched.
static {
|
||||
ApplicationContextHelper.setUseLazyLoading(false);
|
||||
}
|
||||
protected static ApplicationContext ctx = ApplicationContextHelper.getApplicationContext();
|
||||
|
||||
protected static final String QUICK_TITLE = "The quick brown fox jumps over the lazy dog";
|
||||
protected static final String QUICK_DESCRIPTION = "Gym class featuring a brown fox and lazy dog";
|
||||
protected static final String QUICK_CREATOR = "Nevin Nollop";
|
||||
protected static final String QUICK_CREATOR_EMAIL = "nevin.nollop@alfresco.com";
|
||||
protected static final String QUICK_PREVIOUS_AUTHOR = "Derek Hulley";
|
||||
|
||||
protected MimetypeMap mimetypeMap;
|
||||
protected DictionaryService dictionaryService;
|
||||
@@ -97,12 +102,16 @@ public abstract class AbstractMetadataExtracterTest extends TestCase
|
||||
try
|
||||
{
|
||||
Map<QName, Serializable> properties = extractFromMimetype(mimetype);
|
||||
// check
|
||||
// check common metadata
|
||||
testCommonMetadata(mimetype, properties);
|
||||
// check file-type specific metadata
|
||||
testFileSpecificMetadata(mimetype, properties);
|
||||
}
|
||||
catch (FileNotFoundException e)
|
||||
{
|
||||
// The test file is not there. We won't fail it.
|
||||
System.err.println("No test file found for mime type " + mimetype +
|
||||
", skipping extraction test - " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -127,17 +136,46 @@ public abstract class AbstractMetadataExtracterTest extends TestCase
|
||||
return properties;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that we can get the common metadata correctly
|
||||
* from the file.
|
||||
* You only need to override this if your test data file
|
||||
* doesn't have the usual Nevin Nollop/quick brown fox
|
||||
* data in it.
|
||||
*/
|
||||
protected void testCommonMetadata(String mimetype, Map<QName, Serializable> properties)
|
||||
{
|
||||
assertEquals(
|
||||
// One of Creator or Author
|
||||
if(!skipAuthorCheck()) {
|
||||
if(properties.containsKey(ContentModel.PROP_CREATOR)) {
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_CREATOR + " not found for mimetype " + mimetype,
|
||||
QUICK_CREATOR,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATOR)));
|
||||
} else if(properties.containsKey(ContentModel.PROP_AUTHOR)) {
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
|
||||
QUICK_CREATOR,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
|
||||
} else {
|
||||
fail("Expected on Property out of " + ContentModel.PROP_CREATOR + " and " +
|
||||
ContentModel.PROP_AUTHOR + " but found neither of them.");
|
||||
}
|
||||
}
|
||||
|
||||
// Title and description
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_TITLE + " not found for mimetype " + mimetype,
|
||||
QUICK_TITLE,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_TITLE)));
|
||||
assertEquals(
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_DESCRIPTION + " not found for mimetype " + mimetype,
|
||||
QUICK_DESCRIPTION,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION)));
|
||||
}
|
||||
/**
 * Hook for checking metadata that is specific to the file type under test.
 * The extraction test calls this straight after
 * {@link #testCommonMetadata(String, Map)}, with the same mimetype and
 * the same extracted properties.
 *
 * @param mimetype   the mimetype whose test file was extracted
 * @param properties the properties produced by the extraction
 */
protected abstract void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties);
|
||||
/**
 * Tells {@link #testCommonMetadata(String, Map)} whether to leave out its
 * creator/author assertion. Returns false here, so the assertion runs
 * unless a subclass overrides this method.
 */
protected boolean skipAuthorCheck() { return false; }
|
||||
|
||||
|
||||
public void testZeroLengthFile() throws Exception
|
||||
{
|
||||
@@ -163,4 +201,15 @@ public abstract class AbstractMetadataExtracterTest extends TestCase
|
||||
assertEquals("There should not be any new properties", 0, properties.size());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
protected void assertContains(String message, String needle, String haystack) {
|
||||
if(haystack.indexOf(needle) > -1) {
|
||||
return;
|
||||
}
|
||||
fail(message);
|
||||
}
|
||||
protected void assertContains(String needle, String haystack) {
|
||||
assertContains("'" + needle + "' wasn't found in '" + haystack + "'", needle, haystack);
|
||||
}
|
||||
}
|
||||
|
@@ -49,6 +49,10 @@ import org.alfresco.service.cmr.repository.ContentReader;
|
||||
* <b>description:</b> -- cm:description
|
||||
* </pre>
|
||||
*
|
||||
* TIKA note - all metadata will be present, but will need to
|
||||
* search for the variant names ourselves as tika puts them
|
||||
* in as-is.
|
||||
*
|
||||
* @author Jesper Steen Møller
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
|
@@ -24,7 +24,11 @@
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
|
||||
/**
|
||||
* @author Jesper Steen Møller
|
||||
@@ -63,4 +67,7 @@ public class HtmlMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
testExtractFromMimetype(MimetypeMap.MIMETYPE_HTML);
|
||||
}
|
||||
|
||||
/**
 * Intentionally empty: the extractor only does the usual basic three
 * properties, so there is nothing file-specific left to check here.
 */
|
||||
public void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties) {}
|
||||
}
|
||||
|
@@ -57,6 +57,10 @@ import org.farng.mp3.lyrics3.Lyrics3v2Field;
|
||||
* <b>lyrics:</b> -- {music}lyrics
|
||||
* </pre>
|
||||
*
|
||||
* TIKA Note - title and author go in metadata, but much of the
|
||||
* rest is only in the text. Some of the ID3v2 parts
|
||||
* (composer, lyrics) are not yet implemented.
|
||||
*
|
||||
* @author Roy Wetherall
|
||||
*/
|
||||
public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
@@ -91,7 +95,8 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
reader.getContent(tempFile);
|
||||
|
||||
// Create the MP3 object from the file
|
||||
MP3File mp3File = new MP3File(tempFile);
|
||||
// Open it read only as we won't make any changes
|
||||
MP3File mp3File = new MP3File(tempFile, false);
|
||||
|
||||
ID3v1 id3v1 = mp3File.getID3v1Tag();
|
||||
if (id3v1 != null)
|
||||
@@ -141,6 +146,24 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
catch(Exception e)
|
||||
{
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug(
|
||||
"MP3 Metadata extraction failed: \n" +
|
||||
" Content: " + reader,
|
||||
e);
|
||||
}
|
||||
else
|
||||
{
|
||||
logger.warn(
|
||||
"MP3 Metadata extraction failed (turn on DEBUG for full error): \n" +
|
||||
" Content: " + reader + "\n" +
|
||||
" Failure: " + e.getMessage());
|
||||
}
|
||||
|
||||
}
|
||||
finally
|
||||
{
|
||||
@@ -167,16 +190,22 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
private String getDescription(Map<String, Serializable> props)
|
||||
{
|
||||
StringBuilder result = new StringBuilder();
|
||||
if (props.get(KEY_SONG_TITLE) != null && props.get(KEY_ARTIST) != null && props.get(KEY_ALBUM_TITLE) != null)
|
||||
if (props.get(KEY_SONG_TITLE) != null)
|
||||
{
|
||||
result
|
||||
.append(props.get(KEY_SONG_TITLE))
|
||||
result.append(props.get(KEY_SONG_TITLE));
|
||||
if (props.get(KEY_ALBUM_TITLE) != null)
|
||||
{
|
||||
result
|
||||
.append(" - ")
|
||||
.append(props.get(KEY_ALBUM_TITLE))
|
||||
.append(props.get(KEY_ALBUM_TITLE));
|
||||
}
|
||||
if (props.get(KEY_ARTIST) != null)
|
||||
{
|
||||
result
|
||||
.append(" (")
|
||||
.append(props.get(KEY_ARTIST))
|
||||
.append(")");
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
|
@@ -0,0 +1,120 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen Møller
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
* As a special exception to the terms and conditions of version 2.0 of
|
||||
* the GPL, you may redistribute this Program in connection with Free/Libre
|
||||
* and Open Source Software ("FLOSS") applications as described in Alfresco's
|
||||
* FLOSS exception. You should have received a copy of the text describing
|
||||
* the FLOSS exception, and it is also available here:
|
||||
* http://www.alfresco.com/legal/licensing"
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
|
||||
/**
|
||||
* Test for the MP3 metadata extraction from id3 tags.
|
||||
*/
|
||||
public class MP3MetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
private MP3MetadataExtracter extracter;
|
||||
private static final String ARTIST = "Hauskaz";
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception
|
||||
{
|
||||
super.setUp();
|
||||
extracter = new MP3MetadataExtracter();
|
||||
extracter.setDictionaryService(dictionaryService);
|
||||
extracter.register();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the same transformer regardless - it is allowed
|
||||
*/
|
||||
protected MetadataExtracter getExtracter()
|
||||
{
|
||||
return extracter;
|
||||
}
|
||||
|
||||
public void testSupports() throws Exception
|
||||
{
|
||||
for (String mimetype : MP3MetadataExtracter.SUPPORTED_MIMETYPES)
|
||||
{
|
||||
boolean supports = extracter.isSupported(mimetype);
|
||||
assertTrue("Mimetype should be supported: " + mimetype, supports);
|
||||
}
|
||||
}
|
||||
|
||||
public void testMP3Extraction() throws Exception
|
||||
{
|
||||
testExtractFromMimetype(MimetypeMap.MIMETYPE_MP3);
|
||||
}
|
||||
|
||||
/**
|
||||
* We don't have quite the usual metadata. Tests the descriptions one.
|
||||
* Other tests in {@link #testFileSpecificMetadata(String, Map)}
|
||||
*/
|
||||
protected void testCommonMetadata(String mimetype, Map<QName, Serializable> properties) {
|
||||
// Title is as normal
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_TITLE + " not found for mimetype " + mimetype,
|
||||
QUICK_TITLE,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_TITLE)));
|
||||
// Has Author, not Creator, and is different
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
|
||||
"Hauskaz",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
|
||||
|
||||
// Description is a composite
|
||||
assertContains(
|
||||
"Property " + ContentModel.PROP_DESCRIPTION + " didn't contain " + QUICK_TITLE + " for mimetype " + mimetype,
|
||||
QUICK_TITLE,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION)));
|
||||
// Check rest of it later
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests for various MP3 specific bits of metadata
|
||||
*/
|
||||
public void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties) {
|
||||
QName songTitle = QName.createQName("music","songTitle");
|
||||
assertEquals(
|
||||
"Property " + songTitle + " not found for mimetype " + mimetype,
|
||||
QUICK_TITLE,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(songTitle)));
|
||||
|
||||
QName songArtist = QName.createQName("music","artist");
|
||||
assertEquals(
|
||||
"Property " + songArtist + " not found for mimetype " + mimetype,
|
||||
ARTIST,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(songArtist)));
|
||||
|
||||
// Description is a composite - check the artist part
|
||||
assertContains(
|
||||
"Property " + ContentModel.PROP_DESCRIPTION + " didn't contain " + ARTIST + " for mimetype " + mimetype,
|
||||
ARTIST,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION)));
|
||||
}
|
||||
}
|
@@ -27,20 +27,13 @@ package org.alfresco.repo.content.metadata;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Calendar;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
|
||||
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
|
||||
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
|
||||
import org.apache.poi.poifs.filesystem.DocumentInputStream;
|
||||
import org.apache.poi.hsmf.MAPIMessage;
|
||||
|
||||
/**
|
||||
* Outlook format email meta-data extractor extracting the following values:
|
||||
@@ -52,6 +45,9 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream;
|
||||
* <b>subjectLine:</b> -- cm:subjectline, cm:description
|
||||
* </pre>
|
||||
*
|
||||
* TIKA note - to/cc/bcc go into the html part, not the metadata.
|
||||
* Also, email addresses not included as yet.
|
||||
*
|
||||
* @since 2.1
|
||||
* @author Kevin Roast
|
||||
*/
|
||||
@@ -65,12 +61,6 @@ public class MailMetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
|
||||
public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_OUTLOOK_MSG};
|
||||
|
||||
private static final String STREAM_PREFIX = "__substg1.0_";
|
||||
private static final int STREAM_PREFIX_LENGTH = STREAM_PREFIX.length();
|
||||
|
||||
// the CC: email addresses
|
||||
private ThreadLocal<List<String>> receipientEmails = new ThreadLocal<List<String>>();
|
||||
|
||||
public MailMetadataExtracter()
|
||||
{
|
||||
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
|
||||
@@ -81,37 +71,25 @@ public class MailMetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
{
|
||||
final Map<String, Serializable> rawProperties = newRawMap();
|
||||
|
||||
POIFSReaderListener readerListener = new POIFSReaderListener()
|
||||
{
|
||||
public void processPOIFSReaderEvent(final POIFSReaderEvent event)
|
||||
{
|
||||
try
|
||||
{
|
||||
if (event.getName().startsWith(STREAM_PREFIX))
|
||||
{
|
||||
StreamHandler handler = new StreamHandler(event.getName(), event.getStream());
|
||||
handler.process(rawProperties);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
throw new ContentIOException("Property set stream: " + event.getPath() + event.getName(), ex);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
InputStream is = null;
|
||||
try
|
||||
{
|
||||
this.receipientEmails.set(new ArrayList<String>());
|
||||
|
||||
is = reader.getContentInputStream();
|
||||
POIFSReader poiFSReader = new POIFSReader();
|
||||
poiFSReader.registerListener(readerListener);
|
||||
MAPIMessage msg;
|
||||
|
||||
try
|
||||
{
|
||||
poiFSReader.read(is);
|
||||
msg = new MAPIMessage(is);
|
||||
msg.setReturnNullOnMissingChunk(true);
|
||||
|
||||
putRawValue(KEY_ORIGINATOR, msg.getDisplayFrom(), rawProperties);
|
||||
putRawValue(KEY_SUBJECT, msg.getSubject(), rawProperties);
|
||||
putRawValue(KEY_SENT_DATE, msg.getMessageDate().getTime(), rawProperties);
|
||||
|
||||
// Store the TO, but not cc/bcc in the addressee field
|
||||
putRawValue(KEY_ADDRESSEE, msg.getDisplayTo(), rawProperties);
|
||||
// But store all email addresses (to/cc/bcc) in the addresses field
|
||||
putRawValue(KEY_ADDRESSEES, msg.getRecipientEmailAddressList(), rawProperties);
|
||||
}
|
||||
catch (IOException err)
|
||||
{
|
||||
@@ -119,12 +97,6 @@ public class MailMetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
if (logger.isWarnEnabled())
|
||||
logger.warn("Unable to extract meta-data from message: " + err.getMessage());
|
||||
}
|
||||
|
||||
// store multi-value extracted property
|
||||
if (this.receipientEmails.get().size() != 0)
|
||||
{
|
||||
putRawValue(KEY_ADDRESSEES, (Serializable)receipientEmails.get(), rawProperties);
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
@@ -136,162 +108,4 @@ public class MailMetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
// Done
|
||||
return rawProperties;
|
||||
}
|
||||
|
||||
private static String convertExchangeAddress(String email)
|
||||
{
|
||||
if (email.lastIndexOf("/CN=") == -1)
|
||||
{
|
||||
return email;
|
||||
}
|
||||
else
|
||||
{
|
||||
// found a full Exchange format To header
|
||||
return email.substring(email.lastIndexOf("/CN=") + 4);
|
||||
}
|
||||
}
|
||||
|
||||
private static final String ENCODING_TEXT = "001E";
|
||||
private static final String ENCODING_BINARY = "0102";
|
||||
private static final String ENCODING_UNICODE = "001F";
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
private static final String SUBSTG_MESSAGEBODY = "1000";
|
||||
private static final String SUBSTG_RECIPIENTEMAIL = "39FE"; // 7bit email address
|
||||
private static final String SUBSTG_RECIPIENTSEARCH = "300B"; // address 'search' variant
|
||||
private static final String SUBSTG_RECEIVEDEMAIL = "0076";
|
||||
private static final String SUBSTG_SENDEREMAIL = "0C1F";
|
||||
private static final String SUBSTG_DATE = "0047";
|
||||
private static final String SUBSTG_SUBJECT = "0037";
|
||||
|
||||
/**
|
||||
* Class to handle stream types. Can process and extract specific streams.
|
||||
*/
|
||||
private class StreamHandler
|
||||
{
|
||||
StreamHandler(String name, DocumentInputStream stream)
|
||||
{
|
||||
this.type = name.substring(STREAM_PREFIX_LENGTH, STREAM_PREFIX_LENGTH + 4);
|
||||
this.encoding = name.substring(STREAM_PREFIX_LENGTH + 4, STREAM_PREFIX_LENGTH + 8);
|
||||
this.stream = stream;
|
||||
}
|
||||
|
||||
void process(final Map<String, Serializable> destination)
|
||||
throws IOException
|
||||
{
|
||||
if (type.equals(SUBSTG_SENDEREMAIL))
|
||||
{
|
||||
putRawValue(KEY_ORIGINATOR, convertExchangeAddress(extractText()), destination);
|
||||
}
|
||||
else if (type.equals(SUBSTG_RECIPIENTEMAIL))
|
||||
{
|
||||
receipientEmails.get().add(convertExchangeAddress(extractText()));
|
||||
}
|
||||
else if (type.equals(SUBSTG_RECIPIENTSEARCH))
|
||||
{
|
||||
String email = extractText(ENCODING_TEXT);
|
||||
int smptIndex = email.indexOf("SMTP:");
|
||||
if (smptIndex != -1)
|
||||
{
|
||||
/* also may be used for SUBSTG_RECIPIENTTRANSPORT = "5FF7";
|
||||
with search for SMPT followed by a null char */
|
||||
|
||||
// this is a secondary mechanism for encoding a receipient email address
|
||||
// the 7 bit email address may not have been set by Outlook - so this is needed instead
|
||||
// handle null character at end of string
|
||||
int endIndex = email.length();
|
||||
if (email.codePointAt(email.length() - 1) == 0)
|
||||
{
|
||||
endIndex--;
|
||||
}
|
||||
email = email.substring(smptIndex + 5, endIndex);
|
||||
receipientEmails.get().add(email);
|
||||
}
|
||||
}
|
||||
else if (type.equals(SUBSTG_RECEIVEDEMAIL))
|
||||
{
|
||||
putRawValue(KEY_ADDRESSEE, convertExchangeAddress(extractText()), destination);
|
||||
}
|
||||
else if (type.equals(SUBSTG_SUBJECT))
|
||||
{
|
||||
putRawValue(KEY_SUBJECT, extractText(), destination);
|
||||
}
|
||||
else if (type.equals(SUBSTG_DATE))
|
||||
{
|
||||
// the date is not "really" plain text - but it's appropriate to parse as such
|
||||
String date = extractText(ENCODING_TEXT);
|
||||
int valueIndex = date.indexOf("l=");
|
||||
if (valueIndex != -1)
|
||||
{
|
||||
int dateIndex = date.indexOf('-', valueIndex);
|
||||
if (dateIndex != -1)
|
||||
{
|
||||
dateIndex++;
|
||||
final Calendar c = Calendar.getInstance();
|
||||
String strYear = date.substring(dateIndex, dateIndex + 2);
|
||||
c.set(Calendar.YEAR, Integer.parseInt(strYear) + (2000 - 1900));
|
||||
String strMonth = date.substring(dateIndex + 2, dateIndex + 4);
|
||||
c.set(Calendar.MONTH, Integer.parseInt(strMonth) - 1);
|
||||
String strDay = date.substring(dateIndex + 4, dateIndex + 6);
|
||||
c.set(Calendar.DAY_OF_MONTH, Integer.parseInt(strDay));
|
||||
String strHour = date.substring(dateIndex + 6, dateIndex + 8);
|
||||
c.set(Calendar.HOUR, Integer.parseInt(strHour));
|
||||
String strMinute = date.substring(dateIndex + 10, dateIndex + 12);
|
||||
c.set(Calendar.MINUTE, Integer.parseInt(strMinute));
|
||||
c.set(Calendar.SECOND, 0);
|
||||
putRawValue(KEY_SENT_DATE, c.getTime(), destination);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the text from the stream based on the encoding
|
||||
*
|
||||
* @return String
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
private String extractText()
|
||||
throws IOException
|
||||
{
|
||||
return extractText(this.encoding);
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the text from the stream based on the encoding
|
||||
*
|
||||
* @return String
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
private String extractText(String encoding)
|
||||
throws IOException
|
||||
{
|
||||
byte[] data = new byte[stream.available()];
|
||||
stream.read(data);
|
||||
|
||||
if (encoding.equals(ENCODING_TEXT) || encoding.equals(ENCODING_BINARY))
|
||||
{
|
||||
return new String(data);
|
||||
}
|
||||
else if (encoding.equals(ENCODING_UNICODE))
|
||||
{
|
||||
// convert double-byte encoding to single byte for String conversion
|
||||
byte[] b = new byte[data.length >> 1];
|
||||
for (int i=0; i<b.length; i++)
|
||||
{
|
||||
b[i] = data[i << 1];
|
||||
}
|
||||
return new String(b);
|
||||
}
|
||||
else
|
||||
{
|
||||
return new String(data);
|
||||
}
|
||||
}
|
||||
|
||||
private String type;
|
||||
private String encoding;
|
||||
private DocumentInputStream stream;
|
||||
}
|
||||
}
|
||||
|
@@ -25,6 +25,7 @@
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
@@ -71,15 +72,58 @@ public class MailMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
testExtractFromMimetype(MimetypeMap.MIMETYPE_OUTLOOK_MSG);
|
||||
}
|
||||
|
||||
/**
|
||||
* We have different things to normal, so
|
||||
* do our own common tests.
|
||||
*/
|
||||
protected void testCommonMetadata(String mimetype, Map<QName, Serializable> properties)
|
||||
{
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
|
||||
"KEVIN.ROAST@BEN",
|
||||
"Kevin Roast",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_DESCRIPTION + " not found for mimetype " + mimetype,
|
||||
"Test the content transformer",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the outlook specific bits
|
||||
*/
|
||||
protected void testFileSpecificMetadata(String mimetype,
|
||||
Map<QName, Serializable> properties) {
|
||||
// Sent Date
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_SENTDATE + " not found for mimetype " + mimetype,
|
||||
"2007-06-14T09:42:55.000+01:00",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_SENTDATE)));
|
||||
|
||||
// Addressee
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_ADDRESSEE + " not found for mimetype " + mimetype,
|
||||
"Kevin Roast",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_ADDRESSEE)));
|
||||
|
||||
// Addressees
|
||||
Collection<String> addressees = (Collection<String>)properties.get(ContentModel.PROP_ADDRESSEES);
|
||||
assertTrue(
|
||||
"Property " + ContentModel.PROP_ADDRESSEES + " not found for mimetype " + mimetype,
|
||||
addressees != null
|
||||
);
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_ADDRESSEES + " wrong size for mimetype " + mimetype,
|
||||
1,
|
||||
addressees.size());
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_ADDRESSEES + " wrong content for mimetype " + mimetype,
|
||||
"kevin.roast@alfresco.org",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, addressees.iterator().next()));
|
||||
|
||||
// Subject Line
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_SUBJECT + " not found for mimetype " + mimetype,
|
||||
"Test the content transformer",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_SUBJECT)));
|
||||
}
|
||||
}
|
||||
|
@@ -62,6 +62,9 @@ import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
|
||||
* <b>wordCount:</b>
|
||||
* </pre>
|
||||
*
|
||||
* TIKA Note - everything we currently have should be present
|
||||
* in the metadata.
|
||||
*
|
||||
* @author Jesper Steen Møller
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
|
@@ -1,5 +1,16 @@
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
|
||||
|
||||
/**
|
||||
* @see OfficeMetadataExtracter
|
||||
@@ -9,6 +20,11 @@ package org.alfresco.repo.content.metadata;
|
||||
public class OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
private OfficeMetadataExtracter extracter;
|
||||
|
||||
private static final QName WORD_COUNT_TEST_PROPERTY =
|
||||
QName.createQName("WordCountTest");
|
||||
private static final QName LAST_AUTHOR_TEST_PROPERTY =
|
||||
QName.createQName("LastAuthorTest");
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception
|
||||
@@ -17,6 +33,22 @@ public class OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
extracter = new OfficeMetadataExtracter();
|
||||
extracter.setDictionaryService(dictionaryService);
|
||||
extracter.register();
|
||||
|
||||
// Attach a couple of extra mappings
|
||||
// These will be tested later
|
||||
HashMap<String, Set<QName>> newMap = new HashMap<String, Set<QName>>(
|
||||
extracter.getMapping()
|
||||
);
|
||||
|
||||
Set<QName> wcSet = new HashSet<QName>();
|
||||
wcSet.add(WORD_COUNT_TEST_PROPERTY);
|
||||
newMap.put( OfficeMetadataExtracter.KEY_WORD_COUNT, wcSet );
|
||||
|
||||
Set<QName> laSet = new HashSet<QName>();
|
||||
laSet.add(LAST_AUTHOR_TEST_PROPERTY);
|
||||
newMap.put( OfficeMetadataExtracter.KEY_LAST_AUTHOR, laSet );
|
||||
|
||||
extracter.setMapping(newMap);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -46,4 +78,78 @@ public class OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
testExtractFromMimetype(mimetype);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* We support all sorts of extra metadata. Check it all behaves.
|
||||
*/
|
||||
public void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties) {
|
||||
// Test the ones with a core alfresco mapping
|
||||
if(mimetype.equals(MimetypeMap.MIMETYPE_WORD)) {
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype,
|
||||
"2005-05-26T13:57:00.000+01:00",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATED)));
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_MODIFIED + " not found for mimetype " + mimetype,
|
||||
"2005-09-20T18:25:00.000+01:00",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_MODIFIED)));
|
||||
} else if(mimetype.equals(MimetypeMap.MIMETYPE_EXCEL)) {
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype,
|
||||
"1996-10-15T00:33:28.000+01:00",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATED)));
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_MODIFIED + " not found for mimetype " + mimetype,
|
||||
"2005-09-20T19:22:32.000+01:00",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_MODIFIED)));
|
||||
} else if(mimetype.equals(MimetypeMap.MIMETYPE_PPT)) {
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype,
|
||||
"1601-01-01T00:00:00.000Z", // Seriously, that's what the file says!
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATED)));
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_MODIFIED + " not found for mimetype " + mimetype,
|
||||
"2005-09-20T19:23:41.000+01:00",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_MODIFIED)));
|
||||
}
|
||||
|
||||
// Now check the non-standard ones we added in at test time
|
||||
assertTrue(
|
||||
"Test Property " + WORD_COUNT_TEST_PROPERTY + " not found for mimetype " + mimetype,
|
||||
properties.containsKey(WORD_COUNT_TEST_PROPERTY)
|
||||
);
|
||||
assertTrue(
|
||||
"Test Property " + LAST_AUTHOR_TEST_PROPERTY + " not found for mimetype " + mimetype,
|
||||
properties.containsKey(LAST_AUTHOR_TEST_PROPERTY)
|
||||
);
|
||||
|
||||
if(mimetype.equals(MimetypeMap.MIMETYPE_WORD)) {
|
||||
assertEquals(
|
||||
"Test Property " + WORD_COUNT_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
|
||||
"9",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(WORD_COUNT_TEST_PROPERTY)));
|
||||
assertEquals(
|
||||
"Test Property " + LAST_AUTHOR_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
|
||||
AbstractMetadataExtracterTest.QUICK_PREVIOUS_AUTHOR,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(LAST_AUTHOR_TEST_PROPERTY)));
|
||||
} else if(mimetype.equals(MimetypeMap.MIMETYPE_EXCEL)) {
|
||||
assertEquals(
|
||||
"Test Property " + WORD_COUNT_TEST_PROPERTY + " not found for mimetype " + mimetype,
|
||||
"0",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(WORD_COUNT_TEST_PROPERTY)));
|
||||
assertEquals(
|
||||
"Test Property " + LAST_AUTHOR_TEST_PROPERTY + " not found for mimetype " + mimetype,
|
||||
AbstractMetadataExtracterTest.QUICK_PREVIOUS_AUTHOR,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(LAST_AUTHOR_TEST_PROPERTY)));
|
||||
} else if(mimetype.equals(MimetypeMap.MIMETYPE_PPT)) {
|
||||
assertEquals(
|
||||
"Test Property " + WORD_COUNT_TEST_PROPERTY + " not found for mimetype " + mimetype,
|
||||
"9",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(WORD_COUNT_TEST_PROPERTY)));
|
||||
assertEquals(
|
||||
"Test Property " + LAST_AUTHOR_TEST_PROPERTY + " not found for mimetype " + mimetype,
|
||||
AbstractMetadataExtracterTest.QUICK_PREVIOUS_AUTHOR,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(LAST_AUTHOR_TEST_PROPERTY)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -60,6 +60,10 @@ import com.catcode.odf.OpenDocumentMetadata;
|
||||
* <b>All user properties</b>
|
||||
* </pre>
|
||||
*
|
||||
* TIKA Note - not all of the metadata is currently
|
||||
* extracted. Will probably need to add some more
|
||||
* support to TIKA.
|
||||
*
|
||||
* @author Antti Jokipii
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
|
@@ -1,5 +1,12 @@
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
|
||||
|
||||
/**
|
||||
* @see OpenDocumentMetadataExtracter
|
||||
@@ -46,4 +53,25 @@ public class OpenDocumentMetadataExtracterTest extends AbstractMetadataExtracter
|
||||
testExtractFromMimetype(mimetype);
|
||||
}
|
||||
}
|
||||
/** Always returns true. NOTE(review): presumably tells the base test to skip its author assertion - confirm against AbstractMetadataExtracterTest. */
protected boolean skipAuthorCheck() { return true; }
|
||||
|
||||
/**
|
||||
* We also provide the creation date - check that
|
||||
*/
|
||||
protected void testFileSpecificMetadata(String mimetype,
|
||||
Map<QName, Serializable> properties) {
|
||||
// Check for two cases
|
||||
if(mimetype.equals("application/vnd.oasis.opendocument.text")) {
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype,
|
||||
"2005-09-06T23:34:00.000+01:00",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATED)));
|
||||
} else if(mimetype.equals("application/vnd.oasis.opendocument.graphics")) {
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype,
|
||||
"2006-01-27T11:46:11.000Z",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATED)));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -39,6 +39,9 @@ import org.springframework.extensions.surf.util.PropertyCheck;
|
||||
* <b>description:</b> -- cm:description
|
||||
* </pre>
|
||||
*
|
||||
* TIKA Note - this probably won't be ported to TIKA. There's currently
|
||||
* no support for these old formats in tika.
|
||||
*
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
public class OpenOfficeMetadataExtracter extends AbstractMappingMetadataExtracter implements OpenOfficeMetadataWorker
|
||||
|
@@ -22,8 +22,17 @@
|
||||
* http://www.alfresco.com/legal/licensing" */
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.service.namespace.QName;
|
||||
|
||||
|
||||
/**
|
||||
* Note - this test can sometimes fail if run on its own, as there
|
||||
* can be a race condition with the OO process. Try running it as
|
||||
* part of a suite if so, that normally seems to fix it!
|
||||
*
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
public class OpenOfficeMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
@@ -77,4 +86,18 @@ public class OpenOfficeMetadataExtracterTest extends AbstractMetadataExtracterTe
|
||||
testExtractFromMimetype(mimetype);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Only run the check if we have a connection
|
||||
* to an OpenOffice instance
|
||||
*/
|
||||
protected void testCommonMetadata(String mimetype,
|
||||
Map<QName, Serializable> properties) {
|
||||
if(extracter.isConnected()) {
|
||||
super.testCommonMetadata(mimetype, properties);
|
||||
}
|
||||
}
|
||||
|
||||
/** Extractor only does the usual basic three properties */
|
||||
public void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties) {}
|
||||
}
|
||||
|
@@ -50,6 +50,9 @@ import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
||||
* <b>created:</b> -- cm:created
|
||||
* </pre>
|
||||
*
|
||||
* TIKA Note - all the fields (plus a few others) are present
|
||||
* in the tika metadata.
|
||||
*
|
||||
* @author Jesper Steen Møller
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
@@ -95,6 +98,10 @@ public class PdfBoxMetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
Calendar created = docInfo.getCreationDate();
|
||||
if (created != null)
|
||||
{
|
||||
// Work around https://issues.apache.org/jira/browse/PDFBOX-598
|
||||
created.set(Calendar.MILLISECOND, 0);
|
||||
|
||||
// Save
|
||||
putRawValue(KEY_CREATED, created.getTime(), rawProperties);
|
||||
}
|
||||
}
|
||||
|
@@ -1,6 +1,14 @@
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Calendar;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.apache.pdfbox.util.DateConverter;
|
||||
|
||||
/**
|
||||
* @see org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter
|
||||
@@ -41,4 +49,29 @@ public class PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
testExtractFromMimetype(MimetypeMap.MIMETYPE_PDF);
|
||||
}
|
||||
|
||||
/**
|
||||
* We can also return a created date
|
||||
*/
|
||||
protected void testFileSpecificMetadata(String mimetype,
|
||||
Map<QName, Serializable> properties) {
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype,
|
||||
"2005-05-26T20:52:58.000+01:00",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATED)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that will show when the workaround is in place.
|
||||
*/
|
||||
public void testDateConversion() throws Exception {
|
||||
Calendar c = DateConverter.toCalendar("D:20050526205258+01'00'");
|
||||
assertEquals(2005, c.get(Calendar.YEAR));
|
||||
assertEquals(05-1, c.get(Calendar.MONTH));
|
||||
assertEquals(26, c.get(Calendar.DAY_OF_MONTH));
|
||||
assertEquals(20, c.get(Calendar.HOUR_OF_DAY));
|
||||
assertEquals(52, c.get(Calendar.MINUTE));
|
||||
assertEquals(58, c.get(Calendar.SECOND));
|
||||
//assertEquals(0, c.get(Calendar.MILLISECOND));
|
||||
}
|
||||
}
|
||||
|
@@ -45,29 +45,33 @@ import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
|
||||
/**
|
||||
* Metadata extractor for the PDF documents.
|
||||
* Metadata extractor for RFC822 mime emails.
|
||||
* <pre>
|
||||
* <b>messageFrom:</b> -- imap:messageFrom
|
||||
* <b>messageFrom:</b> -- imap:messageFrom, cm:originator
|
||||
* <b>messageTo:</b> -- imap:messageTo
|
||||
* <b>messageCc:</b> -- imap:messageCc
|
||||
* <b>messageSubject:</b> -- imap:messageSubject, cm:title, cm:description
|
||||
* <b>messageSent:</b> -- imap:dateSent
|
||||
* <b>messageSubject:</b> -- imap:messageSubject, cm:title, cm:description, cm:subjectline
|
||||
* <b>messageSent:</b> -- imap:dateSent, cm:sentdate
|
||||
* <b>All <code>{@link Header#getName() header names}:</b>
|
||||
* <b>Thread-Index:</b> -- imap:threadIndex
|
||||
* <b>Message-ID:</b> -- imap:messageId
|
||||
* <b>date:</b> -- imap:dateReceived
|
||||
*
|
||||
* TIKA Note - to and cc are missing, and date stuff isn't
|
||||
* great. Thread index is missing, and arbitrary headers
|
||||
* don't seem to be supported
|
||||
*
|
||||
* @author Derek Hulley
|
||||
* @since 3.2
|
||||
*/
|
||||
public class RFC822MetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
{
|
||||
|
||||
private static final String KEY_MESSAGE_FROM = "messageFrom";
|
||||
private static final String KEY_MESSAGE_TO = "messageTo";
|
||||
private static final String KEY_MESSAGE_CC = "messageCc";
|
||||
private static final String KEY_MESSAGE_SUBJECT = "messageSubject";
|
||||
private static final String KEY_MESSAGE_SENT = "messageSent";
|
||||
protected static final String KEY_MESSAGE_FROM = "messageFrom";
|
||||
protected static final String KEY_MESSAGE_TO = "messageTo";
|
||||
protected static final String KEY_MESSAGE_CC = "messageCc";
|
||||
protected static final String KEY_MESSAGE_SUBJECT = "messageSubject";
|
||||
protected static final String KEY_MESSAGE_SENT = "messageSent";
|
||||
|
||||
public static String[] SUPPORTED_MIMETYPES = new String[] { MimetypeMap.MIMETYPE_RFC822 };
|
||||
|
||||
|
@@ -9,14 +9,14 @@ namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
# Mappings
|
||||
|
||||
#Default values that don't match a header name
|
||||
messageFrom=imap:messageFrom
|
||||
messageFrom=imap:messageFrom, cm:originator
|
||||
messageTo=imap:messageTo
|
||||
messageCc=imap:messageCc
|
||||
messageSubject=imap:messageSubject, cm:title, cm:description
|
||||
messageSent=imap:dateSent
|
||||
messageSubject=imap:messageSubject, cm:title, cm:description, cm:subjectline
|
||||
messageSent=imap:dateSent, cm:sentdate
|
||||
|
||||
|
||||
#Add here any values you want to extract. Use Header name for key.
|
||||
Thread-Index=imap:threadIndex
|
||||
Message-ID=imap:messageId
|
||||
Date=imap:dateReceived
|
||||
Date=imap:dateReceived
|
||||
|
@@ -0,0 +1,198 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen Møller
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
* As a special exception to the terms and conditions of version 2.0 of
|
||||
* the GPL, you may redistribute this Program in connection with Free/Libre
|
||||
* and Open Source Software ("FLOSS") applications as described in Alfresco's
|
||||
* FLOSS exception.  You should have received a copy of the text describing
|
||||
* the FLOSS exception, and it is also available here:
|
||||
* http://www.alfresco.com/legal/licensing"
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.lang.reflect.Field;
|
||||
import java.text.DateFormat;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
|
||||
/**
|
||||
* Test for the RFC822 (imap/mbox) extractor
|
||||
*/
|
||||
public class RFC822MetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
private RFC822MetadataExtracter extracter;
|
||||
|
||||
private static final QName MESSAGE_FROM_TEST_PROPERTY =
|
||||
QName.createQName("MessageToTest");
|
||||
private static final QName MESSAGE_TO_TEST_PROPERTY =
|
||||
QName.createQName("MessageFromTest");
|
||||
private static final QName MESSAGE_CC_TEST_PROPERTY =
|
||||
QName.createQName("MessageCCTest");
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception
|
||||
{
|
||||
super.setUp();
|
||||
|
||||
// Ask Spring for the extractor, so it
|
||||
// gets its date formats populated
|
||||
extracter = (RFC822MetadataExtracter)ctx.getBean("extracter.RFC822");
|
||||
|
||||
// Attach a couple of extra mappings
|
||||
// These will be tested later
|
||||
HashMap<String, Set<QName>> newMap = new HashMap<String, Set<QName>>(
|
||||
extracter.getMapping()
|
||||
);
|
||||
|
||||
Set<QName> fromSet = new HashSet<QName>();
|
||||
fromSet.add(MESSAGE_FROM_TEST_PROPERTY);
|
||||
fromSet.addAll( extracter.getCurrentMapping().get(RFC822MetadataExtracter.KEY_MESSAGE_FROM) );
|
||||
newMap.put( RFC822MetadataExtracter.KEY_MESSAGE_FROM, fromSet );
|
||||
|
||||
Set<QName> toSet = new HashSet<QName>();
|
||||
toSet.add(MESSAGE_TO_TEST_PROPERTY);
|
||||
toSet.addAll( extracter.getCurrentMapping().get(RFC822MetadataExtracter.KEY_MESSAGE_TO) );
|
||||
newMap.put( RFC822MetadataExtracter.KEY_MESSAGE_TO, toSet );
|
||||
|
||||
Set<QName> ccSet = new HashSet<QName>();
|
||||
ccSet.add(MESSAGE_CC_TEST_PROPERTY);
|
||||
ccSet.addAll( extracter.getCurrentMapping().get(RFC822MetadataExtracter.KEY_MESSAGE_CC) );
|
||||
newMap.put( RFC822MetadataExtracter.KEY_MESSAGE_CC, ccSet );
|
||||
|
||||
extracter.setMapping(newMap);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the same transformer regardless - it is allowed
|
||||
*/
|
||||
protected MetadataExtracter getExtracter()
|
||||
{
|
||||
return extracter;
|
||||
}
|
||||
|
||||
/**
|
||||
* RFC822 has a non-standard date format.
|
||||
* Check that this was sprung-in - if not, then
|
||||
* other tests will fail!
|
||||
*/
|
||||
public void testHasDateFormats() throws Exception {
|
||||
Set<DateFormat> supportedDateFormats;
|
||||
|
||||
Field sdf = RFC822MetadataExtracter.class.getSuperclass().
|
||||
getDeclaredField("supportedDateFormats");
|
||||
sdf.setAccessible(true);
|
||||
supportedDateFormats = (Set<DateFormat>)sdf.get(extracter);
|
||||
|
||||
if(supportedDateFormats.size() == 0) {
|
||||
fail("No supportedDateFormats injected into RFC822MetadataExtracter - " +
|
||||
"spring setup broken and date parsing will break all of the extraction process");
|
||||
}
|
||||
}
|
||||
|
||||
public void testSupports() throws Exception
|
||||
{
|
||||
for (String mimetype : RFC822MetadataExtracter.SUPPORTED_MIMETYPES)
|
||||
{
|
||||
boolean supports = extracter.isSupported(mimetype);
|
||||
assertTrue("Mimetype should be supported: " + mimetype, supports);
|
||||
}
|
||||
}
|
||||
|
||||
public void testEmailExtraction() throws Exception
|
||||
{
|
||||
testExtractFromMimetype(MimetypeMap.MIMETYPE_RFC822);
|
||||
}
|
||||
|
||||
/**
|
||||
* We have no author, and have the same title and description
|
||||
*/
|
||||
protected void testCommonMetadata(String mimetype,
|
||||
Map<QName, Serializable> properties) {
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_TITLE + " not found for mimetype " + mimetype,
|
||||
QUICK_TITLE,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_TITLE)));
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_DESCRIPTION + " not found for mimetype " + mimetype,
|
||||
QUICK_TITLE,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test our extra IMAP properties
|
||||
*/
|
||||
public void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties) {
|
||||
// Check the other cm: ones
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_ORIGINATOR + " not found for mimetype " + mimetype,
|
||||
QUICK_CREATOR + " <" + QUICK_CREATOR_EMAIL + ">",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_ORIGINATOR)));
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_SENTDATE + " not found for mimetype " + mimetype,
|
||||
"2004-06-04T13:23:22.000+01:00",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_SENTDATE)));
|
||||
|
||||
// Check some imap: ones
|
||||
assertEquals(
|
||||
"Test Property " + MESSAGE_FROM_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
|
||||
"Nevin Nollop <nevin.nollop@alfresco.com>",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(MESSAGE_FROM_TEST_PROPERTY)));
|
||||
assertEquals(
|
||||
"Test Property " + MESSAGE_FROM_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
|
||||
"Nevin Nollop <nevin.nollop@alfresco.com>",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(MESSAGE_FROM_TEST_PROPERTY)));
|
||||
assertEquals(
|
||||
"Test Property " + MESSAGE_TO_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
|
||||
"Nevin Nollop <nevin.nollop@alfresco.com>",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(MESSAGE_TO_TEST_PROPERTY)));
|
||||
|
||||
// Finally check our non-standard ones we added in at test time
|
||||
assertTrue(
|
||||
"Test Property " + MESSAGE_FROM_TEST_PROPERTY + " not found for mimetype " + mimetype,
|
||||
properties.containsKey(MESSAGE_FROM_TEST_PROPERTY)
|
||||
);
|
||||
assertTrue(
|
||||
"Test Property " + MESSAGE_TO_TEST_PROPERTY + " not found for mimetype " + mimetype,
|
||||
properties.containsKey(MESSAGE_TO_TEST_PROPERTY)
|
||||
);
|
||||
assertTrue(
|
||||
"Test Property " + MESSAGE_CC_TEST_PROPERTY + " not found for mimetype " + mimetype,
|
||||
properties.containsKey(MESSAGE_CC_TEST_PROPERTY)
|
||||
);
|
||||
|
||||
assertEquals(
|
||||
"Test Property " + MESSAGE_FROM_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
|
||||
"Nevin Nollop <nevin.nollop@alfresco.com>",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(MESSAGE_FROM_TEST_PROPERTY)));
|
||||
assertEquals(
|
||||
"Test Property " + MESSAGE_TO_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
|
||||
"Nevin Nollop <nevin.nollop@alfresco.com>",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(MESSAGE_TO_TEST_PROPERTY)));
|
||||
assertEquals(
|
||||
"Test Property " + MESSAGE_CC_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
|
||||
"Nevin Nollop <nevinn@alfresco.com>",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(MESSAGE_CC_TEST_PROPERTY)));
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user