Update metadata extractors - Outlook, MP3, Mail and PDF improvements, and increase test coverage

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@18454 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-02-04 14:42:45 +00:00
parent f2554d0f63
commit bd1e3edf76
19 changed files with 707 additions and 225 deletions

View File

@@ -787,6 +787,12 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
propertyTypeDef,
(Collection) propertyValue);
}
else if (propertyValue instanceof Object[])
{
convertedPropertyValue = (Serializable) DefaultTypeConverter.INSTANCE.convert(
propertyTypeDef,
(Object[]) propertyValue);
}
else
{
convertedPropertyValue = (Serializable) DefaultTypeConverter.INSTANCE.convert(

View File

@@ -56,11 +56,16 @@ import org.springframework.context.ApplicationContext;
*/
public abstract class AbstractMetadataExtracterTest extends TestCase
{
static {
ApplicationContextHelper.setUseLazyLoading(false);
}
protected static ApplicationContext ctx = ApplicationContextHelper.getApplicationContext();
protected static final String QUICK_TITLE = "The quick brown fox jumps over the lazy dog";
protected static final String QUICK_DESCRIPTION = "Gym class featuring a brown fox and lazy dog";
protected static final String QUICK_CREATOR = "Nevin Nollop";
protected static final String QUICK_CREATOR_EMAIL = "nevin.nollop@alfresco.com";
protected static final String QUICK_PREVIOUS_AUTHOR = "Derek Hulley";
protected MimetypeMap mimetypeMap;
protected DictionaryService dictionaryService;
@@ -97,12 +102,16 @@ public abstract class AbstractMetadataExtracterTest extends TestCase
try
{
Map<QName, Serializable> properties = extractFromMimetype(mimetype);
// check
// check common metadata
testCommonMetadata(mimetype, properties);
// check file-type specific metadata
testFileSpecificMetadata(mimetype, properties);
}
catch (FileNotFoundException e)
{
// The test file is not there. We won't fail it.
System.err.println("No test file found for mime type " + mimetype +
", skipping extraction test - " + e.getMessage());
}
}
@@ -127,17 +136,46 @@ public abstract class AbstractMetadataExtracterTest extends TestCase
return properties;
}
/**
* Tests that we can get the common metadata correctly
* from the file.
* You only need to override this if your test data file
* doesn't have the usual Nevin Nollop/quick brown fox
* data in it.
*/
protected void testCommonMetadata(String mimetype, Map<QName, Serializable> properties)
{
assertEquals(
// One of Creator or Author
if(!skipAuthorCheck()) {
if(properties.containsKey(ContentModel.PROP_CREATOR)) {
assertEquals(
"Property " + ContentModel.PROP_CREATOR + " not found for mimetype " + mimetype,
QUICK_CREATOR,
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATOR)));
} else if(properties.containsKey(ContentModel.PROP_AUTHOR)) {
assertEquals(
"Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
QUICK_CREATOR,
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
} else {
fail("Expected on Property out of " + ContentModel.PROP_CREATOR + " and " +
ContentModel.PROP_AUTHOR + " but found neither of them.");
}
}
// Title and description
assertEquals(
"Property " + ContentModel.PROP_TITLE + " not found for mimetype " + mimetype,
QUICK_TITLE,
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_TITLE)));
assertEquals(
assertEquals(
"Property " + ContentModel.PROP_DESCRIPTION + " not found for mimetype " + mimetype,
QUICK_DESCRIPTION,
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION)));
}
/**
 * Hook for subclasses to check the metadata that is specific to
 * their file type. It is called with the extracted properties
 * straight after the common metadata has been checked.
 *
 * @param mimetype the mimetype the properties were extracted for
 * @param properties the extracted properties
 */
protected abstract void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties);
/**
 * Tells {@link #testCommonMetadata(String, Map)} whether it should
 * leave out its creator/author assertion.
 *
 * @return <code>false</code> here, so the check is performed unless
 *         a subclass overrides this method
 */
protected boolean skipAuthorCheck()
{
    return false;
}
public void testZeroLengthFile() throws Exception
{
@@ -163,4 +201,15 @@ public abstract class AbstractMetadataExtracterTest extends TestCase
assertEquals("There should not be any new properties", 0, properties.size());
}
}
/**
 * Asserts that <code>haystack</code> contains <code>needle</code>,
 * failing the test with the supplied message when it does not.
 * A <code>null</code> haystack (for example a property that was never
 * extracted) or a <code>null</code> needle is reported as an ordinary
 * test failure rather than surfacing as a NullPointerException.
 *
 * @param message the failure message to report
 * @param needle the text to look for
 * @param haystack the text to search in
 */
protected void assertContains(String message, String needle, String haystack) {
    // Guard the nulls first so a missing value fails with the caller's message
    if (haystack != null && needle != null && haystack.indexOf(needle) > -1) {
        return;
    }
    fail(message);
}
/**
 * Convenience form of the containment check which builds a default
 * failure message quoting both the needle and the haystack, then
 * delegates to the three argument version.
 *
 * @param needle the text to look for
 * @param haystack the text to search in
 */
protected void assertContains(String needle, String haystack) {
    String defaultMessage = "'" + needle + "' wasn't found in '" + haystack + "'";
    assertContains(defaultMessage, needle, haystack);
}
}

View File

@@ -49,6 +49,10 @@ import org.alfresco.service.cmr.repository.ContentReader;
* <b>description:</b> -- cm:description
* </pre>
*
* TIKA note - all of the metadata will be present, but we will
* need to search for the variant names ourselves, as tika puts
* them in as-is.
*
* @author Jesper Steen Møller
* @author Derek Hulley
*/

View File

@@ -24,7 +24,11 @@
*/
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.Map;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.namespace.QName;
/**
* @author Jesper Steen Møller
@@ -63,4 +67,7 @@ public class HtmlMetadataExtracterTest extends AbstractMetadataExtracterTest
{
testExtractFromMimetype(MimetypeMap.MIMETYPE_HTML);
}
/**
 * The extractor only does the usual basic three properties, so
 * there is nothing file specific to check here.
 */
public void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties)
{
    // Intentionally empty
}
}

View File

@@ -57,6 +57,10 @@ import org.farng.mp3.lyrics3.Lyrics3v2Field;
* <b>lyrics:</b> -- {music}lyrics
* </pre>
*
* TIKA Note - title and author go in metadata, but much of the
* rest is only in the text. Some of the ID3v2 parts
* (composer, lyrics) are not yet implemented.
*
* @author Roy Wetherall
*/
public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
@@ -91,7 +95,8 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
reader.getContent(tempFile);
// Create the MP3 object from the file
MP3File mp3File = new MP3File(tempFile);
// Open it read only as we won't make any changes
MP3File mp3File = new MP3File(tempFile, false);
ID3v1 id3v1 = mp3File.getID3v1Tag();
if (id3v1 != null)
@@ -141,6 +146,24 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
}
}
}
catch(Exception e)
{
if (logger.isDebugEnabled())
{
logger.debug(
"MP3 Metadata extraction failed: \n" +
" Content: " + reader,
e);
}
else
{
logger.warn(
"MP3 Metadata extraction failed (turn on DEBUG for full error): \n" +
" Content: " + reader + "\n" +
" Failure: " + e.getMessage());
}
}
finally
{
@@ -167,16 +190,22 @@ public class MP3MetadataExtracter extends AbstractMappingMetadataExtracter
private String getDescription(Map<String, Serializable> props)
{
StringBuilder result = new StringBuilder();
if (props.get(KEY_SONG_TITLE) != null && props.get(KEY_ARTIST) != null && props.get(KEY_ALBUM_TITLE) != null)
if (props.get(KEY_SONG_TITLE) != null)
{
result
.append(props.get(KEY_SONG_TITLE))
result.append(props.get(KEY_SONG_TITLE));
if (props.get(KEY_ALBUM_TITLE) != null)
{
result
.append(" - ")
.append(props.get(KEY_ALBUM_TITLE))
.append(props.get(KEY_ALBUM_TITLE));
}
if (props.get(KEY_ARTIST) != null)
{
result
.append(" (")
.append(props.get(KEY_ARTIST))
.append(")");
}
}
return result.toString();

View File

@@ -0,0 +1,120 @@
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* As a special exception to the terms and conditions of version 2.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* and Open Source Software ("FLOSS") applications as described in Alfresco's
* FLOSS exception. You should have recieved a copy of the text describing
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing"
*/
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.Map;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.namespace.QName;
/**
 * Test for the MP3 metadata extraction from id3 tags.
 * The common checks are replaced because the author and the
 * description of the test file differ from the usual quick
 * brown fox data.
 */
public class MP3MetadataExtracterTest extends AbstractMetadataExtracterTest
{
    /** The artist the assertions below expect for the test file */
    private static final String ARTIST = "Hauskaz";

    private MP3MetadataExtracter extracter;

    @Override
    public void setUp() throws Exception
    {
        super.setUp();
        extracter = new MP3MetadataExtracter();
        extracter.setDictionaryService(dictionaryService);
        extracter.register();
    }

    /**
     * @return Returns the same extracter regardless - it is allowed
     */
    protected MetadataExtracter getExtracter()
    {
        return extracter;
    }

    public void testSupports() throws Exception
    {
        for (String mimetype : MP3MetadataExtracter.SUPPORTED_MIMETYPES)
        {
            boolean supports = extracter.isSupported(mimetype);
            assertTrue("Mimetype should be supported: " + mimetype, supports);
        }
    }

    public void testMP3Extraction() throws Exception
    {
        testExtractFromMimetype(MimetypeMap.MIMETYPE_MP3);
    }

    /**
     * We don't have quite the usual metadata, so the common checks
     * are replaced. Tests the title, the author and the title part
     * of the description.
     * Other tests in {@link #testFileSpecificMetadata(String, Map)}
     */
    @Override
    protected void testCommonMetadata(String mimetype, Map<QName, Serializable> properties) {
        // Title is as normal
        assertEquals(
                "Property " + ContentModel.PROP_TITLE + " not found for mimetype " + mimetype,
                QUICK_TITLE,
                DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_TITLE)));

        // Has Author, not Creator, and is different
        assertEquals(
                "Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
                ARTIST,
                DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));

        // Description is a composite
        assertContains(
                "Property " + ContentModel.PROP_DESCRIPTION + " didn't contain " + QUICK_TITLE + " for mimetype " + mimetype,
                QUICK_TITLE,
                DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION)));

        // Check rest of it later
    }

    /**
     * Tests for various MP3 specific bits of metadata
     */
    @Override
    public void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties) {
        QName songTitle = QName.createQName("music","songTitle");
        assertEquals(
                "Property " + songTitle + " not found for mimetype " + mimetype,
                QUICK_TITLE,
                DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(songTitle)));

        QName songArtist = QName.createQName("music","artist");
        assertEquals(
                "Property " + songArtist + " not found for mimetype " + mimetype,
                ARTIST,
                DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(songArtist)));

        // Description is a composite - check the artist part
        assertContains(
                "Property " + ContentModel.PROP_DESCRIPTION + " didn't contain " + ARTIST + " for mimetype " + mimetype,
                ARTIST,
                DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION)));
    }
}

View File

@@ -27,20 +27,13 @@ package org.alfresco.repo.content.metadata;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.hsmf.MAPIMessage;
/**
* Outlook format email meta-data extractor extracting the following values:
@@ -52,6 +45,9 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream;
* <b>subjectLine:</b> -- cm:subjectline, cm:description
* </pre>
*
* TIKA note - to/cc/bcc go into the html part, not the metadata.
* Also, email addresses not included as yet.
*
* @since 2.1
* @author Kevin Roast
*/
@@ -65,12 +61,6 @@ public class MailMetadataExtracter extends AbstractMappingMetadataExtracter
public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_OUTLOOK_MSG};
private static final String STREAM_PREFIX = "__substg1.0_";
private static final int STREAM_PREFIX_LENGTH = STREAM_PREFIX.length();
// the CC: email addresses
private ThreadLocal<List<String>> receipientEmails = new ThreadLocal<List<String>>();
public MailMetadataExtracter()
{
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
@@ -81,37 +71,25 @@ public class MailMetadataExtracter extends AbstractMappingMetadataExtracter
{
final Map<String, Serializable> rawProperties = newRawMap();
POIFSReaderListener readerListener = new POIFSReaderListener()
{
public void processPOIFSReaderEvent(final POIFSReaderEvent event)
{
try
{
if (event.getName().startsWith(STREAM_PREFIX))
{
StreamHandler handler = new StreamHandler(event.getName(), event.getStream());
handler.process(rawProperties);
}
}
catch (Exception ex)
{
throw new ContentIOException("Property set stream: " + event.getPath() + event.getName(), ex);
}
}
};
InputStream is = null;
try
{
this.receipientEmails.set(new ArrayList<String>());
is = reader.getContentInputStream();
POIFSReader poiFSReader = new POIFSReader();
poiFSReader.registerListener(readerListener);
MAPIMessage msg;
try
{
poiFSReader.read(is);
msg = new MAPIMessage(is);
msg.setReturnNullOnMissingChunk(true);
putRawValue(KEY_ORIGINATOR, msg.getDisplayFrom(), rawProperties);
putRawValue(KEY_SUBJECT, msg.getSubject(), rawProperties);
putRawValue(KEY_SENT_DATE, msg.getMessageDate().getTime(), rawProperties);
// Store the TO, but not cc/bcc in the addressee field
putRawValue(KEY_ADDRESSEE, msg.getDisplayTo(), rawProperties);
// But store all email addresses (to/cc/bcc) in the addresses field
putRawValue(KEY_ADDRESSEES, msg.getRecipientEmailAddressList(), rawProperties);
}
catch (IOException err)
{
@@ -119,12 +97,6 @@ public class MailMetadataExtracter extends AbstractMappingMetadataExtracter
if (logger.isWarnEnabled())
logger.warn("Unable to extract meta-data from message: " + err.getMessage());
}
// store multi-value extracted property
if (this.receipientEmails.get().size() != 0)
{
putRawValue(KEY_ADDRESSEES, (Serializable)receipientEmails.get(), rawProperties);
}
}
finally
{
@@ -136,162 +108,4 @@ public class MailMetadataExtracter extends AbstractMappingMetadataExtracter
// Done
return rawProperties;
}
private static String convertExchangeAddress(String email)
{
if (email.lastIndexOf("/CN=") == -1)
{
return email;
}
else
{
// found a full Exchange format To header
return email.substring(email.lastIndexOf("/CN=") + 4);
}
}
private static final String ENCODING_TEXT = "001E";
private static final String ENCODING_BINARY = "0102";
private static final String ENCODING_UNICODE = "001F";
@SuppressWarnings("unused")
private static final String SUBSTG_MESSAGEBODY = "1000";
private static final String SUBSTG_RECIPIENTEMAIL = "39FE"; // 7bit email address
private static final String SUBSTG_RECIPIENTSEARCH = "300B"; // address 'search' variant
private static final String SUBSTG_RECEIVEDEMAIL = "0076";
private static final String SUBSTG_SENDEREMAIL = "0C1F";
private static final String SUBSTG_DATE = "0047";
private static final String SUBSTG_SUBJECT = "0037";
/**
* Class to handle stream types. Can process and extract specific streams.
*/
private class StreamHandler
{
StreamHandler(String name, DocumentInputStream stream)
{
this.type = name.substring(STREAM_PREFIX_LENGTH, STREAM_PREFIX_LENGTH + 4);
this.encoding = name.substring(STREAM_PREFIX_LENGTH + 4, STREAM_PREFIX_LENGTH + 8);
this.stream = stream;
}
void process(final Map<String, Serializable> destination)
throws IOException
{
if (type.equals(SUBSTG_SENDEREMAIL))
{
putRawValue(KEY_ORIGINATOR, convertExchangeAddress(extractText()), destination);
}
else if (type.equals(SUBSTG_RECIPIENTEMAIL))
{
receipientEmails.get().add(convertExchangeAddress(extractText()));
}
else if (type.equals(SUBSTG_RECIPIENTSEARCH))
{
String email = extractText(ENCODING_TEXT);
int smptIndex = email.indexOf("SMTP:");
if (smptIndex != -1)
{
/* also may be used for SUBSTG_RECIPIENTTRANSPORT = "5FF7";
with search for SMPT followed by a null char */
// this is a secondary mechanism for encoding a receipient email address
// the 7 bit email address may not have been set by Outlook - so this is needed instead
// handle null character at end of string
int endIndex = email.length();
if (email.codePointAt(email.length() - 1) == 0)
{
endIndex--;
}
email = email.substring(smptIndex + 5, endIndex);
receipientEmails.get().add(email);
}
}
else if (type.equals(SUBSTG_RECEIVEDEMAIL))
{
putRawValue(KEY_ADDRESSEE, convertExchangeAddress(extractText()), destination);
}
else if (type.equals(SUBSTG_SUBJECT))
{
putRawValue(KEY_SUBJECT, extractText(), destination);
}
else if (type.equals(SUBSTG_DATE))
{
// the date is not "really" plain text - but it's appropriate to parse as such
String date = extractText(ENCODING_TEXT);
int valueIndex = date.indexOf("l=");
if (valueIndex != -1)
{
int dateIndex = date.indexOf('-', valueIndex);
if (dateIndex != -1)
{
dateIndex++;
final Calendar c = Calendar.getInstance();
String strYear = date.substring(dateIndex, dateIndex + 2);
c.set(Calendar.YEAR, Integer.parseInt(strYear) + (2000 - 1900));
String strMonth = date.substring(dateIndex + 2, dateIndex + 4);
c.set(Calendar.MONTH, Integer.parseInt(strMonth) - 1);
String strDay = date.substring(dateIndex + 4, dateIndex + 6);
c.set(Calendar.DAY_OF_MONTH, Integer.parseInt(strDay));
String strHour = date.substring(dateIndex + 6, dateIndex + 8);
c.set(Calendar.HOUR, Integer.parseInt(strHour));
String strMinute = date.substring(dateIndex + 10, dateIndex + 12);
c.set(Calendar.MINUTE, Integer.parseInt(strMinute));
c.set(Calendar.SECOND, 0);
putRawValue(KEY_SENT_DATE, c.getTime(), destination);
}
}
}
}
/**
* Extract the text from the stream based on the encoding
*
* @return String
*
* @throws IOException
*/
private String extractText()
throws IOException
{
return extractText(this.encoding);
}
/**
* Extract the text from the stream based on the encoding
*
* @return String
*
* @throws IOException
*/
private String extractText(String encoding)
throws IOException
{
byte[] data = new byte[stream.available()];
stream.read(data);
if (encoding.equals(ENCODING_TEXT) || encoding.equals(ENCODING_BINARY))
{
return new String(data);
}
else if (encoding.equals(ENCODING_UNICODE))
{
// convert double-byte encoding to single byte for String conversion
byte[] b = new byte[data.length >> 1];
for (int i=0; i<b.length; i++)
{
b[i] = data[i << 1];
}
return new String(b);
}
else
{
return new String(data);
}
}
private String type;
private String encoding;
private DocumentInputStream stream;
}
}

View File

@@ -25,6 +25,7 @@
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.Collection;
import java.util.Map;
import org.alfresco.model.ContentModel;
@@ -71,15 +72,58 @@ public class MailMetadataExtracterTest extends AbstractMetadataExtracterTest
testExtractFromMimetype(MimetypeMap.MIMETYPE_OUTLOOK_MSG);
}
/**
* We have different things to normal, so
* do our own common tests.
*/
protected void testCommonMetadata(String mimetype, Map<QName, Serializable> properties)
{
assertEquals(
"Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
"KEVIN.ROAST@BEN",
"Kevin Roast",
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
assertEquals(
"Property " + ContentModel.PROP_DESCRIPTION + " not found for mimetype " + mimetype,
"Test the content transformer",
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION)));
}
/**
 * Test the outlook specific bits: the sent date, the addressee,
 * the list of addressees and the subject line.
 */
protected void testFileSpecificMetadata(String mimetype,
        Map<QName, Serializable> properties) {
    // Sent Date
    // NOTE(review): the expected text carries a +01:00 offset, so it
    //  presumably depends on the default time zone - confirm
    assertEquals(
            "Property " + ContentModel.PROP_SENTDATE + " not found for mimetype " + mimetype,
            "2007-06-14T09:42:55.000+01:00",
            DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_SENTDATE)));

    // Addressee
    assertEquals(
            "Property " + ContentModel.PROP_ADDRESSEE + " not found for mimetype " + mimetype,
            "Kevin Roast",
            DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_ADDRESSEE)));

    // Addressees - check presence and type before casting, so that a
    //  wrong value fails the test cleanly instead of with a cast error
    Serializable rawAddressees = properties.get(ContentModel.PROP_ADDRESSEES);
    assertNotNull(
            "Property " + ContentModel.PROP_ADDRESSEES + " not found for mimetype " + mimetype,
            rawAddressees);
    assertTrue(
            "Property " + ContentModel.PROP_ADDRESSEES + " is not a collection for mimetype " + mimetype,
            rawAddressees instanceof Collection<?>);
    Collection<?> addressees = (Collection<?>) rawAddressees;
    assertEquals(
            "Property " + ContentModel.PROP_ADDRESSEES + " wrong size for mimetype " + mimetype,
            1,
            addressees.size());
    assertEquals(
            "Property " + ContentModel.PROP_ADDRESSEES + " wrong content for mimetype " + mimetype,
            "kevin.roast@alfresco.org",
            DefaultTypeConverter.INSTANCE.convert(String.class, addressees.iterator().next()));

    // Subject Line
    assertEquals(
            "Property " + ContentModel.PROP_SUBJECT + " not found for mimetype " + mimetype,
            "Test the content transformer",
            DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_SUBJECT)));
}
}

View File

@@ -62,6 +62,9 @@ import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
* <b>wordCount:</b>
* </pre>
*
* TIKA Note - everything we currently have should be present
* in the metadata.
*
* @author Jesper Steen Møller
* @author Derek Hulley
*/

View File

@@ -1,5 +1,16 @@
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.namespace.QName;
/**
* @see OfficeMetadataExtracter
@@ -9,6 +20,11 @@ package org.alfresco.repo.content.metadata;
public class OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest
{
private OfficeMetadataExtracter extracter;
private static final QName WORD_COUNT_TEST_PROPERTY =
QName.createQName("WordCountTest");
private static final QName LAST_AUTHOR_TEST_PROPERTY =
QName.createQName("LastAuthorTest");
@Override
public void setUp() throws Exception
@@ -17,6 +33,22 @@ public class OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest
extracter = new OfficeMetadataExtracter();
extracter.setDictionaryService(dictionaryService);
extracter.register();
// Attach a couple of extra mappings
// These will be tested later
HashMap<String, Set<QName>> newMap = new HashMap<String, Set<QName>>(
extracter.getMapping()
);
Set<QName> wcSet = new HashSet<QName>();
wcSet.add(WORD_COUNT_TEST_PROPERTY);
newMap.put( OfficeMetadataExtracter.KEY_WORD_COUNT, wcSet );
Set<QName> laSet = new HashSet<QName>();
laSet.add(LAST_AUTHOR_TEST_PROPERTY);
newMap.put( OfficeMetadataExtracter.KEY_LAST_AUTHOR, laSet );
extracter.setMapping(newMap);
}
/**
@@ -46,4 +78,78 @@ public class OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest
testExtractFromMimetype(mimetype);
}
}
/**
 * We support all sorts of extra metadata. Check it all behaves.
 * The created and modified dates are checked first, followed by
 * the extra word count and last author mappings.
 */
public void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties) {
    // Test the ones with a core alfresco mapping
    if (mimetype.equals(MimetypeMap.MIMETYPE_WORD)) {
        assertConvertedString("Property ", ContentModel.PROP_CREATED, " not found for mimetype ", mimetype,
                "2005-05-26T13:57:00.000+01:00", properties);
        assertConvertedString("Property ", ContentModel.PROP_MODIFIED, " not found for mimetype ", mimetype,
                "2005-09-20T18:25:00.000+01:00", properties);
    } else if (mimetype.equals(MimetypeMap.MIMETYPE_EXCEL)) {
        assertConvertedString("Property ", ContentModel.PROP_CREATED, " not found for mimetype ", mimetype,
                "1996-10-15T00:33:28.000+01:00", properties);
        assertConvertedString("Property ", ContentModel.PROP_MODIFIED, " not found for mimetype ", mimetype,
                "2005-09-20T19:22:32.000+01:00", properties);
    } else if (mimetype.equals(MimetypeMap.MIMETYPE_PPT)) {
        // Seriously, that's what the file says!
        assertConvertedString("Property ", ContentModel.PROP_CREATED, " not found for mimetype ", mimetype,
                "1601-01-01T00:00:00.000Z", properties);
        assertConvertedString("Property ", ContentModel.PROP_MODIFIED, " not found for mimetype ", mimetype,
                "2005-09-20T19:23:41.000+01:00", properties);
    }

    // Now check the non-standard ones we added in at test time
    boolean hasWordCount = properties.containsKey(WORD_COUNT_TEST_PROPERTY);
    assertTrue(
            "Test Property " + WORD_COUNT_TEST_PROPERTY + " not found for mimetype " + mimetype,
            hasWordCount);
    boolean hasLastAuthor = properties.containsKey(LAST_AUTHOR_TEST_PROPERTY);
    assertTrue(
            "Test Property " + LAST_AUTHOR_TEST_PROPERTY + " not found for mimetype " + mimetype,
            hasLastAuthor);

    if (mimetype.equals(MimetypeMap.MIMETYPE_WORD)) {
        assertConvertedString("Test Property ", WORD_COUNT_TEST_PROPERTY, " incorrect for mimetype ", mimetype,
                "9", properties);
        assertConvertedString("Test Property ", LAST_AUTHOR_TEST_PROPERTY, " incorrect for mimetype ", mimetype,
                AbstractMetadataExtracterTest.QUICK_PREVIOUS_AUTHOR, properties);
    } else if (mimetype.equals(MimetypeMap.MIMETYPE_EXCEL)) {
        assertConvertedString("Test Property ", WORD_COUNT_TEST_PROPERTY, " not found for mimetype ", mimetype,
                "0", properties);
        assertConvertedString("Test Property ", LAST_AUTHOR_TEST_PROPERTY, " not found for mimetype ", mimetype,
                AbstractMetadataExtracterTest.QUICK_PREVIOUS_AUTHOR, properties);
    } else if (mimetype.equals(MimetypeMap.MIMETYPE_PPT)) {
        assertConvertedString("Test Property ", WORD_COUNT_TEST_PROPERTY, " not found for mimetype ", mimetype,
                "9", properties);
        assertConvertedString("Test Property ", LAST_AUTHOR_TEST_PROPERTY, " not found for mimetype ", mimetype,
                AbstractMetadataExtracterTest.QUICK_PREVIOUS_AUTHOR, properties);
    }
}

/**
 * Converts the value of one property to a String and compares it with
 * the expected text. The failure message is put together as
 * <code>prefix + property + problem + mimetype</code>.
 */
private void assertConvertedString(String prefix, QName property, String problem, String mimetype,
        String expected, Map<QName, Serializable> properties) {
    assertEquals(
            prefix + property + problem + mimetype,
            expected,
            DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(property)));
}
}

View File

@@ -60,6 +60,10 @@ import com.catcode.odf.OpenDocumentMetadata;
* <b>All user properties</b>
* </pre>
*
* TIKA Note - not all of the metadata is currently
* extracted. Will probably need to add some more
* support to TIKA.
*
* @author Antti Jokipii
* @author Derek Hulley
*/

View File

@@ -1,5 +1,12 @@
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.Map;
import org.alfresco.model.ContentModel;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.namespace.QName;
/**
* @see OpenDocumentMetadataExtracter
@@ -46,4 +53,25 @@ public class OpenDocumentMetadataExtracterTest extends AbstractMetadataExtracter
testExtractFromMimetype(mimetype);
}
}
/**
 * The creator/author part of the common metadata check is
 * switched off for this test.
 *
 * @return always <code>true</code>
 */
protected boolean skipAuthorCheck()
{
    return true;
}
/**
 * We also provide the creation date - check that.
 * An expected value is only held for the text and the
 * graphics mimetypes; nothing is checked for any other.
 */
protected void testFileSpecificMetadata(String mimetype,
        Map<QName, Serializable> properties) {
    // Check for two cases
    final String expectedCreated;
    if (mimetype.equals("application/vnd.oasis.opendocument.text")) {
        expectedCreated = "2005-09-06T23:34:00.000+01:00";
    } else if (mimetype.equals("application/vnd.oasis.opendocument.graphics")) {
        expectedCreated = "2006-01-27T11:46:11.000Z";
    } else {
        // No expectation for this mimetype
        return;
    }

    String failureMessage = "Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype;
    String actualCreated = DefaultTypeConverter.INSTANCE.convert(
            String.class, properties.get(ContentModel.PROP_CREATED));
    assertEquals(failureMessage, expectedCreated, actualCreated);
}
}

View File

@@ -39,6 +39,9 @@ import org.springframework.extensions.surf.util.PropertyCheck;
* <b>description:</b> -- cm:description
* </pre>
*
* TIKA Note - this probably won't be ported to TIKA. There's currently
* no support for these old formats in tika.
*
* @author Jesper Steen Møller
*/
public class OpenOfficeMetadataExtracter extends AbstractMappingMetadataExtracter implements OpenOfficeMetadataWorker

View File

@@ -22,8 +22,17 @@
* http://www.alfresco.com/legal/licensing" */
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.Map;
import org.alfresco.service.namespace.QName;
/**
* Note - this test can sometimes fail if run on its own, as there
* can be a race condition with the OO process. Try running it as
* part of a suite if so, that normally seems to fix it!
*
* @author Jesper Steen Møller
*/
public class OpenOfficeMetadataExtracterTest extends AbstractMetadataExtracterTest
@@ -77,4 +86,18 @@ public class OpenOfficeMetadataExtracterTest extends AbstractMetadataExtracterTe
testExtractFromMimetype(mimetype);
}
}
/**
 * The common metadata check is only run when the extracter
 * reports a connection to an OpenOffice instance.
 */
protected void testCommonMetadata(String mimetype,
        Map<QName, Serializable> properties) {
    if (!extracter.isConnected()) {
        // No connection - skip the check
        return;
    }
    super.testCommonMetadata(mimetype, properties);
}
/**
 * The extractor only does the usual basic three properties, so
 * there is nothing file specific to check here.
 */
public void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties)
{
    // Intentionally empty
}
}

View File

@@ -50,6 +50,9 @@ import org.apache.pdfbox.pdmodel.PDDocumentInformation;
* <b>created:</b> -- cm:created
* </pre>
*
* TIKA Note - all the fields (plus a few others) are present
* in the tika metadata.
*
* @author Jesper Steen Møller
* @author Derek Hulley
*/
@@ -95,6 +98,10 @@ public class PdfBoxMetadataExtracter extends AbstractMappingMetadataExtracter
Calendar created = docInfo.getCreationDate();
if (created != null)
{
// Work around https://issues.apache.org/jira/browse/PDFBOX-598
created.set(Calendar.MILLISECOND, 0);
// Save
putRawValue(KEY_CREATED, created.getTime(), rawProperties);
}
}

View File

@@ -1,6 +1,14 @@
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.Calendar;
import java.util.Map;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.namespace.QName;
import org.apache.pdfbox.util.DateConverter;
/**
* @see org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter
@@ -41,4 +49,29 @@ public class PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest
{
testExtractFromMimetype(MimetypeMap.MIMETYPE_PDF);
}
/**
 * We can also return a created date - check it has the
 * expected value.
 */
protected void testFileSpecificMetadata(String mimetype,
        Map<QName, Serializable> properties) {
    String failureMessage = "Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype;
    Serializable created = properties.get(ContentModel.PROP_CREATED);
    String createdAsText = DefaultTypeConverter.INSTANCE.convert(String.class, created);
    assertEquals(failureMessage, "2005-05-26T20:52:58.000+01:00", createdAsText);
}
/**
 * Test that will show when the workaround is in place.
 * Parses a PDF style date string and checks each of the
 * calendar fields down to the second.
 */
public void testDateConversion() throws Exception {
    Calendar c = DateConverter.toCalendar("D:20050526205258+01'00'");
    assertEquals(2005, c.get(Calendar.YEAR));
    // Calendar months are zero based, so use the named constant
    assertEquals(Calendar.MAY, c.get(Calendar.MONTH));
    assertEquals(26, c.get(Calendar.DAY_OF_MONTH));
    assertEquals(20, c.get(Calendar.HOUR_OF_DAY));
    assertEquals(52, c.get(Calendar.MINUTE));
    assertEquals(58, c.get(Calendar.SECOND));
    // The millisecond check is left disabled here.
    // NOTE(review): presumably this relates to PDFBOX-598, which the
    //  extracter works around by zeroing the milliseconds - confirm
    //  before enabling it
    //assertEquals(0, c.get(Calendar.MILLISECOND));
}
}

View File

@@ -45,29 +45,33 @@ import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
/**
* Metadata extractor for the PDF documents.
* Metadata extractor for RFC822 mime emails.
* <pre>
* <b>messageFrom:</b> -- imap:messageFrom
* <b>messageFrom:</b> -- imap:messageFrom, cm:originator
* <b>messageTo:</b> -- imap:messageTo
* <b>messageCc:</b> -- imap:messageCc
* <b>messageSubject:</b> -- imap:messageSubject, cm:title, cm:description
* <b>messageSent:</b> -- imap:dateSent
* <b>messageSubject:</b> -- imap:messageSubject, cm:title, cm:description, cm:subjectline
* <b>messageSent:</b> -- imap:dateSent, cm:sentdate
* <b>All <code>{@link Header#getName() header names}</code>:</b>
* <b>Thread-Index:</b> -- imap:threadIndex
* <b>Message-ID:</b> -- imap:messageId
* <b>date:</b> -- imap:dateReceived
*
* TIKA Note - to and cc are missing, and date stuff isn't
* great. Thread index is missing, and arbitrary headers
* don't seem to be supported
*
* @author Derek Hulley
* @since 3.2
*/
public class RFC822MetadataExtracter extends AbstractMappingMetadataExtracter
{
private static final String KEY_MESSAGE_FROM = "messageFrom";
private static final String KEY_MESSAGE_TO = "messageTo";
private static final String KEY_MESSAGE_CC = "messageCc";
private static final String KEY_MESSAGE_SUBJECT = "messageSubject";
private static final String KEY_MESSAGE_SENT = "messageSent";
protected static final String KEY_MESSAGE_FROM = "messageFrom";
protected static final String KEY_MESSAGE_TO = "messageTo";
protected static final String KEY_MESSAGE_CC = "messageCc";
protected static final String KEY_MESSAGE_SUBJECT = "messageSubject";
protected static final String KEY_MESSAGE_SENT = "messageSent";
public static String[] SUPPORTED_MIMETYPES = new String[] { MimetypeMap.MIMETYPE_RFC822 };

View File

@@ -9,14 +9,14 @@ namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
# Mappings
#Default values that don't match a Header name
messageFrom=imap:messageFrom
messageFrom=imap:messageFrom, cm:originator
messageTo=imap:messageTo
messageCc=imap:messageCc
messageSubject=imap:messageSubject, cm:title, cm:description
messageSent=imap:dateSent
messageSubject=imap:messageSubject, cm:title, cm:description, cm:subjectline
messageSent=imap:dateSent, cm:sentdate
#Add here any values you want to extract. Use Header name for key.
Thread-Index=imap:threadIndex
Message-ID=imap:messageId
Date=imap:dateReceived
Date=imap:dateReceived

View File

@@ -0,0 +1,198 @@
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* As a special exception to the terms and conditions of version 2.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* and Open Source Software ("FLOSS") applications as described in Alfresco's
* FLOSS exception. You should have received a copy of the text describing
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing"
*/
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.lang.reflect.Field;
import java.text.DateFormat;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.namespace.QName;
/**
 * Test for the RFC822 (imap/mbox) extractor.
 * <p>
 * The extracter is obtained from the Spring context and, during
 * {@link #setUp()}, is given three additional test-only target properties
 * (from / to / cc) on top of its existing mappings. Those extra properties
 * are then verified after extraction.
 */
public class RFC822MetadataExtracterTest extends AbstractMetadataExtracterTest
{
    /** Expected text of both the "from" and the "to" fields of the test message */
    private static final String EXPECTED_FROM_AND_TO = QUICK_CREATOR + " <" + QUICK_CREATOR_EMAIL + ">";
    /** Expected text of the "cc" field of the test message */
    private static final String EXPECTED_CC = "Nevin Nollop <nevinn@alfresco.com>";

    // Test-only target properties. Each local name matches the field it is mapped from.
    private static final QName MESSAGE_FROM_TEST_PROPERTY =
        QName.createQName("MessageFromTest");
    private static final QName MESSAGE_TO_TEST_PROPERTY =
        QName.createQName("MessageToTest");
    private static final QName MESSAGE_CC_TEST_PROPERTY =
        QName.createQName("MessageCCTest");

    private RFC822MetadataExtracter extracter;

    /**
     * Fetches the extracter from Spring and attaches the extra test mappings.
     */
    @Override
    public void setUp() throws Exception
    {
        super.setUp();

        // Ask Spring for the extractor, so it
        //  gets its date formats populated
        extracter = (RFC822MetadataExtracter)ctx.getBean("extracter.RFC822");

        // Attach a couple of extra mappings
        // These will be tested later
        Map<String, Set<QName>> newMap = new HashMap<String, Set<QName>>(
              extracter.getMapping()
        );
        addTestMapping(newMap, RFC822MetadataExtracter.KEY_MESSAGE_FROM, MESSAGE_FROM_TEST_PROPERTY);
        addTestMapping(newMap, RFC822MetadataExtracter.KEY_MESSAGE_TO, MESSAGE_TO_TEST_PROPERTY);
        addTestMapping(newMap, RFC822MetadataExtracter.KEY_MESSAGE_CC, MESSAGE_CC_TEST_PROPERTY);
        extracter.setMapping(newMap);
    }

    /**
     * Replaces the mapping for <code>key</code> with the extracter's current
     * targets for that key plus one extra test property.
     *
     * @param mapping       the mapping being built up; modified in place
     * @param key           the raw metadata key to extend
     * @param testProperty  the additional property the key should also map to
     */
    private void addTestMapping(Map<String, Set<QName>> mapping, String key, QName testProperty)
    {
        Set<QName> existing = extracter.getCurrentMapping().get(key);
        // Fail with a readable message rather than a bare NullPointerException
        assertNotNull("No existing mapping found for key " + key, existing);

        Set<QName> targets = new HashSet<QName>();
        targets.add(testProperty);
        targets.addAll(existing);
        mapping.put(key, targets);
    }

    /**
     * @return Returns the same transformer regardless - it is allowed
     */
    protected MetadataExtracter getExtracter()
    {
        return extracter;
    }

    /**
     * RFC822 has a non-standard date format.
     * Check that this was sprung-in - if not, then
     *  other tests will fail!
     */
    public void testHasDateFormats() throws Exception
    {
        // The field is declared on the superclass and is not accessible, hence the reflection
        Field sdf = RFC822MetadataExtracter.class.getSuperclass().
                        getDeclaredField("supportedDateFormats");
        sdf.setAccessible(true);

        @SuppressWarnings("unchecked")
        Set<DateFormat> supportedDateFormats = (Set<DateFormat>)sdf.get(extracter);

        String problem = "No supportedDateFormats injected into RFC822MetadataExtracter - " +
              "spring setup broken and date parsing will break all of the extraction process";
        assertNotNull(problem, supportedDateFormats);
        assertFalse(problem, supportedDateFormats.isEmpty());
    }

    /**
     * Every advertised mimetype must be reported as supported.
     */
    public void testSupports() throws Exception
    {
        for (String mimetype : RFC822MetadataExtracter.SUPPORTED_MIMETYPES)
        {
            boolean supports = extracter.isSupported(mimetype);
            assertTrue("Mimetype should be supported: " + mimetype, supports);
        }
    }

    /**
     * Runs the standard extraction test against the RFC822 test file.
     */
    public void testEmailExtraction() throws Exception
    {
        testExtractFromMimetype(MimetypeMap.MIMETYPE_RFC822);
    }

    /**
     * We have no author, and have the same title and description
     */
    @Override
    protected void testCommonMetadata(String mimetype,
         Map<QName, Serializable> properties)
    {
        assertEquals(
              "Property " + ContentModel.PROP_TITLE + " not found for mimetype " + mimetype,
              QUICK_TITLE,
              asString(properties, ContentModel.PROP_TITLE));
        assertEquals(
              "Property " + ContentModel.PROP_DESCRIPTION + " not found for mimetype " + mimetype,
              QUICK_TITLE,
              asString(properties, ContentModel.PROP_DESCRIPTION));
    }

    /**
     * Test the email-specific cm: properties, and the extra
     *  properties that were mapped in at test time
     */
    @Override
    public void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties)
    {
        // Check the other cm: ones
        assertEquals(
              "Property " + ContentModel.PROP_ORIGINATOR + " not found for mimetype " + mimetype,
              EXPECTED_FROM_AND_TO,
              asString(properties, ContentModel.PROP_ORIGINATOR));
        // NOTE(review): the expected text carries a +01:00 offset, so this check
        //  presumably depends on the JVM default time zone - confirm
        assertEquals(
              "Property " + ContentModel.PROP_SENTDATE + " not found for mimetype " + mimetype,
              "2004-06-04T13:23:22.000+01:00",
              asString(properties, ContentModel.PROP_SENTDATE));

        // Check our non-standard ones we added in at test time are present
        assertTrue(
              "Test Property " + MESSAGE_FROM_TEST_PROPERTY + " not found for mimetype " + mimetype,
              properties.containsKey(MESSAGE_FROM_TEST_PROPERTY)
        );
        assertTrue(
              "Test Property " + MESSAGE_TO_TEST_PROPERTY + " not found for mimetype " + mimetype,
              properties.containsKey(MESSAGE_TO_TEST_PROPERTY)
        );
        assertTrue(
              "Test Property " + MESSAGE_CC_TEST_PROPERTY + " not found for mimetype " + mimetype,
              properties.containsKey(MESSAGE_CC_TEST_PROPERTY)
        );

        // ... and that they carry the right values (each checked exactly once)
        assertEquals(
              "Test Property " + MESSAGE_FROM_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
              EXPECTED_FROM_AND_TO,
              asString(properties, MESSAGE_FROM_TEST_PROPERTY));
        assertEquals(
              "Test Property " + MESSAGE_TO_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
              EXPECTED_FROM_AND_TO,
              asString(properties, MESSAGE_TO_TEST_PROPERTY));
        assertEquals(
              "Test Property " + MESSAGE_CC_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
              EXPECTED_CC,
              asString(properties, MESSAGE_CC_TEST_PROPERTY));
    }

    /**
     * Looks up a property and converts its value to a String
     *  using the default type converter.
     */
    private static String asString(Map<QName, Serializable> properties, QName property)
    {
        return DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(property));
    }
}