Upgraded OpenDocumentMetadataExtracter to the new mapping-based metadata extraction infrastructure.

Added more OpenDocument test documents.


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@5690 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
Derek Hulley
2007-05-16 10:27:36 +00:00
parent 8c7782027f
commit f03f95325a
14 changed files with 273 additions and 49 deletions

View File

@@ -26,6 +26,7 @@ package org.alfresco.repo.content.metadata;
import java.io.InputStream;
import java.io.Serializable;
import java.lang.reflect.Array;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
@@ -62,7 +63,7 @@ import org.apache.commons.logging.LogFactory;
* <li>
* Implement the {@link extractInternal} method. This now returns a raw map of extracted
* values keyed by document-specific property names. The <b>trimPut</b> method has
* been replaced with an equivalent {@link #putSafeRawValue(String, Object, Map)}.
* been replaced with an equivalent {@link #putRawValue(String, Serializable, Map)}.
* </li>
* <li>
* Provide the default mapping of the document-specific properties to system-specific
@@ -241,6 +242,28 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
mapping = readMappingProperties(mappingProperties);
}
/**
* Helper method for derived classes to obtain the mappings that will be applied to raw
* values. This should be called after initialization in order to guarantee the complete
* map is given.
* <p>
* Normally, the list of properties that can be extracted from a document is fixed and
* well-known - in that case, just extract everything. But some implementations may have
* an extra, indeterminate set of values available for extraction. If the extraction of
* these runtime parameters is expensive, then the keys provided by the return value can
* be used to extract values from the documents. The metadata extraction becomes fully
* configuration-driven, i.e. declaring further mappings will result in more values being
* extracted from the documents.
*/
protected final Map<String, Set<QName>> getMapping()
{
if (!initialized)
{
throw new UnsupportedOperationException("The complete mapping is only available after initialization.");
}
return Collections.unmodifiableMap(mapping);
}
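
To make the configuration-driven pattern described above concrete, here is a minimal sketch of a derived extracter that reads only the document fields declared in its mapping. It assumes, as with the extracters changed in this commit, that the default key-to-QName mapping comes from a companion properties file; the class name, the supported mimetype and the readCustomField helper are hypothetical, while getMapping(), newRawMap() and putRawValue(..) are the members defined in this class.

package org.alfresco.repo.content.metadata;

import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;

import org.alfresco.service.cmr.repository.ContentReader;

public class ExampleConfigDrivenExtracter extends AbstractMappingMetadataExtracter
{
    public ExampleConfigDrivenExtracter()
    {
        // Hypothetical: supports a single mimetype
        super(new HashSet<String>(Arrays.asList(new String[] {"text/plain"})));
    }

    @Override
    protected Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
    {
        Map<String, Serializable> rawProperties = newRawMap();
        // The mapping is complete here because extraction only happens after initialization
        for (String documentKey : getMapping().keySet())
        {
            // readCustomField stands in for an expensive, document-specific lookup;
            // only the keys that have been declared in the mapping are ever read
            Serializable value = readCustomField(reader, documentKey);
            putRawValue(documentKey, value, rawProperties);
        }
        return rawProperties;
    }

    // Hypothetical helper representing the per-key extraction work
    private Serializable readCustomField(ContentReader reader, String documentKey)
    {
        return null;
    }
}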
/**
* A utility method to read mapping properties from a resource file and convert to the map form.
*
@@ -566,17 +589,26 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
}
return systemProperties;
}
/**
* Examines a value or string for nulls and adds it to the map (if non-empty). If the value
* is non-Serializable, then the <code>toString</code> representation is used directly.
* Adds a value to the map if it is non-trivial. A value is trivial if
* <ul>
* <li>it is null</li>
* <li>it is an empty string value after trimming</li>
* <li>it is an empty collection</li>
* <li>it is an empty array</li>
* </ul>
* String values are trimmed before being put into the map.
* Otherwise, it is up to the extracter to ensure that the value is a <tt>Serializable</tt>.
* It is not appropriate to implicitly convert values in order to make them <tt>Serializable</tt>
* - the best conversion method will depend on the value's specific meaning.
*
* @param key the destination map key
* @param value the value to check and put.
* @param destination map to put values into
* @return Returns <tt>true</tt> if set, <tt>false</tt> otherwise
* @param key the destination key
* @param value the serializable value
* @param destination the map to put values into
* @return Returns <tt>true</tt> if set, otherwise <tt>false</tt>
*/
protected boolean putSafeRawValue(String key, Object value, Map<String, Serializable> destination)
protected boolean putRawValue(String key, Serializable value, Map<String, Serializable> destination)
{
if (value == null)
{
@@ -584,25 +616,47 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
}
if (value instanceof String)
{
String svalue = ((String) value).trim();
if (svalue.length() > 0)
String valueStr = ((String) value).trim();
if (valueStr.length() == 0)
{
destination.put(key, svalue);
return true;
return false;
}
else
{
// Keep the trimmed value
value = valueStr;
}
return false;
}
else if (value instanceof Serializable)
else if (value instanceof Collection)
{
destination.put(key, (Serializable) value);
Collection valueCollection = (Collection) value;
if (valueCollection.isEmpty())
{
return false;
}
}
else
else if (value.getClass().isArray())
{
destination.put(key, value.toString());
if (Array.getLength(value) == 0)
{
return false;
}
}
// It passed all the tests
destination.put(key, value);
return true;
}
/**
* Helper method to fetch a clean map into which raw values can be dumped.
*
* @return Returns an empty map
*/
protected final Map<String, Serializable> newRawMap()
{
return new HashMap<String, Serializable>(17);
}
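
As a short illustration of the trivial-value rules above, the following lines (with invented keys and values) show how putRawValue behaves when called from an extractRaw implementation:

Map<String, Serializable> rawProperties = newRawMap();
putRawValue("title", "  My Title  ", rawProperties);                        // trimmed and stored as "My Title", returns true
putRawValue("subject", "   ", rawProperties);                               // empty after trimming: not stored, returns false
putRawValue("author", null, rawProperties);                                 // null: not stored
putRawValue("keywords", new java.util.ArrayList<String>(), rawProperties);  // empty collection: not stored
putRawValue("pageCount", Integer.valueOf(3), rawProperties);                // non-trivial Serializable: stored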
/**
* This method provides a <i>best guess</i> of where to store the values extracted
* from the documents. The list of properties mapped by default need <b>not</b>

View File

@@ -86,9 +86,16 @@ public abstract class AbstractMetadataExtracterTest extends TestCase
protected void testExtractFromMimetype(String mimetype) throws Exception
{
Map<QName, Serializable> properties = extractFromMimetype(mimetype);
// check
testCommonMetadata(mimetype, properties);
try
{
Map<QName, Serializable> properties = extractFromMimetype(mimetype);
// check
testCommonMetadata(mimetype, properties);
}
catch (FileNotFoundException e)
{
// The test file is not there. We won't fail it.
}
}
protected Map<QName, Serializable> extractFromMimetype(String mimetype) throws Exception

View File

@@ -45,6 +45,7 @@ public interface MetadataExtracter
* written into the property map or not.
*
* @author Derek Hulley
* @author Jesper Steen Møller
*/
public enum OverwritePolicy
{

View File

@@ -28,7 +28,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
@@ -43,18 +42,46 @@ import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
/**
* Office file format Metadata Extracter
* Office file format Metadata Extracter. This extracter uses the POI library to extract
* the following:
* <pre>
* <b>author:</b> -- cm:author
* <b>title:</b> -- cm:title
* <b>subject:</b> -- cm:description
* <b>createDateTime:</b> -- cm:created
* <b>lastSaveDateTime:</b> -- cm:modified
* <b>comments:</b>
* <b>editTime:</b>
* <b>format:</b>
* <b>keywords:</b>
* <b>lastAuthor:</b>
* <b>lastPrinted:</b>
* <b>osVersion:</b>
* <b>thumbnail:</b>
* <b>pageCount:</b>
* <b>wordCount:</b>
* </pre>
*
* @author Jesper Steen Møller
* @author Derek Hulley
*/
public class OfficeMetadataExtracter extends AbstractMappingMetadataExtracter
{
public static final String PROP_AUTHOR = "author";
public static final String PROP_TITLE = "title";
public static final String PROP_SUBJECT = "subject";
public static final String PROP_CREATE_DATETIME = "createDateTime";
public static final String PROP_LAST_SAVE_DATETIME = "lastSaveDateTime";
public static final String KEY_AUTHOR = "author";
public static final String KEY_TITLE = "title";
public static final String KEY_SUBJECT = "subject";
public static final String KEY_CREATE_DATETIME = "createDateTime";
public static final String KEY_LAST_SAVE_DATETIME = "lastSaveDateTime";
public static final String KEY_COMMENTS = "comments";
public static final String KEY_EDIT_TIME = "editTime";
public static final String KEY_FORMAT = "format";
public static final String KEY_KEYWORDS = "keywords";
public static final String KEY_LAST_AUTHOR = "lastAuthor";
public static final String KEY_LAST_PRINTED = "lastPrinted";
public static final String KEY_OS_VERSION = "osVersion";
public static final String KEY_THUMBNAIL = "thumbnail";
public static final String KEY_PAGE_COUNT = "pageCount";
public static final String KEY_WORD_COUNT = "wordCount";
public static String[] SUPPORTED_MIMETYPES = new String[] {
MimetypeMap.MIMETYPE_WORD,
@@ -69,7 +96,7 @@ public class OfficeMetadataExtracter extends AbstractMappingMetadataExtracter
@Override
protected Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
{
final Map<String, Serializable> rawProperties = new HashMap<String, Serializable>(17);
final Map<String, Serializable> rawProperties = newRawMap();
POIFSReaderListener readerListener = new POIFSReaderListener()
{
@@ -82,11 +109,21 @@ public class OfficeMetadataExtracter extends AbstractMappingMetadataExtracter
{
SummaryInformation si = (SummaryInformation) ps;
putSafeRawValue(PROP_AUTHOR, si.getAuthor(), rawProperties);
putSafeRawValue(PROP_TITLE, si.getTitle(), rawProperties);
putSafeRawValue(PROP_SUBJECT, si.getSubject(), rawProperties);
putSafeRawValue(PROP_CREATE_DATETIME, si.getCreateDateTime(), rawProperties);
putSafeRawValue(PROP_LAST_SAVE_DATETIME, si.getLastSaveDateTime(), rawProperties);
putRawValue(KEY_AUTHOR, si.getAuthor(), rawProperties);
putRawValue(KEY_TITLE, si.getTitle(), rawProperties);
putRawValue(KEY_SUBJECT, si.getSubject(), rawProperties);
putRawValue(KEY_CREATE_DATETIME, si.getCreateDateTime(), rawProperties);
putRawValue(KEY_LAST_SAVE_DATETIME, si.getLastSaveDateTime(), rawProperties);
putRawValue(KEY_COMMENTS, si.getComments(), rawProperties);
putRawValue(KEY_EDIT_TIME, si.getEditTime(), rawProperties);
putRawValue(KEY_FORMAT, si.getFormat(), rawProperties);
putRawValue(KEY_KEYWORDS, si.getKeywords(), rawProperties);
putRawValue(KEY_LAST_AUTHOR, si.getLastAuthor(), rawProperties);
putRawValue(KEY_LAST_PRINTED, si.getLastPrinted(), rawProperties);
putRawValue(KEY_OS_VERSION, si.getOSVersion(), rawProperties);
putRawValue(KEY_THUMBNAIL, si.getThumbnail(), rawProperties);
putRawValue(KEY_PAGE_COUNT, si.getPageCount(), rawProperties);
putRawValue(KEY_WORD_COUNT, si.getWordCount(), rawProperties);
}
}
catch (Exception ex)
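
The cm: mappings noted in the class comment above would normally be declared in a companion mapping properties file, in the same format as the OpenDocumentMetadataExtracter mapping added later in this commit. The Office mapping file itself is not shown in this excerpt, so the snippet below is only an illustrative sketch of that format:

# OfficeMetadataExtracter - illustrative mapping sketch (not the committed file)
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
author=cm:author
title=cm:title
subject=cm:description
createDateTime=cm:created
lastSaveDateTime=cm:modified
comments=
keywords=
pageCount=
wordCount=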

View File

@@ -2,7 +2,7 @@ package org.alfresco.repo.content.metadata;
/**
* @see org.alfresco.repo.content.transform.OfficeMetadataExtracter
* @see OfficeMetadataExtracter
*
* @author Jesper Steen Møller
*/
@@ -26,12 +26,12 @@ public class OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest
return extracter;
}
public void testReliability() throws Exception
public void testSupports() throws Exception
{
for (String mimetype : OfficeMetadataExtracter.SUPPORTED_MIMETYPES)
{
double reliability = extracter.getReliability(mimetype);
assertTrue("Expected above zero reliability", reliability > 0.0);
boolean supports = extracter.isSupported(mimetype);
assertTrue("Mimetype should be supported: " + mimetype, supports);
}
}

View File

@@ -29,9 +29,10 @@ import java.io.InputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Map;
import java.util.Set;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
@@ -43,12 +44,39 @@ import com.catcode.odf.OpenDocumentMetadata;
* Metadata extractor for the
* {@link org.alfresco.repo.content.MimetypeMap#MIMETYPE_OPENDOCUMENT_TEXT MIMETYPE_OPENDOCUMENT_XXX}
* mimetypes.
* <pre>
* <b>creationDate:</b> -- cm:created
* <b>creator:</b> -- cm:author
* <b>date:</b>
* <b>description:</b> -- cm:description
* <b>generator:</b>
* <b>initialCreator:</b>
* <b>keyword:</b>
* <b>language:</b>
* <b>printDate:</b>
* <b>printedBy:</b>
* <b>subject:</b>
* <b>title:</b> -- cm:title
* </pre>
*
* @author Antti Jokipii
*/
public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter
public class OpenDocumentMetadataExtracter extends AbstractMappingMetadataExtracter
{
private static String[] mimeTypes = new String[] {
private static final String KEY_CREATION_DATE = "creationDate";
private static final String KEY_CREATOR = "creator";
private static final String KEY_DATE = "date";
private static final String KEY_DESCRIPTION = "description";
private static final String KEY_GENERATOR = "generator";
private static final String KEY_INITIAL_CREATOR = "initialCreator";
private static final String KEY_KEYWORD = "keyword";
private static final String KEY_LANGUAGE = "language";
private static final String KEY_PRINT_DATE = "printDate";
private static final String KEY_PRINTED_BY = "printedBy";
private static final String KEY_SUBJECT = "subject";
private static final String KEY_TITLE = "title";
public static String[] SUPPORTED_MIMETYPES = new String[] {
MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT,
MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_TEMPLATE,
MimetypeMap.MIMETYPE_OPENDOCUMENT_GRAPHICS,
@@ -65,15 +93,18 @@ public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter
MimetypeMap.MIMETYPE_OPENDOCUMENT_FORMULA_TEMPLATE,
MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_MASTER,
MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_WEB,
MimetypeMap.MIMETYPE_OPENDOCUMENT_DATABASE, };
MimetypeMap.MIMETYPE_OPENDOCUMENT_DATABASE };
public OpenDocumentMetadataExtracter()
{
super(new HashSet<String>(Arrays.asList(mimeTypes)), 1.00, 1000);
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
}
public void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable
@Override
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
{
Map<String, Serializable> rawProperties = newRawMap();
ODFMetaFileAnalyzer analyzer = new ODFMetaFileAnalyzer();
InputStream is = null;
try
@@ -84,11 +115,34 @@ public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter
if (docInfo != null)
{
// set the metadata
destination.put(ContentModel.PROP_AUTHOR, docInfo.getCreator());
destination.put(ContentModel.PROP_TITLE, docInfo.getTitle());
destination.put(ContentModel.PROP_DESCRIPTION, docInfo.getDescription());
destination.put(ContentModel.PROP_CREATED, docInfo.getCreationDate());
putRawValue(KEY_CREATION_DATE, docInfo.getCreationDate(), rawProperties);
putRawValue(KEY_CREATOR, docInfo.getCreator(), rawProperties);
putRawValue(KEY_DATE, docInfo.getDate(), rawProperties);
putRawValue(KEY_DESCRIPTION, docInfo.getDescription(), rawProperties);
putRawValue(KEY_GENERATOR, docInfo.getGenerator(), rawProperties);
putRawValue(KEY_INITIAL_CREATOR, docInfo.getInitialCreator(), rawProperties);
putRawValue(KEY_KEYWORD, docInfo.getKeyword(), rawProperties);
putRawValue(KEY_LANGUAGE, docInfo.getLanguage(), rawProperties);
putRawValue(KEY_PRINT_DATE, docInfo.getPrintDate(), rawProperties);
putRawValue(KEY_PRINTED_BY, docInfo.getPrintedBy(), rawProperties);
putRawValue(KEY_SUBJECT, docInfo.getSubject(), rawProperties);
putRawValue(KEY_TITLE, docInfo.getTitle(), rawProperties);
// Handle user-defined properties dynamically
Map<String, Set<QName>> mapping = super.getMapping();
Hashtable userDefinedProperties = docInfo.getUserDefined();
// Extract those user properties for which there is a mapping
for (String key : mapping.keySet())
{
if (userDefinedProperties.containsKey(key))
{
Object value = userDefinedProperties.get(key);
if (value != null && value instanceof Serializable)
{
putRawValue(key, (Serializable) value, rawProperties);
}
}
}
}
}
finally
@@ -98,5 +152,7 @@ public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter
try { is.close(); } catch (IOException e) {}
}
}
// Done
return rawProperties;
}
}

View File

@@ -0,0 +1,21 @@
#
# OpenDocumentMetadataExtracter - default mapping
#
# author: Derek Hulley
# Namespaces
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
# Mappings
creationDate=cm:created
creator=cm:author
date=
description=
generator=
initialCreator=
keyword=
language=
printDate=
printedBy=
subject=cm:description
title=cm:title
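
Because OpenDocumentMetadataExtracter now also checks this mapping against the document's user-defined properties (see the extractRaw change above), declaring one extra entry is enough to have a matching user-defined field extracted and stored. The property name, prefix and model URI below are hypothetical:

# Hypothetical extra mapping: extracts the user-defined ODF field 'ProjectName'
namespace.prefix.custom=http://www.example.com/model/custom/1.0
ProjectName=custom:projectName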

View File

@@ -0,0 +1,48 @@
package org.alfresco.repo.content.metadata;
/**
* @see OpenDocumentMetadataExtracter
*
* @author Derek Hulley
*/
public class OpenDocumentMetadataExtracterTest extends AbstractMetadataExtracterTest
{
private OpenDocumentMetadataExtracter extracter;
@Override
public void setUp() throws Exception
{
super.setUp();
extracter = new OpenDocumentMetadataExtracter();
extracter.register();
}
/**
* @return Returns the same extracter regardless - it is allowed
*/
protected MetadataExtracter getExtracter()
{
return extracter;
}
public void testSupports() throws Exception
{
for (String mimetype : OpenDocumentMetadataExtracter.SUPPORTED_MIMETYPES)
{
boolean supports = extracter.isSupported(mimetype);
assertTrue("Mimetype should be supported: " + mimetype, supports);
}
}
/**
* Test all the supported mimetypes
*/
public void testSupportedMimetypes() throws Exception
{
for (String mimetype : OpenDocumentMetadataExtracter.SUPPORTED_MIMETYPES)
{
testExtractFromMimetype(mimetype);
}
}
}