diff --git a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java
index ed80676a2a..dd91e4d116 100644
--- a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java
@@ -26,6 +26,7 @@ package org.alfresco.repo.content.metadata;
import java.io.InputStream;
import java.io.Serializable;
+import java.lang.reflect.Array;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
@@ -62,7 +63,7 @@ import org.apache.commons.logging.LogFactory;
*
* Provide the default mapping of the document-specific properties to system-specific
@@ -241,6 +242,28 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
mapping = readMappingProperties(mappingProperties);
}
+ /**
+ * Helper method for derived classes to obtain the mappings that will be applied to raw
+ * values. This should be called after initialization in order to guarantee the complete
+ * map is given.
+ *
+ * Normally, the list of properties that can be extracted from a document is fixed and
+ * well-known - in that case, just extract everything. But some implementations may have
+ * an extra, indeterminate set of values available for extraction. If the extraction of
+ * these runtime parameters is expensive, then the keys provided by the return value can
+ * be used to extract values from the documents. The metadata extraction becomes fully
+ * configuration-driven, i.e. declaring further mappings will result in more values being
+ * extracted from the documents.
+ */
+ protected final Map> getMapping()
+ {
+ if (!initialized)
+ {
+ throw new UnsupportedOperationException("The complete mapping is only available after initialization.");
+ }
+ return Collections.unmodifiableMap(mapping);
+ }
+
/**
* A utility method to read mapping properties from a resource file and convert to the map form.
*
@@ -566,17 +589,26 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
}
return systemProperties;
}
-
+
/**
- * Examines a value or string for nulls and adds it to the map (if non-empty). If the value
- * is non-Serializable, then the toString
representation used directly.
+ * Adds a value to the map if it is non-trivial. A value is trivial if
+ *
+ * - it is null
+ * - it is an empty string value after trimming
+ * - it is an empty collection
+ * - it is an empty array
+ *
+ * String values are trimmed before being put into the map.
+ * Otherwise, it is up to the extracter to ensure that the value is a Serializable.
+ * It is not appropriate to implicitly convert values in order to make them Serializable
+ * - the best conversion method will depend on the value's specific meaning.
*
- * @param key the destination map key
- * @param value the value to check and put.
- * @param destination map to put values into
- * @return Returns true if set, false otherwise
+ * @param key the destination key
+ * @param value the serializable value
+ * @param destination the map to put values into
+ * @return Returns true if set, otherwise false
*/
- protected boolean putSafeRawValue(String key, Object value, Map destination)
+ protected boolean putRawValue(String key, Serializable value, Map destination)
{
if (value == null)
{
@@ -584,25 +616,47 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
}
if (value instanceof String)
{
- String svalue = ((String) value).trim();
- if (svalue.length() > 0)
+ String valueStr = ((String) value).trim();
+ if (valueStr.length() == 0)
{
- destination.put(key, svalue);
- return true;
+ return false;
+ }
+ else
+ {
+ // Keep the trimmed value
+ value = valueStr;
}
- return false;
}
- else if (value instanceof Serializable)
+ else if (value instanceof Collection)
{
- destination.put(key, (Serializable) value);
+ Collection valueCollection = (Collection) value;
+ if (valueCollection.isEmpty())
+ {
+ return false;
+ }
}
- else
+ else if (value.getClass().isArray())
{
- destination.put(key, value.toString());
+ if (Array.getLength(value) == 0)
+ {
+ return false;
+ }
}
+ // It passed all the tests
+ destination.put(key, value);
return true;
}
+ /**
+ * Helper method to fetch a clean map into which raw values can be dumped.
+ *
+ * @return Returns an empty map
+ */
+ protected final Map newRawMap()
+ {
+ return new HashMap(17);
+ }
+
/**
* This method provides a best guess of where to store the values extracted
* from the documents. The list of properties mapped by default need not
diff --git a/source/java/org/alfresco/repo/content/metadata/AbstractMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/AbstractMetadataExtracterTest.java
index 526e3ef48c..249c5fc709 100644
--- a/source/java/org/alfresco/repo/content/metadata/AbstractMetadataExtracterTest.java
+++ b/source/java/org/alfresco/repo/content/metadata/AbstractMetadataExtracterTest.java
@@ -86,9 +86,16 @@ public abstract class AbstractMetadataExtracterTest extends TestCase
protected void testExtractFromMimetype(String mimetype) throws Exception
{
- Map properties = extractFromMimetype(mimetype);
- // check
- testCommonMetadata(mimetype, properties);
+ try
+ {
+ Map properties = extractFromMimetype(mimetype);
+ // check
+ testCommonMetadata(mimetype, properties);
+ }
+ catch (FileNotFoundException e)
+ {
+ // The test file is not there. We won't fail it.
+ }
}
protected Map extractFromMimetype(String mimetype) throws Exception
diff --git a/source/java/org/alfresco/repo/content/metadata/MetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/MetadataExtracter.java
index 8759f2f078..96a0e35afb 100644
--- a/source/java/org/alfresco/repo/content/metadata/MetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/MetadataExtracter.java
@@ -45,6 +45,7 @@ public interface MetadataExtracter
* written into the property map or not.
*
* @author Derek Hulley
+ * @author Jesper Steen Møller
*/
public enum OverwritePolicy
{
diff --git a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java
index 4f2b36f091..9b878b7917 100644
--- a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java
@@ -28,7 +28,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Arrays;
-import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
@@ -43,18 +42,46 @@ import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
/**
- * Office file format Metadata Extracter
+ * Office file format Metadata Extracter. This extracter uses the POI library to extract
+ * the following:
+ *
+ * author: -- cm:author
+ * title: -- cm:title
+ * subject: -- cm:description
+ * createDateTime: -- cm:created
+ * lastSaveDateTime: -- cm:modified
+ * comments:
+ * editTime:
+ * format:
+ * keywords:
+ * lastAuthor:
+ * lastPrinted:
+ * osVersion:
+ * thumbnail:
+ * pageCount:
+ * wordCount:
+ *
*
* @author Jesper Steen Møller
* @author Derek Hulley
*/
public class OfficeMetadataExtracter extends AbstractMappingMetadataExtracter
{
- public static final String PROP_AUTHOR = "author";
- public static final String PROP_TITLE = "title";
- public static final String PROP_SUBJECT = "subject";
- public static final String PROP_CREATE_DATETIME = "createDateTime";
- public static final String PROP_LAST_SAVE_DATETIME = "lastSaveDateTime";
+ public static final String KEY_AUTHOR = "author";
+ public static final String KEY_TITLE = "title";
+ public static final String KEY_SUBJECT = "subject";
+ public static final String KEY_CREATE_DATETIME = "createDateTime";
+ public static final String KEY_LAST_SAVE_DATETIME = "lastSaveDateTime";
+ public static final String KEY_COMMENTS = "comments";
+ public static final String KEY_EDIT_TIME = "editTime";
+ public static final String KEY_FORMAT = "format";
+ public static final String KEY_KEYWORDS = "keywords";
+ public static final String KEY_LAST_AUTHOR = "lastAuthor";
+ public static final String KEY_LAST_PRINTED = "lastPrinted";
+ public static final String KEY_OS_VERSION = "osVersion";
+ public static final String KEY_THUMBNAIL = "thumbnail";
+ public static final String KEY_PAGE_COUNT = "pageCount";
+ public static final String KEY_WORD_COUNT = "wordCount";
public static String[] SUPPORTED_MIMETYPES = new String[] {
MimetypeMap.MIMETYPE_WORD,
@@ -69,7 +96,7 @@ public class OfficeMetadataExtracter extends AbstractMappingMetadataExtracter
@Override
protected Map extractRaw(ContentReader reader) throws Throwable
{
- final Map rawProperties = new HashMap(17);
+ final Map rawProperties = newRawMap();
POIFSReaderListener readerListener = new POIFSReaderListener()
{
@@ -82,11 +109,21 @@ public class OfficeMetadataExtracter extends AbstractMappingMetadataExtracter
{
SummaryInformation si = (SummaryInformation) ps;
- putSafeRawValue(PROP_AUTHOR, si.getAuthor(), rawProperties);
- putSafeRawValue(PROP_TITLE, si.getTitle(), rawProperties);
- putSafeRawValue(PROP_SUBJECT, si.getSubject(), rawProperties);
- putSafeRawValue(PROP_CREATE_DATETIME, si.getCreateDateTime(), rawProperties);
- putSafeRawValue(PROP_LAST_SAVE_DATETIME, si.getLastSaveDateTime(), rawProperties);
+ putRawValue(KEY_AUTHOR, si.getAuthor(), rawProperties);
+ putRawValue(KEY_TITLE, si.getTitle(), rawProperties);
+ putRawValue(KEY_SUBJECT, si.getSubject(), rawProperties);
+ putRawValue(KEY_CREATE_DATETIME, si.getCreateDateTime(), rawProperties);
+ putRawValue(KEY_LAST_SAVE_DATETIME, si.getLastSaveDateTime(), rawProperties);
+ putRawValue(KEY_COMMENTS, si.getComments(), rawProperties);
+ putRawValue(KEY_EDIT_TIME, si.getEditTime(), rawProperties);
+ putRawValue(KEY_FORMAT, si.getFormat(), rawProperties);
+ putRawValue(KEY_KEYWORDS, si.getKeywords(), rawProperties);
+ putRawValue(KEY_LAST_AUTHOR, si.getLastAuthor(), rawProperties);
+ putRawValue(KEY_LAST_PRINTED, si.getLastPrinted(), rawProperties);
+ putRawValue(KEY_OS_VERSION, si.getOSVersion(), rawProperties);
+ putRawValue(KEY_THUMBNAIL, si.getThumbnail(), rawProperties);
+ putRawValue(KEY_PAGE_COUNT, si.getPageCount(), rawProperties);
+ putRawValue(KEY_WORD_COUNT, si.getWordCount(), rawProperties);
}
}
catch (Exception ex)
diff --git a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracterTest.java
index f31ab28116..70bd2dd344 100644
--- a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracterTest.java
+++ b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracterTest.java
@@ -2,7 +2,7 @@ package org.alfresco.repo.content.metadata;
/**
- * @see org.alfresco.repo.content.transform.OfficeMetadataExtracter
+ * @see OfficeMetadataExtracter
*
* @author Jesper Steen Møller
*/
@@ -26,12 +26,12 @@ public class OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest
return extracter;
}
- public void testReliability() throws Exception
+ public void testSupports() throws Exception
{
for (String mimetype : OfficeMetadataExtracter.SUPPORTED_MIMETYPES)
{
- double reliability = extracter.getReliability(mimetype);
- assertTrue("Expected above zero reliability", reliability > 0.0);
+ boolean supports = extracter.isSupported(mimetype);
+ assertTrue("Mimetype should be supported: " + mimetype, supports);
}
}
diff --git a/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java
index bf88f49bb3..039fc249c0 100644
--- a/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java
@@ -29,9 +29,10 @@ import java.io.InputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
+import java.util.Hashtable;
import java.util.Map;
+import java.util.Set;
-import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
@@ -43,12 +44,39 @@ import com.catcode.odf.OpenDocumentMetadata;
* Metadata extractor for the
* {@link org.alfresco.repo.content.MimetypeMap#MIMETYPE_OPENDOCUMENT_TEXT MIMETYPE_OPENDOCUMENT_XXX}
* mimetypes.
+ *
+ * creationDate: -- cm:created
+ * creator: -- cm:author
+ * date:
+ * description:
+ * generator:
+ * initialCreator:
+ * keyword:
+ * language:
+ * printDate:
+ * printedBy:
+ * subject: -- cm:description
+ * title: -- cm:title
+ *
*
* @author Antti Jokipii
*/
-public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter
+public class OpenDocumentMetadataExtracter extends AbstractMappingMetadataExtracter
{
- private static String[] mimeTypes = new String[] {
+ private static final String KEY_CREATION_DATE = "creationDate";
+ private static final String KEY_CREATOR = "creator";
+ private static final String KEY_DATE = "date";
+ private static final String KEY_DESCRIPTION = "description";
+ private static final String KEY_GENERATOR = "generator";
+ private static final String KEY_INITIAL_CREATOR = "initialCreator";
+ private static final String KEY_KEYWORD = "keyword";
+ private static final String KEY_LANGUAGE = "language";
+ private static final String KEY_PRINT_DATE = "printDate";
+ private static final String KEY_PRINTED_BY = "printedBy";
+ private static final String KEY_SUBJECT = "subject";
+ private static final String KEY_TITLE = "title";
+
+ public static String[] SUPPORTED_MIMETYPES = new String[] {
MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT,
MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_TEMPLATE,
MimetypeMap.MIMETYPE_OPENDOCUMENT_GRAPHICS,
@@ -65,15 +93,18 @@ public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter
MimetypeMap.MIMETYPE_OPENDOCUMENT_FORMULA_TEMPLATE,
MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_MASTER,
MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_WEB,
- MimetypeMap.MIMETYPE_OPENDOCUMENT_DATABASE, };
+ MimetypeMap.MIMETYPE_OPENDOCUMENT_DATABASE };
public OpenDocumentMetadataExtracter()
{
- super(new HashSet(Arrays.asList(mimeTypes)), 1.00, 1000);
+ super(new HashSet(Arrays.asList(SUPPORTED_MIMETYPES)));
}
- public void extractInternal(ContentReader reader, Map destination) throws Throwable
+ @Override
+ public Map extractRaw(ContentReader reader) throws Throwable
{
+ Map rawProperties = newRawMap();
+
ODFMetaFileAnalyzer analyzer = new ODFMetaFileAnalyzer();
InputStream is = null;
try
@@ -84,11 +115,34 @@ public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter
if (docInfo != null)
{
- // set the metadata
- destination.put(ContentModel.PROP_AUTHOR, docInfo.getCreator());
- destination.put(ContentModel.PROP_TITLE, docInfo.getTitle());
- destination.put(ContentModel.PROP_DESCRIPTION, docInfo.getDescription());
- destination.put(ContentModel.PROP_CREATED, docInfo.getCreationDate());
+ putRawValue(KEY_CREATION_DATE, docInfo.getCreationDate(), rawProperties);
+ putRawValue(KEY_CREATOR, docInfo.getCreator(), rawProperties);
+ putRawValue(KEY_DATE, docInfo.getDate(), rawProperties);
+ putRawValue(KEY_DESCRIPTION, docInfo.getDescription(), rawProperties);
+ putRawValue(KEY_GENERATOR, docInfo.getGenerator(), rawProperties);
+ putRawValue(KEY_INITIAL_CREATOR, docInfo.getInitialCreator(), rawProperties);
+ putRawValue(KEY_KEYWORD, docInfo.getKeyword(), rawProperties);
+ putRawValue(KEY_LANGUAGE, docInfo.getLanguage(), rawProperties);
+ putRawValue(KEY_PRINT_DATE, docInfo.getPrintDate(), rawProperties);
+ putRawValue(KEY_PRINTED_BY, docInfo.getPrintedBy(), rawProperties);
+ putRawValue(KEY_SUBJECT, docInfo.getSubject(), rawProperties);
+ putRawValue(KEY_TITLE, docInfo.getTitle(), rawProperties);
+
+ // Handle user-defined properties dynamically
+ Map> mapping = super.getMapping();
+ Hashtable userDefinedProperties = docInfo.getUserDefined();
+ // Extract those user properties for which there is a mapping
+ for (String key : mapping.keySet())
+ {
+ if (userDefinedProperties.containsKey(key))
+ {
+ Object value = userDefinedProperties.get(key);
+ if (value != null && value instanceof Serializable)
+ {
+ putRawValue(key, (Serializable) value, rawProperties);
+ }
+ }
+ }
}
}
finally
@@ -98,5 +152,7 @@ public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter
try { is.close(); } catch (IOException e) {}
}
}
+ // Done
+ return rawProperties;
}
}
\ No newline at end of file
diff --git a/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.properties b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.properties
new file mode 100644
index 0000000000..a74de9d296
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.properties
@@ -0,0 +1,21 @@
+#
+# OpenDocumentMetadataExtracter - default mapping
+#
+# author: Derek Hulley
+
+# Namespaces
+namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
+
+# Mappings
+creationDate=cm:created
+creator=cm:author
+date=
+description=
+generator=
+initialCreator=
+keyword=
+language=
+printDate=
+printedBy=
+subject=cm:description
+title=cm:title
diff --git a/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracterTest.java
new file mode 100644
index 0000000000..5aa9fc4a29
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracterTest.java
@@ -0,0 +1,48 @@
+package org.alfresco.repo.content.metadata;
+
+
+/**
+ * @see OpenDocumentMetadataExtracter
+ *
+ * @author Derek Hulley
+ */
+public class OpenDocumentMetadataExtracterTest extends AbstractMetadataExtracterTest
+{
+ private OpenDocumentMetadataExtracter extracter;
+
+ @Override
+ public void setUp() throws Exception
+ {
+ super.setUp();
+ extracter = new OpenDocumentMetadataExtracter();
+ extracter.register();
+ }
+
+ /**
+ * @return Returns the same extracter regardless - it is allowed
+ */
+ protected MetadataExtracter getExtracter()
+ {
+ return extracter;
+ }
+
+ public void testSupports() throws Exception
+ {
+ for (String mimetype : OpenDocumentMetadataExtracter.SUPPORTED_MIMETYPES)
+ {
+ boolean supports = extracter.isSupported(mimetype);
+ assertTrue("Mimetype should be supported: " + mimetype, supports);
+ }
+ }
+
+ /**
+ * Test all the supported mimetypes
+ */
+ public void testSupportedMimetypes() throws Exception
+ {
+ for (String mimetype : OpenDocumentMetadataExtracter.SUPPORTED_MIMETYPES)
+ {
+ testExtractFromMimetype(mimetype);
+ }
+ }
+}
diff --git a/source/test-resources/quick/quick.odf b/source/test-resources/quick/quick.odf
index 963f9f77c6..a8987b6aad 100644
Binary files a/source/test-resources/quick/quick.odf and b/source/test-resources/quick/quick.odf differ
diff --git a/source/test-resources/quick/quick.odg b/source/test-resources/quick/quick.odg
index a383a283aa..3ab30c124f 100644
Binary files a/source/test-resources/quick/quick.odg and b/source/test-resources/quick/quick.odg differ
diff --git a/source/test-resources/quick/quick.otg b/source/test-resources/quick/quick.otg
new file mode 100644
index 0000000000..4e907fb79a
Binary files /dev/null and b/source/test-resources/quick/quick.otg differ
diff --git a/source/test-resources/quick/quick.otp b/source/test-resources/quick/quick.otp
new file mode 100644
index 0000000000..0acaf80c56
Binary files /dev/null and b/source/test-resources/quick/quick.otp differ
diff --git a/source/test-resources/quick/quick.ots b/source/test-resources/quick/quick.ots
new file mode 100644
index 0000000000..0d0f6bac1e
Binary files /dev/null and b/source/test-resources/quick/quick.ots differ
diff --git a/source/test-resources/quick/quick.ott b/source/test-resources/quick/quick.ott
new file mode 100644
index 0000000000..8e392f0cc8
Binary files /dev/null and b/source/test-resources/quick/quick.ott differ