diff --git a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java index ed80676a2a..dd91e4d116 100644 --- a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java @@ -26,6 +26,7 @@ package org.alfresco.repo.content.metadata; import java.io.InputStream; import java.io.Serializable; +import java.lang.reflect.Array; import java.util.Collection; import java.util.Collections; import java.util.HashMap; @@ -62,7 +63,7 @@ import org.apache.commons.logging.LogFactory; *
  • * Implement the {@link extractInternal} method. This now returns a raw map of extracted * values keyed by document-specific property names. The trimPut method has - * been replaced with an equivalent {@link #putSafeRawValue(String, Object, Map)}. + * been replaced with an equivalent {@link #putRawValue(String, Serializable, Map)}. *
  • *
  • * Provide the default mapping of the document-specific properties to system-specific @@ -241,6 +242,28 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac mapping = readMappingProperties(mappingProperties); } + /** + * Helper method for derived classes to obtain the mappings that will be applied to raw + * values. This should be called after initialization in order to guarantee the complete + * map is given. + *

    + * Normally, the list of properties that can be extracted from a document is fixed and + * well-known - in that case, just extract everything. But Some implementations may have + * an extra, indeterminate set of values available for extraction. If the extraction of + * these runtime parameters is expensive, then the keys provided by the return value can + * be used to extract values from the documents. The metadata extraction becomes fully + * configuration-driven, i.e. declaring further mappings will result in more values being + * extracted from the documents. + */ + protected final Map> getMapping() + { + if (!initialized) + { + throw new UnsupportedOperationException("The complete mapping is only available after initialization."); + } + return Collections.unmodifiableMap(mapping); + } + /** * A utility method to read mapping properties from a resource file and convert to the map form. * @@ -566,17 +589,26 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac } return systemProperties; } - + /** - * Examines a value or string for nulls and adds it to the map (if non-empty). If the value - * is non-Serializable, then the toString representation used directly. + * Adds a value to the map if it is non-trivial. A value is trivial if + *

    + * String values are trimmed before being put into the map. + * Otherwise, it is up to the extracter to ensure that the value is a Serializable. + * It is not appropriate to implicitly convert values in order to make them Serializable + * - the best conversion method will depend on the value's specific meaning. * - * @param key the destination map key - * @param value the value to check and put. - * @param destination map to put values into - * @return Returns true if set, false otherwise + * @param key the destination key + * @param value the serializable value + * @param destination the map to put values into + * @return Returns true if set, otherwise false */ - protected boolean putSafeRawValue(String key, Object value, Map destination) + protected boolean putRawValue(String key, Serializable value, Map destination) { if (value == null) { @@ -584,25 +616,47 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac } if (value instanceof String) { - String svalue = ((String) value).trim(); - if (svalue.length() > 0) + String valueStr = ((String) value).trim(); + if (valueStr.length() == 0) { - destination.put(key, svalue); - return true; + return false; + } + else + { + // Keep the trimmed value + value = valueStr; } - return false; } - else if (value instanceof Serializable) + else if (value instanceof Collection) { - destination.put(key, (Serializable) value); + Collection valueCollection = (Collection) value; + if (valueCollection.isEmpty()) + { + return false; + } } - else + else if (value.getClass().isArray()) { - destination.put(key, value.toString()); + if (Array.getLength(value) == 0) + { + return false; + } } + // It passed all the tests + destination.put(key, value); return true; } + /** + * Helper method to fetch a clean map into which raw values can be dumped. + * + * @return Returns an empty map + */ + protected final Map newRawMap() + { + return new HashMap(17); + } + /** * This method provides a best guess of where to store the values extracted * from the documents. The list of properties mapped by default need not diff --git a/source/java/org/alfresco/repo/content/metadata/AbstractMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/AbstractMetadataExtracterTest.java index 526e3ef48c..249c5fc709 100644 --- a/source/java/org/alfresco/repo/content/metadata/AbstractMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/AbstractMetadataExtracterTest.java @@ -86,9 +86,16 @@ public abstract class AbstractMetadataExtracterTest extends TestCase protected void testExtractFromMimetype(String mimetype) throws Exception { - Map properties = extractFromMimetype(mimetype); - // check - testCommonMetadata(mimetype, properties); + try + { + Map properties = extractFromMimetype(mimetype); + // check + testCommonMetadata(mimetype, properties); + } + catch (FileNotFoundException e) + { + // The test file is not there. We won't fail it. + } } protected Map extractFromMimetype(String mimetype) throws Exception diff --git a/source/java/org/alfresco/repo/content/metadata/MetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/MetadataExtracter.java index 8759f2f078..96a0e35afb 100644 --- a/source/java/org/alfresco/repo/content/metadata/MetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/MetadataExtracter.java @@ -45,6 +45,7 @@ public interface MetadataExtracter * written into the property map or not. * * @author Derek Hulley + * @author Jesper Steen Møller */ public enum OverwritePolicy { diff --git a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java index 4f2b36f091..9b878b7917 100644 --- a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java @@ -28,7 +28,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.Serializable; import java.util.Arrays; -import java.util.HashMap; import java.util.HashSet; import java.util.Map; @@ -43,18 +42,46 @@ import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent; import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener; /** - * Office file format Metadata Extracter + * Office file format Metadata Extracter. This extracter uses the POI library to extract + * the following: + *
    + *   author:             --      cm:author
    + *   title:              --      cm:title
    + *   subject:            --      cm:description
    + *   createDateTime:     --      cm:created
    + *   lastSaveDateTime:   --      cm:modified
    + *   comments:
    + *   editTime:
    + *   format:
    + *   keywords:
    + *   lastAuthor:
    + *   lastPrinted:
    + *   osVersion:
    + *   thumbnail:
    + *   pageCount:
    + *   wordCount:
    + * 
    * * @author Jesper Steen Møller * @author Derek Hulley */ public class OfficeMetadataExtracter extends AbstractMappingMetadataExtracter { - public static final String PROP_AUTHOR = "author"; - public static final String PROP_TITLE = "title"; - public static final String PROP_SUBJECT = "subject"; - public static final String PROP_CREATE_DATETIME = "createDateTime"; - public static final String PROP_LAST_SAVE_DATETIME = "lastSaveDateTime"; + public static final String KEY_AUTHOR = "author"; + public static final String KEY_TITLE = "title"; + public static final String KEY_SUBJECT = "subject"; + public static final String KEY_CREATE_DATETIME = "createDateTime"; + public static final String KEY_LAST_SAVE_DATETIME = "lastSaveDateTime"; + public static final String KEY_COMMENTS = "comments"; + public static final String KEY_EDIT_TIME = "editTime"; + public static final String KEY_FORMAT = "format"; + public static final String KEY_KEYWORDS = "keywords"; + public static final String KEY_LAST_AUTHOR = "lastAuthor"; + public static final String KEY_LAST_PRINTED = "lastPrinted"; + public static final String KEY_OS_VERSION = "osVersion"; + public static final String KEY_THUMBNAIL = "thumbnail"; + public static final String KEY_PAGE_COUNT = "pageCount"; + public static final String KEY_WORD_COUNT = "wordCount"; public static String[] SUPPORTED_MIMETYPES = new String[] { MimetypeMap.MIMETYPE_WORD, @@ -69,7 +96,7 @@ public class OfficeMetadataExtracter extends AbstractMappingMetadataExtracter @Override protected Map extractRaw(ContentReader reader) throws Throwable { - final Map rawProperties = new HashMap(17); + final Map rawProperties = newRawMap(); POIFSReaderListener readerListener = new POIFSReaderListener() { @@ -82,11 +109,21 @@ public class OfficeMetadataExtracter extends AbstractMappingMetadataExtracter { SummaryInformation si = (SummaryInformation) ps; - putSafeRawValue(PROP_AUTHOR, si.getAuthor(), rawProperties); - putSafeRawValue(PROP_TITLE, si.getTitle(), rawProperties); - putSafeRawValue(PROP_SUBJECT, si.getSubject(), rawProperties); - putSafeRawValue(PROP_CREATE_DATETIME, si.getCreateDateTime(), rawProperties); - putSafeRawValue(PROP_LAST_SAVE_DATETIME, si.getLastSaveDateTime(), rawProperties); + putRawValue(KEY_AUTHOR, si.getAuthor(), rawProperties); + putRawValue(KEY_TITLE, si.getTitle(), rawProperties); + putRawValue(KEY_SUBJECT, si.getSubject(), rawProperties); + putRawValue(KEY_CREATE_DATETIME, si.getCreateDateTime(), rawProperties); + putRawValue(KEY_LAST_SAVE_DATETIME, si.getLastSaveDateTime(), rawProperties); + putRawValue(KEY_COMMENTS, si.getComments(), rawProperties); + putRawValue(KEY_EDIT_TIME, si.getEditTime(), rawProperties); + putRawValue(KEY_FORMAT, si.getFormat(), rawProperties); + putRawValue(KEY_KEYWORDS, si.getKeywords(), rawProperties); + putRawValue(KEY_LAST_AUTHOR, si.getLastAuthor(), rawProperties); + putRawValue(KEY_LAST_PRINTED, si.getLastPrinted(), rawProperties); + putRawValue(KEY_OS_VERSION, si.getOSVersion(), rawProperties); + putRawValue(KEY_THUMBNAIL, si.getThumbnail(), rawProperties); + putRawValue(KEY_PAGE_COUNT, si.getPageCount(), rawProperties); + putRawValue(KEY_WORD_COUNT, si.getWordCount(), rawProperties); } } catch (Exception ex) diff --git a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracterTest.java index f31ab28116..70bd2dd344 100644 --- a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracterTest.java @@ -2,7 +2,7 @@ package org.alfresco.repo.content.metadata; /** - * @see org.alfresco.repo.content.transform.OfficeMetadataExtracter + * @see OfficeMetadataExtracter * * @author Jesper Steen Møller */ @@ -26,12 +26,12 @@ public class OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest return extracter; } - public void testReliability() throws Exception + public void testSupports() throws Exception { for (String mimetype : OfficeMetadataExtracter.SUPPORTED_MIMETYPES) { - double reliability = extracter.getReliability(mimetype); - assertTrue("Expected above zero reliability", reliability > 0.0); + boolean supports = extracter.isSupported(mimetype); + assertTrue("Mimetype should be supported: " + mimetype, supports); } } diff --git a/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java index bf88f49bb3..039fc249c0 100644 --- a/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java @@ -29,9 +29,10 @@ import java.io.InputStream; import java.io.Serializable; import java.util.Arrays; import java.util.HashSet; +import java.util.Hashtable; import java.util.Map; +import java.util.Set; -import org.alfresco.model.ContentModel; import org.alfresco.repo.content.MimetypeMap; import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.namespace.QName; @@ -43,12 +44,39 @@ import com.catcode.odf.OpenDocumentMetadata; * Metadata extractor for the * {@link org.alfresco.repo.content.MimetypeMap#MIMETYPE_OPENDOCUMENT_TEXT MIMETYPE_OPENDOCUMENT_XXX} * mimetypes. + *
    + *   creationDate:           --      cm:created
    + *   creator:                --      cm:author
    + *   date:
    + *   description:            --      cm:description
    + *   generator:
    + *   initialCreator:
    + *   keyword:
    + *   language:
    + *   printDate:
    + *   printedBy:
    + *   subject:
    + *   title:                  --      cm:title
    + * 
    * * @author Antti Jokipii */ -public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter +public class OpenDocumentMetadataExtracter extends AbstractMappingMetadataExtracter { - private static String[] mimeTypes = new String[] { + private static final String KEY_CREATION_DATE = "creationDate"; + private static final String KEY_CREATOR = "creator"; + private static final String KEY_DATE = "date"; + private static final String KEY_DESCRIPTION = "description"; + private static final String KEY_GENERATOR = "generator"; + private static final String KEY_INITIAL_CREATOR = "initialCreator"; + private static final String KEY_KEYWORD = "keyword"; + private static final String KEY_LANGUAGE = "language"; + private static final String KEY_PRINT_DATE = "printDate"; + private static final String KEY_PRINTED_BY = "printedBy"; + private static final String KEY_SUBJECT = "subject"; + private static final String KEY_TITLE = "title"; + + public static String[] SUPPORTED_MIMETYPES = new String[] { MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT, MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_TEMPLATE, MimetypeMap.MIMETYPE_OPENDOCUMENT_GRAPHICS, @@ -65,15 +93,18 @@ public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter MimetypeMap.MIMETYPE_OPENDOCUMENT_FORMULA_TEMPLATE, MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_MASTER, MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_WEB, - MimetypeMap.MIMETYPE_OPENDOCUMENT_DATABASE, }; + MimetypeMap.MIMETYPE_OPENDOCUMENT_DATABASE }; public OpenDocumentMetadataExtracter() { - super(new HashSet(Arrays.asList(mimeTypes)), 1.00, 1000); + super(new HashSet(Arrays.asList(SUPPORTED_MIMETYPES))); } - public void extractInternal(ContentReader reader, Map destination) throws Throwable + @Override + public Map extractRaw(ContentReader reader) throws Throwable { + Map rawProperties = newRawMap(); + ODFMetaFileAnalyzer analyzer = new ODFMetaFileAnalyzer(); InputStream is = null; try @@ -84,11 +115,34 @@ public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter if (docInfo != null) { - // set the metadata - destination.put(ContentModel.PROP_AUTHOR, docInfo.getCreator()); - destination.put(ContentModel.PROP_TITLE, docInfo.getTitle()); - destination.put(ContentModel.PROP_DESCRIPTION, docInfo.getDescription()); - destination.put(ContentModel.PROP_CREATED, docInfo.getCreationDate()); + putRawValue(KEY_CREATION_DATE, docInfo.getCreationDate(), rawProperties); + putRawValue(KEY_CREATOR, docInfo.getCreator(), rawProperties); + putRawValue(KEY_DATE, docInfo.getDate(), rawProperties); + putRawValue(KEY_DESCRIPTION, docInfo.getDescription(), rawProperties); + putRawValue(KEY_GENERATOR, docInfo.getGenerator(), rawProperties); + putRawValue(KEY_INITIAL_CREATOR, docInfo.getInitialCreator(), rawProperties); + putRawValue(KEY_KEYWORD, docInfo.getKeyword(), rawProperties); + putRawValue(KEY_LANGUAGE, docInfo.getLanguage(), rawProperties); + putRawValue(KEY_PRINT_DATE, docInfo.getPrintDate(), rawProperties); + putRawValue(KEY_PRINTED_BY, docInfo.getPrintedBy(), rawProperties); + putRawValue(KEY_SUBJECT, docInfo.getSubject(), rawProperties); + putRawValue(KEY_TITLE, docInfo.getTitle(), rawProperties); + + // Handle user-defined properties dynamically + Map> mapping = super.getMapping(); + Hashtable userDefinedProperties = docInfo.getUserDefined(); + // Extract those user properties for which there is a mapping + for (String key : mapping.keySet()) + { + if (userDefinedProperties.containsKey(key)) + { + Object value = userDefinedProperties.get(key); + if (value != null && value instanceof Serializable) + { + putRawValue(key, (Serializable) value, rawProperties); + } + } + } } } finally @@ -98,5 +152,7 @@ public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter try { is.close(); } catch (IOException e) {} } } + // Done + return rawProperties; } } \ No newline at end of file diff --git a/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.properties b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.properties new file mode 100644 index 0000000000..a74de9d296 --- /dev/null +++ b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.properties @@ -0,0 +1,21 @@ +# +# OpenDocumentMetadataExtracter - default mapping +# +# author: Derek Hulley + +# Namespaces +namespace.prefix.cm=http://www.alfresco.org/model/content/1.0 + +# Mappings +creationDate=cm:created +creator=cm:author +date= +description= +generator= +initialCreator= +keyword= +language= +printDate= +printedBy= +subject=cm:description +title=cm:title diff --git a/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracterTest.java new file mode 100644 index 0000000000..5aa9fc4a29 --- /dev/null +++ b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracterTest.java @@ -0,0 +1,48 @@ +package org.alfresco.repo.content.metadata; + + +/** + * @see OpenDocumentMetadataExtracter + * + * @author Derek Hulley + */ +public class OpenDocumentMetadataExtracterTest extends AbstractMetadataExtracterTest +{ + private OpenDocumentMetadataExtracter extracter; + + @Override + public void setUp() throws Exception + { + super.setUp(); + extracter = new OpenDocumentMetadataExtracter(); + extracter.register(); + } + + /** + * @return Returns the same transformer regardless - it is allowed + */ + protected MetadataExtracter getExtracter() + { + return extracter; + } + + public void testSupports() throws Exception + { + for (String mimetype : OpenDocumentMetadataExtracter.SUPPORTED_MIMETYPES) + { + boolean supports = extracter.isSupported(mimetype); + assertTrue("Mimetype should be supported: " + mimetype, supports); + } + } + + /** + * Test all the supported mimetypes + */ + public void testSupportedMimetypes() throws Exception + { + for (String mimetype : OpenDocumentMetadataExtracter.SUPPORTED_MIMETYPES) + { + testExtractFromMimetype(mimetype); + } + } +} diff --git a/source/test-resources/quick/quick.odf b/source/test-resources/quick/quick.odf index 963f9f77c6..a8987b6aad 100644 Binary files a/source/test-resources/quick/quick.odf and b/source/test-resources/quick/quick.odf differ diff --git a/source/test-resources/quick/quick.odg b/source/test-resources/quick/quick.odg index a383a283aa..3ab30c124f 100644 Binary files a/source/test-resources/quick/quick.odg and b/source/test-resources/quick/quick.odg differ diff --git a/source/test-resources/quick/quick.otg b/source/test-resources/quick/quick.otg new file mode 100644 index 0000000000..4e907fb79a Binary files /dev/null and b/source/test-resources/quick/quick.otg differ diff --git a/source/test-resources/quick/quick.otp b/source/test-resources/quick/quick.otp new file mode 100644 index 0000000000..0acaf80c56 Binary files /dev/null and b/source/test-resources/quick/quick.otp differ diff --git a/source/test-resources/quick/quick.ots b/source/test-resources/quick/quick.ots new file mode 100644 index 0000000000..0d0f6bac1e Binary files /dev/null and b/source/test-resources/quick/quick.ots differ diff --git a/source/test-resources/quick/quick.ott b/source/test-resources/quick/quick.ott new file mode 100644 index 0000000000..8e392f0cc8 Binary files /dev/null and b/source/test-resources/quick/quick.ott differ