properties);
/**
* This method can be overridden to cause the author/creator property check to be skipped.
- * The default behaviour is for the check to be skipped for all MIME types.
+ * The default behaviour is for the check not to be skipped for all MIME types.
*
* @param mimetype
* @return true to skip the checks, else false
@@ -201,6 +194,18 @@ public abstract class AbstractMetadataExtracterTest extends TestCase
return false;
}
+ /**
+ * Hook that a subclass can override to cause the description property check to be skipped.
+ * This default implementation never skips the check, whatever MIME type is passed in.
+ *
+ * @param mimetype the MIME type the check would apply to (not examined by this default implementation)
+ * @return true to skip the description check, else false; this implementation always returns false
+ */
+ protected boolean skipDescriptionCheck(String mimetype)
+ {
+ return false;
+ }
+
public void testZeroLengthFile() throws Exception
{
diff --git a/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracter.java
index 0875071f95..0d54f017e3 100644
--- a/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracter.java
+++ b/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracter.java
@@ -46,10 +46,7 @@ public class OpenOfficeMetadataExtracter extends AbstractMappingMetadataExtracte
MimetypeMap.MIMETYPE_STAROFFICE5_WRITER,
MimetypeMap.MIMETYPE_STAROFFICE5_IMPRESS,
MimetypeMap.MIMETYPE_OPENOFFICE1_WRITER,
- MimetypeMap.MIMETYPE_OPENOFFICE1_IMPRESS,
- MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING,
- MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET,
- MimetypeMap.MIMETYPE_OPENXML_PRESENTATION
+ MimetypeMap.MIMETYPE_OPENOFFICE1_IMPRESS
};
private OpenOfficeMetadataWorker worker;
diff --git a/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracterTest.java
index b111565106..606b8eacb1 100644
--- a/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracterTest.java
+++ b/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracterTest.java
@@ -112,10 +112,6 @@ public class OpenOfficeMetadataExtracterTest extends AbstractMetadataExtracterTe
mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_STAROFFICE5_IMPRESS);
mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_OPENOFFICE1_IMPRESS);
- // The following do have them, but they are not being returned by OOo
- mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET);
- mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION);
-
return mimeTypesWithNoAuthor.contains(mimetype);
}
diff --git a/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.java
new file mode 100644
index 0000000000..eb17b07a51
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.java
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.metadata;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Map;
+
+import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.service.cmr.repository.ContentReader;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLPropertiesTextExtractor;
+import org.apache.poi.POIXMLProperties.CoreProperties;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.xslf.XSLFSlideShow;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.xmlbeans.XmlException;
+
+/**
+ * POI-based metadata extractor for Office 07 (OOXML) documents.
+ * See http://poi.apache.org/ for information on POI.
+ * <p>
+ * The following raw values are read from the document's core properties:
+ * <pre>
+ *   author:       the core "creator" value
+ *   title:        the core "title" value
+ *   subject:      the core "subject" value
+ *   description:  the core "description" value
+ *   created:      the core "created" date
+ * </pre>
+ * The bundled default mapping (PoiMetadataExtracter.properties) maps author, title,
+ * description and created onto cm:author, cm:title, cm:description and cm:created.
+ * The subject value is extracted but has no default mapping, and custom document
+ * properties are not read at all.
+ * <p>
+ * TIKA Note - all the fields (plus a few others) are present
+ * in the tika metadata.
+ *
+ * @author Neil McErlean
+ */
+public class PoiMetadataExtracter extends AbstractMappingMetadataExtracter
+{
+    protected static Log logger = LogFactory.getLog(PoiMetadataExtracter.class);
+
+    // Raw property keys; these are the names the mapping properties file refers to.
+    private static final String KEY_AUTHOR = "author";
+    private static final String KEY_TITLE = "title";
+    private static final String KEY_SUBJECT = "subject";
+    private static final String KEY_CREATED = "created";
+    private static final String KEY_DESCRIPTION = "description";
+
+    /** The OOXML word processing, spreadsheet and presentation MIME types handled by this extracter. */
+    public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING,
+                                                               MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET,
+                                                               MimetypeMap.MIMETYPE_OPENXML_PRESENTATION};
+
+    /**
+     * Creates an extracter that declares support for {@link #SUPPORTED_MIMETYPES}.
+     */
+    public PoiMetadataExtracter()
+    {
+        super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
+    }
+
+    /**
+     * Reads the core properties (creator, title, subject, description, created) of the
+     * document behind the reader and returns them keyed by raw property name.
+     *
+     * @param reader supplies the content stream and the MIME type used to pick the POI document class
+     * @return the raw property map populated via <code>putRawValue</code>
+     * @throws IllegalArgumentException if the reader's MIME type is not one of {@link #SUPPORTED_MIMETYPES}
+     * @throws Throwable any failure raised while opening or parsing the document
+     */
+    @Override
+    public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
+    {
+        Map<String, Serializable> rawProperties = newRawMap();
+
+        InputStream is = null;
+        try
+        {
+            is = reader.getContentInputStream();
+            POIXMLDocument document = readDocumentFromStream(is, reader.getMimetype());
+
+            POIXMLPropertiesTextExtractor extracter = new POIXMLPropertiesTextExtractor(document);
+            CoreProperties coreProps = extracter.getCoreProperties();
+
+            putRawValue(KEY_AUTHOR, coreProps.getCreator(), rawProperties);
+            putRawValue(KEY_TITLE, coreProps.getTitle(), rawProperties);
+            putRawValue(KEY_SUBJECT, coreProps.getSubject(), rawProperties);
+            putRawValue(KEY_DESCRIPTION, coreProps.getDescription(), rawProperties);
+            putRawValue(KEY_CREATED, coreProps.getCreated(), rawProperties);
+        }
+        finally
+        {
+            if (is != null)
+            {
+                try
+                {
+                    is.close();
+                }
+                catch (IOException e)
+                {
+                    // Closing is best effort: a failure here must not mask the extraction
+                    // result or an exception already in flight, but it should be traceable.
+                    if (logger.isDebugEnabled())
+                    {
+                        logger.debug("Failed to close content stream after metadata extraction", e);
+                    }
+                }
+            }
+        }
+
+        return rawProperties;
+    }
+
+    /**
+     * Opens the stream as an OPC package and wraps it in the POI document class that
+     * matches the MIME type.
+     *
+     * @param is the document content; only read once the MIME type has been recognised
+     * @param mimetype one of {@link #SUPPORTED_MIMETYPES}
+     * @return the opened document, never <code>null</code>
+     * @throws IllegalArgumentException if the MIME type is not supported; previously this
+     *         case returned <code>null</code>, which was then handed to the properties extractor
+     */
+    private POIXMLDocument readDocumentFromStream(InputStream is, String mimetype)
+        throws IOException, OpenXML4JException, XmlException
+    {
+        if (MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING.equals(mimetype))
+        {
+            return new XWPFDocument(OPCPackage.open(is));
+        }
+        if (MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET.equals(mimetype))
+        {
+            return new XSSFWorkbook(OPCPackage.open(is));
+        }
+        if (MimetypeMap.MIMETYPE_OPENXML_PRESENTATION.equals(mimetype))
+        {
+            return new XSLFSlideShow(OPCPackage.open(is));
+        }
+
+        throw new IllegalArgumentException("Unsupported mimetype for POI metadata extraction: " + mimetype);
+    }
+}
diff --git a/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.properties b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.properties
new file mode 100644
index 0000000000..ebc8f9411c
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.properties
@@ -0,0 +1,14 @@
+#
+# PoiMetadataExtracter - default mapping
+#
+# author: Neil McErlean
+
+# Namespaces
+namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
+
+# Mappings
+author=cm:author
+title=cm:title
+description=cm:description
+created=cm:created
+
diff --git a/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracterTest.java
new file mode 100644
index 0000000000..22f5f6d6e6
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracterTest.java
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.metadata;
+
+import java.io.Serializable;
+import java.util.Map;
+
+import org.alfresco.model.ContentModel;
+import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
+import org.alfresco.service.namespace.QName;
+
+/**
+ * Tests {@link PoiMetadataExtracter} against the three Office 07 quick files
+ * (quick.docx, quick.xlsx and quick.pptx).
+ *
+ * @see org.alfresco.repo.content.metadata.PoiMetadataExtracter
+ *
+ * @author Neil McErlean
+ */
+public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
+{
+    private PoiMetadataExtracter extracter;
+
+    /**
+     * Creates and registers a fresh extracter wired to the dictionary service set up by the superclass.
+     */
+    @Override
+    public void setUp() throws Exception
+    {
+        super.setUp();
+        extracter = new PoiMetadataExtracter();
+        extracter.setDictionaryService(dictionaryService);
+        extracter.register();
+    }
+
+    /**
+     * @return the extracter instance created in {@link #setUp()}
+     */
+    @Override
+    protected MetadataExtracter getExtracter()
+    {
+        return extracter;
+    }
+
+    /**
+     * Every MIME type the extracter advertises must be reported as supported.
+     */
+    public void testSupports() throws Exception
+    {
+        for (String mimetype : PoiMetadataExtracter.SUPPORTED_MIMETYPES)
+        {
+            boolean supports = extracter.isSupported(mimetype);
+            assertTrue("Mimetype should be supported: " + mimetype, supports);
+        }
+    }
+
+    /**
+     * Runs the shared extraction test once per advertised MIME type.
+     */
+    public void testOffice2007Extraction() throws Exception
+    {
+        for (String mimetype : PoiMetadataExtracter.SUPPORTED_MIMETYPES)
+        {
+            testExtractFromMimetype(mimetype);
+        }
+    }
+
+    /**
+     * Always skips the description check, for every MIME type.
+     */
+    @Override
+    protected boolean skipDescriptionCheck(String mimetype)
+    {
+        // Our 3 Office 07 quick files have no description properties.
+        return true;
+    }
+
+    /**
+     * Checks the extracted creation date of each quick file against its known value.
+     *
+     * @param mimetype selects which quick file's expected date applies
+     * @param properties the extracted properties to inspect
+     */
+    @Override
+    protected void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties)
+    {
+        // This test class is testing 3 files: quick.docx, quick.xlsx & quick.pptx.
+        // Their created times are hard-coded here for checking.
+        // Of course this means that if the files are updated, the test will break
+        // but those files are rarely modified - only added to.
+        if (MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING.equals(mimetype))
+        {
+            checkFileCreationDate(mimetype, properties, "2010-01-06T17:32:00.000Z");
+        }
+        else if (MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET.equals(mimetype))
+        {
+            checkFileCreationDate(mimetype, properties, "1996-10-14T23:33:28.000+01:00");
+        }
+        else if (MimetypeMap.MIMETYPE_OPENXML_PRESENTATION.equals(mimetype))
+        {
+            // Extraordinary! This document predates Isaac Newton's Principia Mathematica by almost a century. ;)
+            checkFileCreationDate(mimetype, properties, "1601-01-01T00:00:00.000Z");
+        }
+    }
+
+    /**
+     * Asserts that the extracted cm:created value denotes the same instant as the given date string.
+     *
+     * @param mimetype used only in the failure message
+     * @param properties the extracted properties
+     * @param date the expected creation time as a date string with an explicit UTC offset
+     */
+    private void checkFileCreationDate(String mimetype, Map<QName, Serializable> properties, String date)
+    {
+        // Compare instants rather than formatted strings. The expected values carry mixed
+        // offsets (Z and +01:00), so a string comparison presumably only passes when the
+        // JVM default timezone matches the one the test was written in - TODO confirm.
+        java.util.Date expected = DefaultTypeConverter.INSTANCE.convert(java.util.Date.class, date);
+        java.util.Date actual = DefaultTypeConverter.INSTANCE.convert(
+                java.util.Date.class, properties.get(ContentModel.PROP_CREATED));
+        assertEquals("Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype,
+                expected, actual);
+    }
+}