diff --git a/config/alfresco/content-services-context.xml b/config/alfresco/content-services-context.xml index 905158d40a..0ac102679f 100644 --- a/config/alfresco/content-services-context.xml +++ b/config/alfresco/content-services-context.xml @@ -210,6 +210,7 @@ + diff --git a/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java b/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java index 290e659bdb..a209110ab1 100644 --- a/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java +++ b/source/java/org/alfresco/repo/content/ContentMinimalContextTestSuite.java @@ -25,6 +25,7 @@ import org.alfresco.repo.content.metadata.OfficeMetadataExtracterTest; import org.alfresco.repo.content.metadata.OpenDocumentMetadataExtracterTest; import org.alfresco.repo.content.metadata.OpenOfficeMetadataExtracterTest; import org.alfresco.repo.content.metadata.PdfBoxMetadataExtracterTest; +import org.alfresco.repo.content.metadata.PoiMetadataExtracterTest; import org.alfresco.repo.content.metadata.RFC822MetadataExtracterTest; import org.alfresco.repo.content.transform.BinaryPassThroughContentTransformerTest; import org.alfresco.repo.content.transform.ComplexContentTransformerTest; @@ -86,6 +87,7 @@ public class ContentMinimalContextTestSuite extends TestSuite suite.addTestSuite( OpenDocumentMetadataExtracterTest.class ); suite.addTestSuite( OpenOfficeMetadataExtracterTest.class ); suite.addTestSuite( PdfBoxMetadataExtracterTest.class ); + suite.addTestSuite( PoiMetadataExtracterTest.class ); suite.addTestSuite( RFC822MetadataExtracterTest.class ); // Transform tests diff --git a/source/java/org/alfresco/repo/content/metadata/AbstractMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/AbstractMetadataExtracterTest.java index 4160920023..ca7969855c 100644 --- a/source/java/org/alfresco/repo/content/metadata/AbstractMetadataExtracterTest.java +++ 
b/source/java/org/alfresco/repo/content/metadata/AbstractMetadataExtracterTest.java @@ -104,13 +104,8 @@ public abstract class AbstractMetadataExtracterTest extends TestCase Map properties = extractFromMimetype(mimetype); // check we got something - // Properties come back null-valued back for author, title, description for xlsx & pptx - if (mimetype.equals(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET) == false && - mimetype.equals(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION) == false) - { - assertFalse("extractFromMimetype should return at least some properties, none found for " + mimetype, - properties.isEmpty()); - } + assertFalse("extractFromMimetype should return at least some properties, none found for " + mimetype, + properties.isEmpty()); // check common metadata testCommonMetadata(mimetype, properties); @@ -174,24 +169,22 @@ public abstract class AbstractMetadataExtracterTest extends TestCase } // Title and description - if (mimetype.equals(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET) || - mimetype.equals(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION)) { - return; - } assertEquals( "Property " + ContentModel.PROP_TITLE + " not found for mimetype " + mimetype, QUICK_TITLE, DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_TITLE))); - assertEquals( - "Property " + ContentModel.PROP_DESCRIPTION + " not found for mimetype " + mimetype, - QUICK_DESCRIPTION, - DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION))); + if (!skipDescriptionCheck(mimetype)) { + assertEquals( + "Property " + ContentModel.PROP_DESCRIPTION + " not found for mimetype " + mimetype, + QUICK_DESCRIPTION, + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION))); + } } protected abstract void testFileSpecificMetadata(String mimetype, Map properties); /** * This method can be overridden to cause the author/creator property check to be skipped. 
- * The default behaviour is for the check to be skipped for all MIME types. + * The default behaviour is for the check to be performed for every MIME type. * * @param mimetype * @return true to skip the checks, else false @@ -201,6 +194,18 @@ public abstract class AbstractMetadataExtracterTest extends TestCase return false; } + /** + * This method can be overridden to cause the description property check to be skipped. + * The default behaviour is for the check to be performed for every MIME type. + * + * @param mimetype + * @return true to skip the checks, else false + */ + protected boolean skipDescriptionCheck(String mimetype) + { + return false; + } + public void testZeroLengthFile() throws Exception { diff --git a/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracter.java index 0875071f95..0d54f017e3 100644 --- a/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracter.java @@ -46,10 +46,7 @@ public class OpenOfficeMetadataExtracter extends AbstractMappingMetadataExtracte MimetypeMap.MIMETYPE_STAROFFICE5_WRITER, MimetypeMap.MIMETYPE_STAROFFICE5_IMPRESS, MimetypeMap.MIMETYPE_OPENOFFICE1_WRITER, - MimetypeMap.MIMETYPE_OPENOFFICE1_IMPRESS, - MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, - MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, - MimetypeMap.MIMETYPE_OPENXML_PRESENTATION + MimetypeMap.MIMETYPE_OPENOFFICE1_IMPRESS }; private OpenOfficeMetadataWorker worker; diff --git a/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracterTest.java index b111565106..606b8eacb1 100644 --- a/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracterTest.java @@ 
-112,10 +112,6 @@ public class OpenOfficeMetadataExtracterTest extends AbstractMetadataExtracterTe mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_STAROFFICE5_IMPRESS); mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_OPENOFFICE1_IMPRESS); - // The following do have them, but they are not being returned by OOo - mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET); - mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION); - return mimeTypesWithNoAuthor.contains(mimetype); } diff --git a/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.java new file mode 100644 index 0000000000..eb17b07a51 --- /dev/null +++ b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.java @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2005-2010 Alfresco Software Limited. + * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see . 
+ */ +package org.alfresco.repo.content.metadata; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Map; + +import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.service.cmr.repository.ContentReader; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.poi.POIXMLDocument; +import org.apache.poi.POIXMLPropertiesTextExtractor; +import org.apache.poi.POIXMLProperties.CoreProperties; +import org.apache.poi.openxml4j.exceptions.OpenXML4JException; +import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.xslf.XSLFSlideShow; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.apache.xmlbeans.XmlException; + +/** + * POI-based metadata extractor for Office 07 documents. + * See http://poi.apache.org/ for information on POI. + *
+ *   author:                 --      cm:author
+ *   title:                  --      cm:title
+ *   subject:                --      cm:description
+ *   created:                --      cm:created
+ *   Any custom property:    --      [not mapped]
+ * 
+ * 
+ * TIKA Note - all the fields (plus a few others) are present
+ * in the tika metadata.
+ * 
+ * @author Neil McErlean
+ */
+public class PoiMetadataExtracter extends AbstractMappingMetadataExtracter
+{
+    protected static Log logger = LogFactory.getLog(PoiMetadataExtracter.class);
+
+    // Keys under which extractRaw() offers the raw document values.
+    // NOTE(review): the default mapping file (PoiMetadataExtracter.properties) maps
+    // author, title, description and created; it has no entry for "subject", although
+    // the class Javadoc lists subject -> cm:description. Confirm which one is intended.
+    private static final String KEY_AUTHOR = "author";
+    private static final String KEY_TITLE = "title";
+    private static final String KEY_SUBJECT = "subject";
+    private static final String KEY_CREATED = "created";
+    private static final String KEY_DESCRIPTION = "description";
+
+    /** The Office Open XML MIME types (word processing, spreadsheet, presentation) handled here. */
+    public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING,
+                                                               MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET,
+                                                               MimetypeMap.MIMETYPE_OPENXML_PRESENTATION};
+
+    /**
+     * Creates an extracter that declares support for {@link #SUPPORTED_MIMETYPES}.
+     */
+    public PoiMetadataExtracter()
+    {
+        super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
+    }
+
+    /**
+     * Opens the document behind the reader with POI and offers its core properties
+     * (creator, title, subject, description, created) as raw values.
+     *
+     * @param reader supplies the content stream and the MIME type used to pick the POI document class
+     * @return the raw value map built from the document's core properties
+     * @throws IllegalArgumentException if the reader's MIME type is not one of {@link #SUPPORTED_MIMETYPES}
+     * @throws Throwable any failure raised while opening or parsing the document
+     */
+    @Override
+    public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
+    {
+        Map<String, Serializable> rawProperties = newRawMap();
+
+        InputStream is = null;
+        try
+        {
+            is = reader.getContentInputStream();
+            POIXMLDocument document = readDocumentFromStream(is, reader.getMimetype());
+
+            POIXMLPropertiesTextExtractor extracter = new POIXMLPropertiesTextExtractor(document);
+            CoreProperties coreProps = extracter.getCoreProperties();
+
+            putRawValue(KEY_AUTHOR, coreProps.getCreator(), rawProperties);
+            putRawValue(KEY_TITLE, coreProps.getTitle(), rawProperties);
+            putRawValue(KEY_SUBJECT, coreProps.getSubject(), rawProperties);
+            putRawValue(KEY_DESCRIPTION, coreProps.getDescription(), rawProperties);
+            putRawValue(KEY_CREATED, coreProps.getCreated(), rawProperties);
+        }
+        finally
+        {
+            if (is != null)
+            {
+                try
+                {
+                    is.close();
+                }
+                catch (IOException e)
+                {
+                    // Best effort: a failed close must not mask the extraction result or
+                    // the original failure, but it should not vanish without a trace either.
+                    if (logger.isDebugEnabled())
+                    {
+                        logger.debug("Failed to close content stream after metadata extraction", e);
+                    }
+                }
+            }
+        }
+
+        return rawProperties;
+    }
+
+    /**
+     * Builds the POI document object that matches the given MIME type.
+     *
+     * @param is the stream holding the document content; it is not closed here
+     * @param mimetype the MIME type reported for the content
+     * @return the POI document, never <code>null</code>
+     * @throws IllegalArgumentException if the MIME type is not one of {@link #SUPPORTED_MIMETYPES}
+     */
+    private POIXMLDocument readDocumentFromStream(InputStream is, String mimetype)
+            throws IOException, OpenXML4JException, XmlException
+    {
+        if (MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING.equals(mimetype))
+        {
+            return new XWPFDocument(OPCPackage.open(is));
+        }
+        if (MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET.equals(mimetype))
+        {
+            return new XSSFWorkbook(OPCPackage.open(is));
+        }
+        if (MimetypeMap.MIMETYPE_OPENXML_PRESENTATION.equals(mimetype))
+        {
+            return new XSLFSlideShow(OPCPackage.open(is));
+        }
+
+        // Fail with a message naming the MIME type instead of returning null, which
+        // would only surface later as a NullPointerException with no context.
+        throw new IllegalArgumentException("Unsupported MIME type for POI metadata extraction: " + mimetype);
+    }
+}
diff --git a/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.properties b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.properties
new file mode 100644
index 0000000000..ebc8f9411c
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.properties
@@ -0,0 +1,14 @@
+#
+# PoiMetadataExtracter - default mapping
+#
+# author: Neil McErlean
+
+# Namespaces
+namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
+
+# Mappings
+author=cm:author
+title=cm:title
+description=cm:description
+created=cm:created
+
diff --git a/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracterTest.java
new file mode 100644
index 0000000000..22f5f6d6e6
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracterTest.java
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. 
If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.metadata;
+
+import java.io.Serializable;
+import java.util.Date;
+import java.util.Map;
+
+import org.alfresco.model.ContentModel;
+import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
+import org.alfresco.service.namespace.QName;
+
+/**
+ * Tests the POI-based extracter against the quick.docx, quick.xlsx and quick.pptx sample files.
+ *
+ * @see org.alfresco.repo.content.metadata.PoiMetadataExtracter
+ * 
+ * @author Neil McErlean
+ */
+public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
+{
+    private PoiMetadataExtracter extracter;
+
+    @Override
+    public void setUp() throws Exception
+    {
+        super.setUp();
+        extracter = new PoiMetadataExtracter();
+        extracter.setDictionaryService(dictionaryService);
+        extracter.register();
+    }
+
+    @Override
+    protected MetadataExtracter getExtracter()
+    {
+        return extracter;
+    }
+
+    /**
+     * Every MIME type the extracter advertises must be reported as supported.
+     */
+    public void testSupports() throws Exception
+    {
+        for (String mimetype : PoiMetadataExtracter.SUPPORTED_MIMETYPES)
+        {
+            boolean supports = extracter.isSupported(mimetype);
+            assertTrue("Mimetype should be supported: " + mimetype, supports);
+        }
+    }
+
+    /**
+     * Runs the shared extraction checks once per supported MIME type.
+     */
+    public void testOffice2007Extraction() throws Exception
+    {
+        for (String mimetype : PoiMetadataExtracter.SUPPORTED_MIMETYPES)
+        {
+            testExtractFromMimetype(mimetype);
+        }
+    }
+
+    @Override
+    protected boolean skipDescriptionCheck(String mimetype)
+    {
+        // Our 3 Office 07 quick files have no description properties.
+        return true;
+    }
+
+    @Override
+    protected void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties)
+    {
+        // This test class is testing 3 files: quick.docx, quick.xlsx & quick.pptx.
+        // Their created times are hard-coded here for checking.
+        // Of course this means that if the files are updated, the test will break
+        // but those files are rarely modified - only added to.
+        if (MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING.equals(mimetype))
+        {
+            checkFileCreationDate(mimetype, properties, "2010-01-06T17:32:00.000Z");
+        }
+        else if (MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET.equals(mimetype))
+        {
+            checkFileCreationDate(mimetype, properties, "1996-10-14T23:33:28.000+01:00");
+        }
+        else if (MimetypeMap.MIMETYPE_OPENXML_PRESENTATION.equals(mimetype))
+        {
+            // Extraordinary! This document predates Isaac Newton's Principia Mathematica by almost a century. ;)
+            checkFileCreationDate(mimetype, properties, "1601-01-01T00:00:00.000Z");
+        }
+    }
+
+    /**
+     * Asserts that the extracted cm:created value denotes the same instant as the given ISO 8601 date.
+     *
+     * @param mimetype the MIME type under test, used in the failure message only
+     * @param properties the properties produced by the extracter
+     * @param date the expected creation instant as an ISO 8601 string
+     */
+    private void checkFileCreationDate(String mimetype, Map<QName, Serializable> properties, String date)
+    {
+        // Compare instants rather than rendered strings: the expected literals carry a
+        // zone offset, so a string comparison can only match under one rendering of it.
+        Date expected = DefaultTypeConverter.INSTANCE.convert(Date.class, date);
+        Date actual = DefaultTypeConverter.INSTANCE.convert(Date.class, properties.get(ContentModel.PROP_CREATED));
+        assertEquals("Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype, expected, actual);
+    }
+}