Proper fix for unreported issue with OOo-based extraction of Office 07 metadata.

Added a new metadata extractor based on POI for docx, xlsx and pptx mime types.
Changed OpenOfficeMetadataExtracter so that it no longer supports these mime types.
Added the new test code to ContentMinimalContextTestSuite.

Some tidying up of code in AbstractMetadataExtracterTest and OpenOfficeMetadataExtracter to reflect the fact that this extractor does not handle these mime types any more.


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@19792 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Neil McErlean
2010-04-09 12:10:06 +00:00
parent fa927055d9
commit de612572d9
8 changed files with 269 additions and 24 deletions

View File

@@ -210,6 +210,7 @@
<!-- Content Metadata Extractors --> <!-- Content Metadata Extractors -->
<bean id="extracter.PDFBox" class="org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.PDFBox" class="org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.Poi" class="org.alfresco.repo.content.metadata.PoiMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.Office" class="org.alfresco.repo.content.metadata.OfficeMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.Office" class="org.alfresco.repo.content.metadata.OfficeMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.Mail" class="org.alfresco.repo.content.metadata.MailMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.Mail" class="org.alfresco.repo.content.metadata.MailMetadataExtracter" parent="baseMetadataExtracter" />
<bean id="extracter.Html" class="org.alfresco.repo.content.metadata.HtmlMetadataExtracter" parent="baseMetadataExtracter" /> <bean id="extracter.Html" class="org.alfresco.repo.content.metadata.HtmlMetadataExtracter" parent="baseMetadataExtracter" />

View File

@@ -25,6 +25,7 @@ import org.alfresco.repo.content.metadata.OfficeMetadataExtracterTest;
import org.alfresco.repo.content.metadata.OpenDocumentMetadataExtracterTest; import org.alfresco.repo.content.metadata.OpenDocumentMetadataExtracterTest;
import org.alfresco.repo.content.metadata.OpenOfficeMetadataExtracterTest; import org.alfresco.repo.content.metadata.OpenOfficeMetadataExtracterTest;
import org.alfresco.repo.content.metadata.PdfBoxMetadataExtracterTest; import org.alfresco.repo.content.metadata.PdfBoxMetadataExtracterTest;
import org.alfresco.repo.content.metadata.PoiMetadataExtracterTest;
import org.alfresco.repo.content.metadata.RFC822MetadataExtracterTest; import org.alfresco.repo.content.metadata.RFC822MetadataExtracterTest;
import org.alfresco.repo.content.transform.BinaryPassThroughContentTransformerTest; import org.alfresco.repo.content.transform.BinaryPassThroughContentTransformerTest;
import org.alfresco.repo.content.transform.ComplexContentTransformerTest; import org.alfresco.repo.content.transform.ComplexContentTransformerTest;
@@ -86,6 +87,7 @@ public class ContentMinimalContextTestSuite extends TestSuite
suite.addTestSuite( OpenDocumentMetadataExtracterTest.class ); suite.addTestSuite( OpenDocumentMetadataExtracterTest.class );
suite.addTestSuite( OpenOfficeMetadataExtracterTest.class ); suite.addTestSuite( OpenOfficeMetadataExtracterTest.class );
suite.addTestSuite( PdfBoxMetadataExtracterTest.class ); suite.addTestSuite( PdfBoxMetadataExtracterTest.class );
suite.addTestSuite( PoiMetadataExtracterTest.class );
suite.addTestSuite( RFC822MetadataExtracterTest.class ); suite.addTestSuite( RFC822MetadataExtracterTest.class );
// Transform tests // Transform tests

View File

@@ -104,13 +104,8 @@ public abstract class AbstractMetadataExtracterTest extends TestCase
Map<QName, Serializable> properties = extractFromMimetype(mimetype); Map<QName, Serializable> properties = extractFromMimetype(mimetype);
// check we got something // check we got something
// Properties come back null-valued back for author, title, description for xlsx & pptx
if (mimetype.equals(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET) == false &&
mimetype.equals(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION) == false)
{
assertFalse("extractFromMimetype should return at least some properties, none found for " + mimetype, assertFalse("extractFromMimetype should return at least some properties, none found for " + mimetype,
properties.isEmpty()); properties.isEmpty());
}
// check common metadata // check common metadata
testCommonMetadata(mimetype, properties); testCommonMetadata(mimetype, properties);
@@ -174,24 +169,22 @@ public abstract class AbstractMetadataExtracterTest extends TestCase
} }
// Title and description // Title and description
if (mimetype.equals(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET) ||
mimetype.equals(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION)) {
return;
}
assertEquals( assertEquals(
"Property " + ContentModel.PROP_TITLE + " not found for mimetype " + mimetype, "Property " + ContentModel.PROP_TITLE + " not found for mimetype " + mimetype,
QUICK_TITLE, QUICK_TITLE,
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_TITLE))); DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_TITLE)));
if (!skipDescriptionCheck(mimetype)) {
assertEquals( assertEquals(
"Property " + ContentModel.PROP_DESCRIPTION + " not found for mimetype " + mimetype, "Property " + ContentModel.PROP_DESCRIPTION + " not found for mimetype " + mimetype,
QUICK_DESCRIPTION, QUICK_DESCRIPTION,
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION))); DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION)));
} }
}
protected abstract void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties); protected abstract void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties);
/** /**
* This method can be overridden to cause the author/creator property check to be skipped. * This method can be overridden to cause the author/creator property check to be skipped.
* The default behaviour is for the check to be skipped for all MIME types. * The default behaviour is for the check not to be skipped for all MIME types.
* *
* @param mimetype * @param mimetype
* @return <code>true</code> to skip the checks, else <code>false</code> * @return <code>true</code> to skip the checks, else <code>false</code>
@@ -201,6 +194,18 @@ public abstract class AbstractMetadataExtracterTest extends TestCase
return false; return false;
} }
/**
 * Hook that lets a subclass switch off the description property check.
 * This default implementation returns <code>false</code> for every MIME type,
 * i.e. the description check is performed unless a subclass overrides this method.
 *
 * @param mimetype the MIME type of the document currently under test
 * @return <code>true</code> to skip the checks, else <code>false</code>
 */
protected boolean skipDescriptionCheck(String mimetype)
{
return false;
}
public void testZeroLengthFile() throws Exception public void testZeroLengthFile() throws Exception
{ {

View File

@@ -46,10 +46,7 @@ public class OpenOfficeMetadataExtracter extends AbstractMappingMetadataExtracte
MimetypeMap.MIMETYPE_STAROFFICE5_WRITER, MimetypeMap.MIMETYPE_STAROFFICE5_WRITER,
MimetypeMap.MIMETYPE_STAROFFICE5_IMPRESS, MimetypeMap.MIMETYPE_STAROFFICE5_IMPRESS,
MimetypeMap.MIMETYPE_OPENOFFICE1_WRITER, MimetypeMap.MIMETYPE_OPENOFFICE1_WRITER,
MimetypeMap.MIMETYPE_OPENOFFICE1_IMPRESS, MimetypeMap.MIMETYPE_OPENOFFICE1_IMPRESS
MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING,
MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET,
MimetypeMap.MIMETYPE_OPENXML_PRESENTATION
}; };
private OpenOfficeMetadataWorker worker; private OpenOfficeMetadataWorker worker;

View File

@@ -112,10 +112,6 @@ public class OpenOfficeMetadataExtracterTest extends AbstractMetadataExtracterTe
mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_STAROFFICE5_IMPRESS); mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_STAROFFICE5_IMPRESS);
mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_OPENOFFICE1_IMPRESS); mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_OPENOFFICE1_IMPRESS);
// The following do have them, but they are not being returned by OOo
mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET);
mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION);
return mimeTypesWithNoAuthor.contains(mimetype); return mimeTypesWithNoAuthor.contains(mimetype);
} }

View File

@@ -0,0 +1,126 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.metadata;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLPropertiesTextExtractor;
import org.apache.poi.POIXMLProperties.CoreProperties;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.xmlbeans.XmlException;
/**
 * POI-based metadata extractor for Office 07 documents (docx, xlsx and pptx).
 * See http://poi.apache.org/ for information on POI.
 * <p>
 * The following raw keys are read from the document's core properties:
 * <pre>
 *   <b>author:</b>               --      core "creator" property
 *   <b>title:</b>                --      core "title" property
 *   <b>subject:</b>              --      core "subject" property
 *   <b>description:</b>          --      core "description" property
 *   <b>created:</b>              --      core "created" property
 *   <b>Any custom property:</b>  --      [not extracted]
 * </pre>
 * How these raw keys are mapped onto model properties (cm:author, cm:title,
 * cm:description, cm:created) is decided by the extracter's mapping configuration,
 * not by this class.
 * NOTE(review): earlier documentation stated that <i>subject</i> feeds cm:description,
 * whereas this class emits both a <i>subject</i> and a <i>description</i> key - confirm
 * against the default mapping properties which of the two is actually mapped.
 * <p>
 * TIKA Note - all the fields (plus a few others) are present
 *  in the tika metadata.
 *
 * @author Neil McErlean
 */
public class PoiMetadataExtracter extends AbstractMappingMetadataExtracter
{
    protected static Log logger = LogFactory.getLog(PoiMetadataExtracter.class);

    private static final String KEY_AUTHOR = "author";
    private static final String KEY_TITLE = "title";
    private static final String KEY_SUBJECT = "subject";
    private static final String KEY_CREATED = "created";
    private static final String KEY_DESCRIPTION = "description";

    /** The MIME types this extracter registers itself for. */
    public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING,
                                                               MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET,
                                                               MimetypeMap.MIMETYPE_OPENXML_PRESENTATION};

    /**
     * Creates an extracter that supports exactly {@link #SUPPORTED_MIMETYPES}.
     */
    public PoiMetadataExtracter()
    {
        super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
    }

    /**
     * Opens the content as an Office 07 document and copies its core properties
     * (creator, title, subject, description, created) into a raw property map.
     *
     * @param reader the reader giving access to the content stream and its MIME type
     * @return the raw properties keyed by author/title/subject/description/created
     * @throws Throwable if the content cannot be read or parsed, or if the reader's
     *                   MIME type is not one of the supported Office 07 types
     */
    @Override
    public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
    {
        Map<String, Serializable> rawProperties = newRawMap();

        InputStream is = null;
        try
        {
            is = reader.getContentInputStream();
            POIXMLDocument document = readDocumentFromStream(is, reader.getMimetype());

            POIXMLPropertiesTextExtractor extracter = new POIXMLPropertiesTextExtractor(document);
            CoreProperties coreProps = extracter.getCoreProperties();

            putRawValue(KEY_AUTHOR, coreProps.getCreator(), rawProperties);
            putRawValue(KEY_TITLE, coreProps.getTitle(), rawProperties);
            putRawValue(KEY_SUBJECT, coreProps.getSubject(), rawProperties);
            putRawValue(KEY_DESCRIPTION, coreProps.getDescription(), rawProperties);
            putRawValue(KEY_CREATED, coreProps.getCreated(), rawProperties);
        }
        finally
        {
            if (is != null)
            {
                try
                {
                    is.close();
                }
                catch (IOException e)
                {
                    // Closing is best-effort and must not mask the extraction result or
                    // an exception already in flight, but leave a trace for diagnosis.
                    if (logger.isDebugEnabled())
                    {
                        logger.debug("Failed to close content input stream after metadata extraction", e);
                    }
                }
            }
        }

        return rawProperties;
    }

    /**
     * Builds the POI document model matching the given MIME type from the stream.
     *
     * @param is       the stream containing the Office 07 package
     * @param mimetype the MIME type used to select the POI document class
     * @return the parsed document, never <code>null</code>
     * @throws IllegalArgumentException if the MIME type is not one of the three
     *                                  supported Office 07 types (including <code>null</code>)
     */
    private POIXMLDocument readDocumentFromStream(InputStream is, String mimetype)
            throws IOException, OpenXML4JException, XmlException
    {
        if (MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING.equals(mimetype))
        {
            return new XWPFDocument(OPCPackage.open(is));
        }
        if (MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET.equals(mimetype))
        {
            return new XSSFWorkbook(OPCPackage.open(is));
        }
        if (MimetypeMap.MIMETYPE_OPENXML_PRESENTATION.equals(mimetype))
        {
            return new XSLFSlideShow(OPCPackage.open(is));
        }
        // Previously this fell through and returned null, which surfaced later as an
        // undiagnosed NullPointerException; fail here with the offending type instead.
        throw new IllegalArgumentException(
                "Unsupported mimetype for POI metadata extraction: " + mimetype);
    }
}

View File

@@ -0,0 +1,14 @@
#
# PoiMetadataExtracter - default mapping
#
# author: Neil McErlean
# Namespaces
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
# Mappings
author=cm:author
title=cm:title
description=cm:description
created=cm:created

View File

@@ -0,0 +1,104 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.Map;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
import org.alfresco.service.namespace.QName;
/**
 * Exercises the POI-based extracter against the Office 07 quick test files.
 *
 * @see org.alfresco.repo.content.metadata.PoiMetadataExtracter
 *
 * @author Neil McErlean
 */
public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
{
    private PoiMetadataExtracter extracter;

    /**
     * Creates and registers a fresh extracter wired to the dictionary service.
     */
    @Override
    public void setUp() throws Exception
    {
        super.setUp();

        PoiMetadataExtracter poiExtracter = new PoiMetadataExtracter();
        poiExtracter.setDictionaryService(dictionaryService);
        poiExtracter.register();
        this.extracter = poiExtracter;
    }

    /**
     * @return the extracter instance under test
     */
    @Override
    protected MetadataExtracter getExtracter()
    {
        return extracter;
    }

    /**
     * Every advertised MIME type must be reported as supported.
     */
    public void testSupports() throws Exception
    {
        final String[] mimetypes = PoiMetadataExtracter.SUPPORTED_MIMETYPES;
        for (int i = 0; i < mimetypes.length; i++)
        {
            final String mimetype = mimetypes[i];
            final boolean supports = extracter.isSupported(mimetype);
            assertTrue("Mimetype should be supported: " + mimetype, supports);
        }
    }

    /**
     * Runs the shared extraction checks for each advertised MIME type.
     */
    public void testOffice2007Extraction() throws Exception
    {
        final String[] mimetypes = PoiMetadataExtracter.SUPPORTED_MIMETYPES;
        for (int i = 0; i < mimetypes.length; i++)
        {
            testExtractFromMimetype(mimetypes[i]);
        }
    }

    /**
     * The description check is always skipped: none of the three Office 07
     * quick files carries a description property.
     */
    @Override
    protected boolean skipDescriptionCheck(String mimetype)
    {
        return true;
    }

    /**
     * Verifies the creation date extracted from quick.docx, quick.xlsx and quick.pptx.
     * The expected values are hard-coded, so the test breaks if those files are
     * regenerated - but they are rarely modified, only added to.
     */
    @Override
    protected void testFileSpecificMetadata(String mimetype,
            Map<QName, Serializable> properties)
    {
        final String expectedCreated = expectedCreationDate(mimetype);
        if (expectedCreated == null)
        {
            // Not one of the three known files: nothing file-specific to verify.
            return;
        }

        final String actualCreated = DefaultTypeConverter.INSTANCE.convert(
                String.class, properties.get(ContentModel.PROP_CREATED));
        assertEquals(
                "Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype,
                expectedCreated,
                actualCreated);
    }

    /**
     * Looks up the hard-coded creation date of the quick file for a MIME type.
     *
     * @return the expected date string, or <code>null</code> for any other MIME type
     */
    private String expectedCreationDate(String mimetype)
    {
        if (MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING.equals(mimetype))
        {
            return "2010-01-06T17:32:00.000Z";
        }
        if (MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET.equals(mimetype))
        {
            return "1996-10-14T23:33:28.000+01:00";
        }
        if (MimetypeMap.MIMETYPE_OPENXML_PRESENTATION.equals(mimetype))
        {
            // Extraordinary! This document predates Isaac Newton's Principia Mathematica by almost a century. ;)
            return "1601-01-01T00:00:00.000Z";
        }
        return null;
    }
}