mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Proper fix for unreported issue with OOo-based extraction of Office 07 metadata.
Added a new metadata extractor based on POI for docx, xlsx and pptx mime types. Changed OpenOfficeMetadataExtracter so that it no longer supports these mime types. Added the new test code to ContentMinimalContextTestSuite. Some tidying up of code in AbstractMetadataExtracterTest and OpenOfficeMetadataExtracter to reflect the fact that this extractor does not handle these mime types any more. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@19792 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -210,6 +210,7 @@
|
||||
|
||||
<!-- Content Metadata Extractors -->
|
||||
<bean id="extracter.PDFBox" class="org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<bean id="extracter.Poi" class="org.alfresco.repo.content.metadata.PoiMetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<bean id="extracter.Office" class="org.alfresco.repo.content.metadata.OfficeMetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<bean id="extracter.Mail" class="org.alfresco.repo.content.metadata.MailMetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<bean id="extracter.Html" class="org.alfresco.repo.content.metadata.HtmlMetadataExtracter" parent="baseMetadataExtracter" />
|
||||
|
@@ -25,6 +25,7 @@ import org.alfresco.repo.content.metadata.OfficeMetadataExtracterTest;
|
||||
import org.alfresco.repo.content.metadata.OpenDocumentMetadataExtracterTest;
|
||||
import org.alfresco.repo.content.metadata.OpenOfficeMetadataExtracterTest;
|
||||
import org.alfresco.repo.content.metadata.PdfBoxMetadataExtracterTest;
|
||||
import org.alfresco.repo.content.metadata.PoiMetadataExtracterTest;
|
||||
import org.alfresco.repo.content.metadata.RFC822MetadataExtracterTest;
|
||||
import org.alfresco.repo.content.transform.BinaryPassThroughContentTransformerTest;
|
||||
import org.alfresco.repo.content.transform.ComplexContentTransformerTest;
|
||||
@@ -86,6 +87,7 @@ public class ContentMinimalContextTestSuite extends TestSuite
|
||||
suite.addTestSuite( OpenDocumentMetadataExtracterTest.class );
|
||||
suite.addTestSuite( OpenOfficeMetadataExtracterTest.class );
|
||||
suite.addTestSuite( PdfBoxMetadataExtracterTest.class );
|
||||
suite.addTestSuite( PoiMetadataExtracterTest.class );
|
||||
suite.addTestSuite( RFC822MetadataExtracterTest.class );
|
||||
|
||||
// Transform tests
|
||||
|
@@ -104,13 +104,8 @@ public abstract class AbstractMetadataExtracterTest extends TestCase
|
||||
Map<QName, Serializable> properties = extractFromMimetype(mimetype);
|
||||
// check we got something
|
||||
|
||||
// Properties come back null-valued back for author, title, description for xlsx & pptx
|
||||
if (mimetype.equals(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET) == false &&
|
||||
mimetype.equals(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION) == false)
|
||||
{
|
||||
assertFalse("extractFromMimetype should return at least some properties, none found for " + mimetype,
|
||||
properties.isEmpty());
|
||||
}
|
||||
|
||||
// check common metadata
|
||||
testCommonMetadata(mimetype, properties);
|
||||
@@ -174,24 +169,22 @@ public abstract class AbstractMetadataExtracterTest extends TestCase
|
||||
}
|
||||
|
||||
// Title and description
|
||||
if (mimetype.equals(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET) ||
|
||||
mimetype.equals(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION)) {
|
||||
return;
|
||||
}
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_TITLE + " not found for mimetype " + mimetype,
|
||||
QUICK_TITLE,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_TITLE)));
|
||||
if (!skipDescriptionCheck(mimetype)) {
|
||||
assertEquals(
|
||||
"Property " + ContentModel.PROP_DESCRIPTION + " not found for mimetype " + mimetype,
|
||||
QUICK_DESCRIPTION,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_DESCRIPTION)));
|
||||
}
|
||||
}
|
||||
protected abstract void testFileSpecificMetadata(String mimetype, Map<QName, Serializable> properties);
|
||||
|
||||
/**
|
||||
* This method can be overridden to cause the author/creator property check to be skipped.
|
||||
* The default behaviour is for the check to be skipped for all MIME types.
|
||||
* The default behaviour is for the check not to be skipped for all MIME types.
|
||||
*
|
||||
* @param mimetype
|
||||
* @return <code>true</code> to skip the checks, else <code>false</code>
|
||||
@@ -201,6 +194,18 @@ public abstract class AbstractMetadataExtracterTest extends TestCase
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* This method can be overridden to cause the description property check to be skipped.
|
||||
* The default implementation returns <code>false</code>, so the check is performed for every MIME type.
|
||||
*
|
||||
* @param mimetype the MIME type being checked (ignored by this default implementation)
|
||||
* @return <code>true</code> to skip the description check, else <code>false</code>
|
||||
*/
|
||||
protected boolean skipDescriptionCheck(String mimetype)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public void testZeroLengthFile() throws Exception
|
||||
{
|
||||
|
@@ -46,10 +46,7 @@ public class OpenOfficeMetadataExtracter extends AbstractMappingMetadataExtracte
|
||||
MimetypeMap.MIMETYPE_STAROFFICE5_WRITER,
|
||||
MimetypeMap.MIMETYPE_STAROFFICE5_IMPRESS,
|
||||
MimetypeMap.MIMETYPE_OPENOFFICE1_WRITER,
|
||||
MimetypeMap.MIMETYPE_OPENOFFICE1_IMPRESS,
|
||||
MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING,
|
||||
MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET,
|
||||
MimetypeMap.MIMETYPE_OPENXML_PRESENTATION
|
||||
MimetypeMap.MIMETYPE_OPENOFFICE1_IMPRESS
|
||||
};
|
||||
|
||||
private OpenOfficeMetadataWorker worker;
|
||||
|
@@ -112,10 +112,6 @@ public class OpenOfficeMetadataExtracterTest extends AbstractMetadataExtracterTe
|
||||
mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_STAROFFICE5_IMPRESS);
|
||||
mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_OPENOFFICE1_IMPRESS);
|
||||
|
||||
// The following do have them, but they are not being returned by OOo
|
||||
mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET);
|
||||
mimeTypesWithNoAuthor.add(MimetypeMap.MIMETYPE_OPENXML_PRESENTATION);
|
||||
|
||||
return mimeTypesWithNoAuthor.contains(mimetype);
|
||||
}
|
||||
|
||||
|
@@ -0,0 +1,126 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.poi.POIXMLDocument;
|
||||
import org.apache.poi.POIXMLPropertiesTextExtractor;
|
||||
import org.apache.poi.POIXMLProperties.CoreProperties;
|
||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.xslf.XSLFSlideShow;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
|
||||
/**
|
||||
* POI-based metadata extractor for Office 07 documents.
|
||||
* See http://poi.apache.org/ for information on POI.
|
||||
* <pre>
|
||||
* <b>author:</b> -- cm:author
|
||||
* <b>title:</b> -- cm:title
|
||||
* <b>subject:</b> -- cm:description
|
||||
* <b>created:</b> -- cm:created
|
||||
* <b>Any custom property:</b> -- [not mapped]
|
||||
* </pre>
|
||||
*
|
||||
* TIKA Note - all the fields (plus a few others) are present
|
||||
* in the tika metadata.
|
||||
*
|
||||
* @author Neil McErlean
|
||||
*/
|
||||
public class PoiMetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
{
|
||||
protected static Log logger = LogFactory.getLog(PoiMetadataExtracter.class);
|
||||
|
||||
private static final String KEY_AUTHOR = "author";
|
||||
private static final String KEY_TITLE = "title";
|
||||
private static final String KEY_SUBJECT = "subject";
|
||||
private static final String KEY_CREATED = "created";
|
||||
private static final String KEY_DESCRIPTION = "description";
|
||||
|
||||
public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING,
|
||||
MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET,
|
||||
MimetypeMap.MIMETYPE_OPENXML_PRESENTATION};
|
||||
|
||||
public PoiMetadataExtracter()
|
||||
{
|
||||
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
|
||||
{
|
||||
Map<String, Serializable> rawProperties = newRawMap();
|
||||
|
||||
InputStream is = null;
|
||||
try
|
||||
{
|
||||
is = reader.getContentInputStream();
|
||||
POIXMLDocument document = readDocumentFromStream(is, reader.getMimetype());
|
||||
|
||||
POIXMLPropertiesTextExtractor extracter = new POIXMLPropertiesTextExtractor(document);
|
||||
CoreProperties coreProps = extracter.getCoreProperties();
|
||||
|
||||
putRawValue(KEY_AUTHOR, coreProps.getCreator(), rawProperties);
|
||||
putRawValue(KEY_TITLE, coreProps.getTitle(), rawProperties);
|
||||
putRawValue(KEY_SUBJECT, coreProps.getSubject(), rawProperties);
|
||||
putRawValue(KEY_DESCRIPTION, coreProps.getDescription(), rawProperties);
|
||||
putRawValue(KEY_CREATED, coreProps.getCreated(), rawProperties);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (is != null)
|
||||
{
|
||||
try { is.close(); } catch (IOException e) {}
|
||||
}
|
||||
}
|
||||
|
||||
return rawProperties;
|
||||
}
|
||||
|
||||
private POIXMLDocument readDocumentFromStream(InputStream is, String mimetype)
|
||||
throws IOException, OpenXML4JException, XmlException {
|
||||
POIXMLDocument document = null;
|
||||
if (MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING.equals(mimetype))
|
||||
{
|
||||
document = new XWPFDocument(OPCPackage.open(is));
|
||||
}
|
||||
else if (MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET.equals(mimetype))
|
||||
{
|
||||
document = new XSSFWorkbook(OPCPackage.open(is));
|
||||
}
|
||||
else if (MimetypeMap.MIMETYPE_OPENXML_PRESENTATION.equals(mimetype))
|
||||
{
|
||||
document = new XSLFSlideShow(OPCPackage.open(is));
|
||||
}
|
||||
|
||||
return document;
|
||||
}
|
||||
}
|
@@ -0,0 +1,14 @@
|
||||
#
|
||||
# PoiMetadataExtracter - default mapping
|
||||
#
|
||||
# author: Neil McErlean
|
||||
|
||||
# Namespaces
|
||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
|
||||
# Mappings
|
||||
author=cm:author
|
||||
title=cm:title
|
||||
description=cm:description
|
||||
created=cm:created
|
||||
|
@@ -0,0 +1,104 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
|
||||
/**
|
||||
* @see org.alfresco.repo.content.metadata.PoiMetadataExtracter
|
||||
*
|
||||
* @author Neil McErlean
|
||||
*/
|
||||
public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
private PoiMetadataExtracter extracter;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception
|
||||
{
|
||||
super.setUp();
|
||||
extracter = new PoiMetadataExtracter();
|
||||
extracter.setDictionaryService(dictionaryService);
|
||||
extracter.register();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected MetadataExtracter getExtracter()
|
||||
{
|
||||
return extracter;
|
||||
}
|
||||
|
||||
public void testSupports() throws Exception
|
||||
{
|
||||
for (String mimetype : PoiMetadataExtracter.SUPPORTED_MIMETYPES)
|
||||
{
|
||||
boolean supports = extracter.isSupported(mimetype);
|
||||
assertTrue("Mimetype should be supported: " + mimetype, supports);
|
||||
}
|
||||
}
|
||||
|
||||
public void testOffice2007Extraction() throws Exception
|
||||
{
|
||||
for (String mimetype : PoiMetadataExtracter.SUPPORTED_MIMETYPES)
|
||||
{
|
||||
testExtractFromMimetype(mimetype);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean skipDescriptionCheck(String mimetype) {
|
||||
// Our 3 OpenOffice 07 quick files have no description properties.
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
protected void testFileSpecificMetadata(String mimetype,
|
||||
Map<QName, Serializable> properties) {
|
||||
// This test class is testing 3 files: quick.docx, quick.xlsx & quick.pptx.
|
||||
// Their created times are hard-coded here for checking.
|
||||
// Of course this means that if the files are updated, the test will break
|
||||
// but those files are rarely modified - only added to.
|
||||
if (MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING.equals(mimetype))
|
||||
{
|
||||
checkFileCreationDate(mimetype, properties, "2010-01-06T17:32:00.000Z");
|
||||
}
|
||||
else if (MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET.equals(mimetype))
|
||||
{
|
||||
checkFileCreationDate(mimetype, properties, "1996-10-14T23:33:28.000+01:00");
|
||||
}
|
||||
else if (MimetypeMap.MIMETYPE_OPENXML_PRESENTATION.equals(mimetype))
|
||||
{
|
||||
// Extraordinary! This document predates Isaac Newton's Principia Mathematica by almost a century. ;)
|
||||
checkFileCreationDate(mimetype, properties, "1601-01-01T00:00:00.000Z");
|
||||
}
|
||||
}
|
||||
|
||||
private void checkFileCreationDate(String mimetype, Map<QName, Serializable> properties, String date)
|
||||
{
|
||||
assertEquals("Property " + ContentModel.PROP_CREATED + " not found for mimetype " + mimetype, date,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_CREATED)));
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user