diff --git a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java index e1bbb74e7e..9e521623bd 100644 --- a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java @@ -47,9 +47,10 @@ import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; import org.alfresco.service.cmr.repository.datatype.TypeConversionException; import org.alfresco.service.namespace.InvalidQNameException; import org.alfresco.service.namespace.QName; -import org.springframework.extensions.surf.util.ISO8601DateFormat; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.xmlbeans.impl.xb.xsdschema.All; +import org.springframework.extensions.surf.util.ISO8601DateFormat; /** * Support class for metadata extracters that support dynamic and config-driven @@ -106,7 +107,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac private Set supportedMimetypes; private OverwritePolicy overwritePolicy; private boolean failOnTypeConversion; - private Set supportedDateFormats = new HashSet(0); + protected Set supportedDateFormats = new HashSet(0); private Map> mapping; private boolean inheritDefaultMapping; @@ -266,6 +267,10 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac DateFormat df = new SimpleDateFormat(dateFormatStr); this.supportedDateFormats.add(df); + /** + * + */ + /** * Date format can be locale specific - make sure English format always works */ @@ -846,7 +851,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac /** * Convert a date String to a Date object */ - private Date makeDate(String dateStr) + protected Date makeDate(String dateStr) { Date date = null; try diff --git 
a/source/java/org/alfresco/repo/content/metadata/DWGMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/DWGMetadataExtracter.java index e3cba8945f..77cabf33a0 100644 --- a/source/java/org/alfresco/repo/content/metadata/DWGMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/DWGMetadataExtracter.java @@ -18,20 +18,14 @@ */ package org.alfresco.repo.content.metadata; -import java.io.IOException; -import java.io.InputStream; import java.io.Serializable; -import java.util.Arrays; -import java.util.HashSet; +import java.util.ArrayList; import java.util.Map; import org.alfresco.repo.content.MimetypeMap; -import org.alfresco.service.cmr.repository.ContentReader; import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; import org.apache.tika.parser.dwg.DWGParser; -import org.apache.tika.sax.BodyContentHandler; -import org.xml.sax.ContentHandler; /** @@ -47,64 +41,40 @@ import org.xml.sax.ContentHandler; * lastauthor: * * - * TIKA Note - this has been converted to deep-call into Tika. - * This will be replaced with proper calls to Tika at a later date. 
+ * Uses Apache Tika * * @author Nick Burch */ -public class DWGMetadataExtracter extends AbstractMappingMetadataExtracter +public class DWGMetadataExtracter extends TikaPoweredMetadataExtracter { - private static final String KEY_AUTHOR = "author"; - private static final String KEY_COMMENT = "comment"; - private static final String KEY_DESCRIPTION = "description"; private static final String KEY_KEYWORD = "keyword"; private static final String KEY_LAST_AUTHOR = "lastAuthor"; - private static final String KEY_TITLE = "title"; - - public static String[] SUPPORTED_MIMETYPES = new String[] { - MimetypeMap.MIMETYPE_APP_DWG, - MimetypeMap.MIMETYPE_IMG_DWG, - "image/x-dwg", // Was used before IANA registration - }; + + public static ArrayList SUPPORTED_MIMETYPES = buildSupportedMimetypes( + new String[] { + MimetypeMap.MIMETYPE_APP_DWG, + MimetypeMap.MIMETYPE_IMG_DWG, + "image/x-dwg", // Was used before IANA registration + }, + new DWGParser() + ); public DWGMetadataExtracter() { - super(new HashSet(Arrays.asList(SUPPORTED_MIMETYPES))); + super(SUPPORTED_MIMETYPES); + } + + @Override + protected Map extractSpecific(Metadata metadata, + Map properties) { + putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), properties); + putRawValue(KEY_LAST_AUTHOR, metadata.get(Metadata.LAST_AUTHOR), properties); + System.err.println(properties); + return properties; } @Override - public Map extractRaw(ContentReader reader) throws Throwable - { - Map rawProperties = newRawMap(); - - InputStream is = null; - try - { - is = reader.getContentInputStream(); - - DWGParser dwgParser = new DWGParser(); - ContentHandler handler = new BodyContentHandler() ; - Metadata metadata = new Metadata(); - ParseContext context = new ParseContext(); - - dwgParser.parse(is, handler, metadata, context); - - putRawValue(KEY_AUTHOR, metadata.get(Metadata.AUTHOR), rawProperties); - putRawValue(KEY_COMMENT, metadata.get(Metadata.COMMENTS), rawProperties); - putRawValue(KEY_DESCRIPTION, 
metadata.get(Metadata.DESCRIPTION), rawProperties); - putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), rawProperties); - putRawValue(KEY_LAST_AUTHOR, metadata.get(Metadata.LAST_AUTHOR), rawProperties); - putRawValue(KEY_DESCRIPTION, metadata.get(Metadata.SUBJECT), rawProperties); - putRawValue(KEY_TITLE, metadata.get(Metadata.TITLE), rawProperties); - } - finally - { - if (is != null) - { - try { is.close(); } catch (IOException e) {} - } - } - // Done - return rawProperties; + protected Parser getParser() { + return new DWGParser(); } } diff --git a/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java index 00d07c8341..bc972220cd 100644 --- a/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java @@ -21,6 +21,7 @@ package org.alfresco.repo.content.metadata; import java.io.IOException; import java.io.InputStream; import java.io.Serializable; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.Map; @@ -45,7 +46,7 @@ import org.apache.poi.hsmf.MAPIMessage; * @since 2.1 * @author Kevin Roast */ -public class MailMetadataExtracter extends AbstractMappingMetadataExtracter +public class MailMetadataExtracter extends TikaPoweredMetadataExtracter { private static final String KEY_SENT_DATE = "sentDate"; private static final String KEY_ORIGINATOR = "originator"; @@ -53,11 +54,14 @@ public class MailMetadataExtracter extends AbstractMappingMetadataExtracter private static final String KEY_ADDRESSEES = "addressees"; private static final String KEY_SUBJECT = "subjectLine"; - public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_OUTLOOK_MSG}; + public static ArrayList SUPPORTED_MIMETYPES = buildSupportedMimetypes( + new String[] {MimetypeMap.MIMETYPE_OUTLOOK_MSG}, + null + ); public MailMetadataExtracter() 
{ - super(new HashSet(Arrays.asList(SUPPORTED_MIMETYPES))); + super(SUPPORTED_MIMETYPES); } @Override diff --git a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java index ad14dbcdc8..a85657ac6c 100644 --- a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java @@ -21,19 +21,15 @@ package org.alfresco.repo.content.metadata; import java.io.IOException; import java.io.InputStream; import java.io.Serializable; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.Map; import org.alfresco.repo.content.MimetypeMap; -import org.alfresco.service.cmr.repository.ContentIOException; -import org.alfresco.service.cmr.repository.ContentReader; -import org.apache.poi.hpsf.PropertySet; -import org.apache.poi.hpsf.PropertySetFactory; -import org.apache.poi.hpsf.SummaryInformation; -import org.apache.poi.poifs.eventfilesystem.POIFSReader; -import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent; -import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.microsoft.OfficeParser; /** * Office file format Metadata Extracter. This extracter uses the POI library to extract @@ -56,95 +52,63 @@ import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener; * wordCount: * * - * TIKA Note - everything we currently have should be present - * in the metadata. 
- * - * @author Jesper Steen Møller + * Uses Apache Tika + * @author Derek Hulley + * @author Nick Burch */ -public class OfficeMetadataExtracter extends AbstractMappingMetadataExtracter +public class OfficeMetadataExtracter extends TikaPoweredMetadataExtracter { - public static final String KEY_AUTHOR = "author"; - public static final String KEY_TITLE = "title"; - public static final String KEY_SUBJECT = "subject"; public static final String KEY_CREATE_DATETIME = "createDateTime"; public static final String KEY_LAST_SAVE_DATETIME = "lastSaveDateTime"; - public static final String KEY_COMMENTS = "comments"; public static final String KEY_EDIT_TIME = "editTime"; public static final String KEY_FORMAT = "format"; public static final String KEY_KEYWORDS = "keywords"; public static final String KEY_LAST_AUTHOR = "lastAuthor"; public static final String KEY_LAST_PRINTED = "lastPrinted"; - public static final String KEY_OS_VERSION = "osVersion"; - public static final String KEY_THUMBNAIL = "thumbnail"; + public static final String KEY_OS_VERSION = "osVersion"; // TODO + public static final String KEY_THUMBNAIL = "thumbnail"; // TODO public static final String KEY_PAGE_COUNT = "pageCount"; + public static final String KEY_PARAGRAPH_COUNT = "paragraphCount"; public static final String KEY_WORD_COUNT = "wordCount"; - public static String[] SUPPORTED_MIMETYPES = new String[] { - MimetypeMap.MIMETYPE_WORD, - MimetypeMap.MIMETYPE_EXCEL, - MimetypeMap.MIMETYPE_PPT}; + public static ArrayList SUPPORTED_MIMETYPES = buildSupportedMimetypes( + new String[] { + MimetypeMap.MIMETYPE_WORD, + MimetypeMap.MIMETYPE_EXCEL, + MimetypeMap.MIMETYPE_PPT}, + new OfficeParser() + ); + static { + // Outlook has it's own one! 
+ SUPPORTED_MIMETYPES.remove(MimetypeMap.MIMETYPE_OUTLOOK_MSG); + } public OfficeMetadataExtracter() { - super(new HashSet(Arrays.asList(SUPPORTED_MIMETYPES))); + super(SUPPORTED_MIMETYPES); + } + + @Override + protected Parser getParser() { + return new OfficeParser(); } @Override - protected Map extractRaw(ContentReader reader) throws Throwable - { - final Map rawProperties = newRawMap(); - - POIFSReaderListener readerListener = new POIFSReaderListener() - { - public void processPOIFSReaderEvent(final POIFSReaderEvent event) - { - try - { - PropertySet ps = PropertySetFactory.create(event.getStream()); - if (ps instanceof SummaryInformation) - { - SummaryInformation si = (SummaryInformation) ps; - - putRawValue(KEY_AUTHOR, si.getAuthor(), rawProperties); - putRawValue(KEY_TITLE, si.getTitle(), rawProperties); - putRawValue(KEY_SUBJECT, si.getSubject(), rawProperties); - putRawValue(KEY_CREATE_DATETIME, si.getCreateDateTime(), rawProperties); - putRawValue(KEY_LAST_SAVE_DATETIME, si.getLastSaveDateTime(), rawProperties); - putRawValue(KEY_COMMENTS, si.getComments(), rawProperties); - putRawValue(KEY_EDIT_TIME, si.getEditTime(), rawProperties); - putRawValue(KEY_FORMAT, si.getFormat(), rawProperties); - putRawValue(KEY_KEYWORDS, si.getKeywords(), rawProperties); - putRawValue(KEY_LAST_AUTHOR, si.getLastAuthor(), rawProperties); - putRawValue(KEY_LAST_PRINTED, si.getLastPrinted(), rawProperties); - putRawValue(KEY_OS_VERSION, si.getOSVersion(), rawProperties); - putRawValue(KEY_THUMBNAIL, si.getThumbnail(), rawProperties); - putRawValue(KEY_PAGE_COUNT, si.getPageCount(), rawProperties); - putRawValue(KEY_WORD_COUNT, si.getWordCount(), rawProperties); - } - } - catch (Exception ex) - { - throw new ContentIOException("Property set stream: " + event.getPath() + event.getName(), ex); - } - } - }; - - InputStream is = null; - try - { - is = reader.getContentInputStream(); - POIFSReader poiFSReader = new POIFSReader(); - poiFSReader.registerListener(readerListener, 
SummaryInformation.DEFAULT_STREAM_NAME); - poiFSReader.read(is); - } - finally - { - if (is != null) - { - try { is.close(); } catch (IOException e) {} - } - } - return rawProperties; + protected Map extractSpecific(Metadata metadata, + Map properties) { + putRawValue(KEY_CREATE_DATETIME, metadata.get(Metadata.CREATION_DATE), properties); + putRawValue(KEY_LAST_SAVE_DATETIME, metadata.get(Metadata.LAST_SAVED), properties); + putRawValue(KEY_EDIT_TIME, metadata.get(Metadata.EDIT_TIME), properties); + putRawValue(KEY_FORMAT, metadata.get(Metadata.FORMAT), properties); + putRawValue(KEY_KEYWORDS, metadata.get(Metadata.KEYWORDS), properties); + putRawValue(KEY_LAST_AUTHOR, metadata.get(Metadata.LAST_AUTHOR), properties); + putRawValue(KEY_LAST_PRINTED, metadata.get(Metadata.LAST_PRINTED), properties); +// putRawValue(KEY_OS_VERSION, metadata.get(Metadata.OS_VERSION), properties); +// putRawValue(KEY_THUMBNAIL, metadata.get(Metadata.THUMBNAIL), properties); + putRawValue(KEY_PAGE_COUNT, metadata.get(Metadata.PAGE_COUNT), properties); + putRawValue(KEY_PARAGRAPH_COUNT, metadata.get(Metadata.PARAGRAPH_COUNT), properties); + putRawValue(KEY_WORD_COUNT, metadata.get(Metadata.WORD_COUNT), properties); + return properties; } -} +} \ No newline at end of file diff --git a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracterTest.java index e9ec7e6248..8b376cf1cb 100644 --- a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracterTest.java @@ -132,16 +132,17 @@ public class OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest } // Now check the non-standard ones we added in at test time - assertTrue( - "Test Property " + WORD_COUNT_TEST_PROPERTY + " not found for mimetype " + mimetype, - properties.containsKey(WORD_COUNT_TEST_PROPERTY) - ); assertTrue( "Test Property " 
+ LAST_AUTHOR_TEST_PROPERTY + " not found for mimetype " + mimetype, properties.containsKey(LAST_AUTHOR_TEST_PROPERTY) ); if(mimetype.equals(MimetypeMap.MIMETYPE_WORD)) { + assertTrue( + "Test Property " + WORD_COUNT_TEST_PROPERTY + " not found for mimetype " + mimetype, + properties.containsKey(WORD_COUNT_TEST_PROPERTY) + ); + assertEquals( "Test Property " + WORD_COUNT_TEST_PROPERTY + " incorrect for mimetype " + mimetype, "9", @@ -151,15 +152,16 @@ public class OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest AbstractMetadataExtracterTest.QUICK_PREVIOUS_AUTHOR, DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(LAST_AUTHOR_TEST_PROPERTY))); } else if(mimetype.equals(MimetypeMap.MIMETYPE_EXCEL)) { - assertEquals( - "Test Property " + WORD_COUNT_TEST_PROPERTY + " not found for mimetype " + mimetype, - "0", - DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(WORD_COUNT_TEST_PROPERTY))); assertEquals( "Test Property " + LAST_AUTHOR_TEST_PROPERTY + " not found for mimetype " + mimetype, AbstractMetadataExtracterTest.QUICK_PREVIOUS_AUTHOR, DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(LAST_AUTHOR_TEST_PROPERTY))); } else if(mimetype.equals(MimetypeMap.MIMETYPE_PPT)) { + assertTrue( + "Test Property " + WORD_COUNT_TEST_PROPERTY + " not found for mimetype " + mimetype, + properties.containsKey(WORD_COUNT_TEST_PROPERTY) + ); + assertEquals( "Test Property " + WORD_COUNT_TEST_PROPERTY + " not found for mimetype " + mimetype, "9", diff --git a/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.java index eb17b07a51..965bd9d09c 100644 --- a/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.java @@ -18,26 +18,13 @@ */ package org.alfresco.repo.content.metadata; -import java.io.IOException; -import java.io.InputStream; 
-import java.io.Serializable; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Map; +import java.util.ArrayList; import org.alfresco.repo.content.MimetypeMap; -import org.alfresco.service.cmr.repository.ContentReader; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.poi.POIXMLDocument; -import org.apache.poi.POIXMLPropertiesTextExtractor; -import org.apache.poi.POIXMLProperties.CoreProperties; -import org.apache.poi.openxml4j.exceptions.OpenXML4JException; -import org.apache.poi.openxml4j.opc.OPCPackage; -import org.apache.poi.xslf.XSLFSlideShow; -import org.apache.poi.xssf.usermodel.XSSFWorkbook; -import org.apache.poi.xwpf.usermodel.XWPFDocument; -import org.apache.xmlbeans.XmlException; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; /** * POI-based metadata extractor for Office 07 documents. @@ -50,77 +37,28 @@ import org.apache.xmlbeans.XmlException; * Any custom property: -- [not mapped] * * - * TIKA Note - all the fields (plus a few others) are present - * in the tika metadata. 
+ * Uses Apache Tika * * @author Neil McErlean */ -public class PoiMetadataExtracter extends AbstractMappingMetadataExtracter +public class PoiMetadataExtracter extends TikaPoweredMetadataExtracter { protected static Log logger = LogFactory.getLog(PoiMetadataExtracter.class); - private static final String KEY_AUTHOR = "author"; - private static final String KEY_TITLE = "title"; - private static final String KEY_SUBJECT = "subject"; - private static final String KEY_CREATED = "created"; - private static final String KEY_DESCRIPTION = "description"; - - public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, - MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, - MimetypeMap.MIMETYPE_OPENXML_PRESENTATION}; + public static ArrayList SUPPORTED_MIMETYPES = buildSupportedMimetypes( + new String[] {MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING, + MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET, + MimetypeMap.MIMETYPE_OPENXML_PRESENTATION}, + new OOXMLParser() + ); public PoiMetadataExtracter() { - super(new HashSet(Arrays.asList(SUPPORTED_MIMETYPES))); + super(SUPPORTED_MIMETYPES); } @Override - public Map extractRaw(ContentReader reader) throws Throwable - { - Map rawProperties = newRawMap(); - - InputStream is = null; - try - { - is = reader.getContentInputStream(); - POIXMLDocument document = readDocumentFromStream(is, reader.getMimetype()); - - POIXMLPropertiesTextExtractor extracter = new POIXMLPropertiesTextExtractor(document); - CoreProperties coreProps = extracter.getCoreProperties(); - - putRawValue(KEY_AUTHOR, coreProps.getCreator(), rawProperties); - putRawValue(KEY_TITLE, coreProps.getTitle(), rawProperties); - putRawValue(KEY_SUBJECT, coreProps.getSubject(), rawProperties); - putRawValue(KEY_DESCRIPTION, coreProps.getDescription(), rawProperties); - putRawValue(KEY_CREATED, coreProps.getCreated(), rawProperties); - } - finally - { - if (is != null) - { - try { is.close(); } catch (IOException e) {} - } - } - - return 
rawProperties; + protected Parser getParser() { + return new OOXMLParser(); } - - private POIXMLDocument readDocumentFromStream(InputStream is, String mimetype) - throws IOException, OpenXML4JException, XmlException { - POIXMLDocument document = null; - if (MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING.equals(mimetype)) - { - document = new XWPFDocument(OPCPackage.open(is)); - } - else if (MimetypeMap.MIMETYPE_OPENXML_SPREADSHEET.equals(mimetype)) - { - document = new XSSFWorkbook(OPCPackage.open(is)); - } - else if (MimetypeMap.MIMETYPE_OPENXML_PRESENTATION.equals(mimetype)) - { - document = new XSLFSlideShow(OPCPackage.open(is)); - } - - return document; - } } diff --git a/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.properties b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.properties index ebc8f9411c..0211e61c8d 100644 --- a/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.properties +++ b/source/java/org/alfresco/repo/content/metadata/PoiMetadataExtracter.properties @@ -11,4 +11,3 @@ author=cm:author title=cm:title description=cm:description created=cm:created - diff --git a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java new file mode 100644 index 0000000000..d6d411611e --- /dev/null +++ b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java @@ -0,0 +1,208 @@ +/* + * Copyright (C) 2005-2010 Alfresco Software Limited. + * + * This file is part of Alfresco + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see . + */ +package org.alfresco.repo.content.metadata; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; +import java.text.DateFormat; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashSet; +import java.util.Locale; +import java.util.Map; + +import org.alfresco.service.cmr.repository.ContentReader; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; + +/** + * The parent of all Metadata Extractors which use + * Apache Tika under the hood. + * This handles all the common parts of processing the + * files, and the common mappings. + * Individual extractors extend from this to do custom + * mappings. + + *
+ *   author:                 --      cm:author
+ *   title:                  --      cm:title
+ *   description:            --      cm:description   (falls back to subject)
+ *   created:                --      cm:created
+ *   comments:
+ * 
+ * + * @author Nick Burch + */ +public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetadataExtracter +{ + protected static Log logger = LogFactory.getLog(TikaPoweredMetadataExtracter.class); + + protected static final String KEY_AUTHOR = "author"; + protected static final String KEY_TITLE = "title"; + protected static final String KEY_SUBJECT = "subject"; + protected static final String KEY_CREATED = "created"; + protected static final String KEY_DESCRIPTION = "description"; + protected static final String KEY_COMMENTS = "comments"; + + private DateFormat[] tikaDateFormats; + + /** + * Builds up a list of supported mime types by merging an explicit + * list with any that Tika also claims to support + */ + protected static ArrayList buildSupportedMimetypes(String[] explicitTypes, Parser tikaParser) { + ArrayList types = new ArrayList(); + for(String type : explicitTypes) { + if(!types.contains(type)) { + types.add(type); + } + } + if(tikaParser != null) { + for(MediaType mt : tikaParser.getSupportedTypes(new ParseContext())) { + String type = mt.toString(); + if(!types.contains(type)) { + types.add(type); + } + } + } + return types; + } + + public TikaPoweredMetadataExtracter(ArrayList supportedMimeTypes) + { + this(new HashSet(supportedMimeTypes)); + } + public TikaPoweredMetadataExtracter(HashSet supportedMimeTypes) + { + super(supportedMimeTypes); + + this.tikaDateFormats = new DateFormat[] { + new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ"), + new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US), + new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss"), + new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.US), + new SimpleDateFormat("yyyy-MM-dd"), + new SimpleDateFormat("yyyy-MM-dd", Locale.US), + new SimpleDateFormat("EEE MMM dd hh:mm:ss zzz yyyy"), + new SimpleDateFormat("EEE MMM dd hh:mm:ss zzz yyyy", Locale.US) + }; + } + + /** + * Version which also tries the ISO-8601 formats (in order..), + * and similar formats, which Tika makes use 
of + */ + protected Date makeDate(String dateStr) { + // Try our formats first, in order + for(DateFormat df : this.tikaDateFormats) { + try + { + return df.parse(dateStr); + } + catch (ParseException ee) + { + // Didn't work + } + } + + // Fall back to the normal ones + return super.makeDate(dateStr); + } + + /** + * Does auto-detection to select the best Tika + * Parser. + * Implementations can override this if they + * know their specific implementations. + */ + protected Parser getParser() { + return null; + } + + /** + * Allows implementation specific mappings + * to be done. + */ + protected Map extractSpecific(Metadata metadata, Map properties) { + return properties; + } + + @Override + protected Map extractRaw(ContentReader reader) throws Throwable + { + Map rawProperties = newRawMap(); + + InputStream is = null; + try + { + is = reader.getContentInputStream(); + Parser parser = getParser(); + ContentHandler handler = new BodyContentHandler() ; + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + + parser.parse(is, handler, metadata, context); + + putRawValue(KEY_AUTHOR, metadata.get(Metadata.AUTHOR), rawProperties); + putRawValue(KEY_TITLE, metadata.get(Metadata.TITLE), rawProperties); + putRawValue(KEY_COMMENTS, metadata.get(Metadata.COMMENTS), rawProperties); + + // Not everything is as consisent about these two as you might hope + String subject = metadata.get(Metadata.SUBJECT); + String description = metadata.get(Metadata.DESCRIPTION); + if(subject != null && description != null) { + putRawValue(KEY_DESCRIPTION, description, rawProperties); + putRawValue(KEY_SUBJECT, subject, rawProperties); + } else if(subject != null) { + putRawValue(KEY_DESCRIPTION, subject, rawProperties); + putRawValue(KEY_SUBJECT, subject, rawProperties); + } else if(description != null) { + putRawValue(KEY_DESCRIPTION, description, rawProperties); + putRawValue(KEY_SUBJECT, description, rawProperties); + } + + // Try for the dates two different 
ways too + if(metadata.get(Metadata.CREATION_DATE) != null) { + putRawValue(KEY_CREATED, metadata.get(Metadata.CREATION_DATE), rawProperties); + } else if(metadata.get(Metadata.DATE) != null) { + putRawValue(KEY_CREATED, metadata.get(Metadata.DATE), rawProperties); + } + + rawProperties = extractSpecific(metadata, rawProperties); + } + finally + { + if (is != null) + { + try { is.close(); } catch (IOException e) {} + } + } + + return rawProperties; + } +} diff --git a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.properties b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.properties new file mode 100644 index 0000000000..b0cdc22aa5 --- /dev/null +++ b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.properties @@ -0,0 +1,13 @@ +# +# TikaPoweredMetadataExtracter - default mapping +# +# author: Nick Burch + +# Namespaces +namespace.prefix.cm=http://www.alfresco.org/model/content/1.0 + +# Mappings +author=cm:author +title=cm:title +description=cm:description +created=cm:created