/* * Copyright (C) 2005-2014 Alfresco Software Limited. * * This file is part of Alfresco * * Alfresco is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Alfresco is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with Alfresco. If not, see . */ package org.alfresco.repo.content.metadata; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.Serializable; import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.Locale; import java.util.Map; import java.util.Set; import org.alfresco.api.AlfrescoPublicApi; import org.alfresco.repo.content.MimetypeMap; import org.alfresco.repo.content.filestore.FileContentReader; import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; import org.alfresco.service.cmr.repository.datatype.TypeConversionException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.poi.patch.AlfrescoPoiPatchUtils; import org.apache.tika.embedder.Embedder; import org.apache.tika.extractor.DocumentSelector; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; 
import org.apache.tika.sax.ContentHandlerDecorator; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.sax.xpath.Matcher; import org.apache.tika.sax.xpath.MatchingContentHandler; import org.apache.tika.sax.xpath.XPathParser; import org.joda.time.DateTimeZone; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import org.joda.time.format.DateTimeFormatterBuilder; import org.joda.time.format.DateTimeParser; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.Locator; import org.xml.sax.SAXException; /** * The parent of all Metadata Extractors which use * Apache Tika under the hood. This handles all the * common parts of processing the files, and the common * mappings. Individual extractors extend from this * to do custom mappings. *

 * <pre>
 *   author:                 --      cm:author
 *   title:                  --      cm:title
 *   subject:                --      cm:description
 *   created:                --      cm:created
 *   comments:
 * </pre>
 *

* * @since 3.4 * @author Nick Burch */ @AlfrescoPublicApi public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetadataExtracter implements MetadataEmbedder { protected static Log logger = LogFactory.getLog(TikaPoweredMetadataExtracter.class); protected static final String KEY_AUTHOR = "author"; protected static final String KEY_TITLE = "title"; protected static final String KEY_SUBJECT = "subject"; protected static final String KEY_CREATED = "created"; protected static final String KEY_DESCRIPTION = "description"; protected static final String KEY_COMMENTS = "comments"; private DateTimeFormatter tikaUTCDateFormater; private DateTimeFormatter tikaDateFormater; protected DocumentSelector documentSelector; private String extractorContext = null; /** * Builds up a list of supported mime types by merging * an explicit list with any that Tika also claims to support */ protected static ArrayList buildSupportedMimetypes(String[] explicitTypes, Parser... tikaParsers) { ArrayList types = new ArrayList(); for(String type : explicitTypes) { if(!types.contains(type)) { types.add(type); } } if(tikaParsers != null) { for(Parser tikaParser : tikaParsers) { for(MediaType mt : tikaParser.getSupportedTypes(new ParseContext())) { String type = mt.toString(); if(!types.contains(type)) { types.add(type); } } } } return types; } public TikaPoweredMetadataExtracter(String extractorContext, ArrayList supportedMimeTypes) { this(extractorContext, new HashSet(supportedMimeTypes), null); } public TikaPoweredMetadataExtracter(ArrayList supportedMimeTypes) { this(null, new HashSet(supportedMimeTypes), null); } public TikaPoweredMetadataExtracter(ArrayList supportedMimeTypes, ArrayList supportedEmbedMimeTypes) { this(null, new HashSet(supportedMimeTypes), new HashSet(supportedEmbedMimeTypes)); } public TikaPoweredMetadataExtracter(HashSet supportedMimeTypes) { this(null, supportedMimeTypes, null); } public TikaPoweredMetadataExtracter(HashSet supportedMimeTypes, HashSet 
supportedEmbedMimeTypes) { this(null, supportedMimeTypes, supportedEmbedMimeTypes); } public TikaPoweredMetadataExtracter(String extractorContext, HashSet supportedMimeTypes, HashSet supportedEmbedMimeTypes) { super(supportedMimeTypes, supportedEmbedMimeTypes); this.extractorContext = extractorContext; // TODO Once TIKA-451 is fixed this list will get nicer DateTimeParser[] parsersUTC = { DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss'Z'").getParser(), DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ssZ").getParser() }; DateTimeParser[] parsers = { DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss").getParser(), DateTimeFormat.forPattern("yyyy-MM-dd").getParser(), DateTimeFormat.forPattern("yyyy/MM/dd HH:mm:ss").getParser(), DateTimeFormat.forPattern("yyyy/MM/dd").getParser(), DateTimeFormat.forPattern("EEE MMM dd hh:mm:ss zzz yyyy").getParser() }; this.tikaUTCDateFormater = new DateTimeFormatterBuilder().append(null, parsersUTC).toFormatter().withZone(DateTimeZone.UTC); this.tikaDateFormater = new DateTimeFormatterBuilder().append(null, parsers).toFormatter(); } /** * Gets context for the current implementation * * @return {@link String} value which determines current context */ protected String getExtractorContext() { return extractorContext; } /** * Version which also tries the ISO-8601 formats (in order..), * and similar formats, which Tika makes use of */ @Override protected Date makeDate(String dateStr) { // Try our formats first, in order try { return this.tikaUTCDateFormater.parseDateTime(dateStr).toDate(); } catch (IllegalArgumentException e) {} try { return this.tikaUTCDateFormater.withLocale(Locale.US).parseDateTime(dateStr).toDate(); } catch (IllegalArgumentException e) {} try { return this.tikaDateFormater.parseDateTime(dateStr).toDate(); } catch (IllegalArgumentException e) {} try { return this.tikaDateFormater.withLocale(Locale.US).parseDateTime(dateStr).toDate(); } catch (IllegalArgumentException e) {} // Fall back to the normal ones return 
super.makeDate(dateStr); } /** * Returns the correct Tika Parser to process the document. * If you don't know which you want, use {@link TikaAutoMetadataExtracter} * which makes use of the Tika auto-detection. */ protected abstract Parser getParser(); /** * Returns the Tika Embedder to modify * the document. * * @return the Tika embedder */ protected Embedder getEmbedder() { // TODO make this an abstract method once more extracters support embedding return null; } /** * Do we care about the contents of the * extracted header, or nothing at all? */ protected boolean needHeaderContents() { return false; } /** * Allows implementation specific mappings to be done. */ protected Map extractSpecific(Metadata metadata, Map properties, Map headers) { return properties; } /** * There seems to be some sort of issue with some downstream * 3rd party libraries, and input streams that come from * a {@link ContentReader}. This happens most often with * JPEG and Tiff files. * For these cases, buffer out to a local file if not * already there */ protected InputStream getInputStream(ContentReader reader) throws IOException { // Prefer the File if available, it's generally quicker if(reader instanceof FileContentReader) { return TikaInputStream.get( ((FileContentReader)reader).getFile() ); } // Grab the InputStream for the Content InputStream input = reader.getContentInputStream(); // Images currently always require a file if(MimetypeMap.MIMETYPE_IMAGE_JPEG.equals(reader.getMimetype()) || MimetypeMap.MIMETYPE_IMAGE_TIFF.equals(reader.getMimetype())) { TemporaryResources tmp = new TemporaryResources(); TikaInputStream stream = TikaInputStream.get(input, tmp); stream.getFile(); // Have it turned into File backed return stream; } else { // The regular Content InputStream should be fine return input; } } /** * Sets the document selector, used for determining whether to parse embedded resources. 
* * @param documentSelector */ public void setDocumentSelector(DocumentSelector documentSelector) { this.documentSelector = documentSelector; } /** * Gets the document selector, used for determining whether to parse embedded resources, * null by default so parse all. * * @param metadata * @param targetMimeType * @return the document selector */ protected DocumentSelector getDocumentSelector(Metadata metadata, String targetMimeType) { return documentSelector; } /** * By default returns a new ParseContent * * @param metadata * @param sourceMimeType * @return the parse context */ protected ParseContext buildParseContext(Metadata metadata, String sourceMimeType) { ParseContext context = new ParseContext(); DocumentSelector selector = getDocumentSelector(metadata, sourceMimeType); if (selector != null) { context.set(DocumentSelector.class, selector); } return context; } @SuppressWarnings("deprecation") @Override protected Map extractRaw(ContentReader reader) throws Throwable { Map rawProperties = newRawMap(); InputStream is = null; // Parse using properties of the context of current implementation boolean contextPresented = null != extractorContext; try { is = getInputStream(reader); Parser parser = getParser(); Metadata metadata = new Metadata(); metadata.add(Metadata.CONTENT_TYPE, reader.getMimetype()); ParseContext context = buildParseContext(metadata, reader.getMimetype()); ContentHandler handler; Map headers = null; if(needHeaderContents()) { MapCaptureContentHandler headerCapture = new MapCaptureContentHandler(); headers = headerCapture.tags; handler = new HeadContentHandler(headerCapture); } else { handler = new NullContentHandler(); } // Set POI properties context if available... 
if (contextPresented) { AlfrescoPoiPatchUtils.setContext(extractorContext); } parser.parse(is, handler, metadata, context); // First up, copy all the Tika metadata over // This allows people to map any of the Tika // keys onto their own content model for(String tikaKey : metadata.names()) { putRawValue(tikaKey, getMetadataValue(metadata, tikaKey), rawProperties); } // Now, map the common Tika metadata keys onto // the common Alfresco metadata keys. This allows // existing mapping properties files to continue // to work without needing any changes // The simple ones putRawValue(KEY_AUTHOR, getMetadataValue(metadata, Metadata.AUTHOR), rawProperties); putRawValue(KEY_TITLE, getMetadataValue(metadata, Metadata.TITLE), rawProperties); putRawValue(KEY_COMMENTS, getMetadataValue(metadata, Metadata.COMMENTS), rawProperties); // Get the subject and description, despite things not // being nearly as consistent as one might hope String subject = getMetadataValue(metadata, Metadata.SUBJECT); String description = getMetadataValue(metadata, Metadata.DESCRIPTION); if(subject != null && description != null) { putRawValue(KEY_DESCRIPTION, description, rawProperties); putRawValue(KEY_SUBJECT, subject, rawProperties); } else if(subject != null) { putRawValue(KEY_DESCRIPTION, subject, rawProperties); putRawValue(KEY_SUBJECT, subject, rawProperties); } else if(description != null) { putRawValue(KEY_DESCRIPTION, description, rawProperties); putRawValue(KEY_SUBJECT, description, rawProperties); } // Try for the dates two different ways too if(metadata.get(Metadata.CREATION_DATE) != null) { putRawValue(KEY_CREATED, metadata.get(Metadata.CREATION_DATE), rawProperties); } else if(metadata.get(Metadata.DATE) != null) { putRawValue(KEY_CREATED, metadata.get(Metadata.DATE), rawProperties); } // If people created a specific instance // (eg OfficeMetadataExtractor), then allow that // instance to map the Tika keys onto its // existing namespace so that older properties // files continue to map 
correctly rawProperties = extractSpecific(metadata, rawProperties, headers); } finally { // Reset POI properties context if (contextPresented) { AlfrescoPoiPatchUtils.setContext(null); } if (is != null) { try { is.close(); } catch (IOException e) {} } } return rawProperties; } @Override protected void embedInternal(Map properties, ContentReader reader, ContentWriter writer) throws Throwable { Embedder embedder = getEmbedder(); if (embedder == null) { return; } Metadata metadataToEmbed = new Metadata(); for (String metadataKey : properties.keySet()) { Serializable value = properties.get(metadataKey); if (value == null) { continue; } if (value instanceof Collection) { for (Object singleValue : (Collection) value) { try { // Convert to a string value for Tika metadataToEmbed.add(metadataKey, DefaultTypeConverter.INSTANCE.convert(String.class, singleValue)); } catch (TypeConversionException e) { logger.info("Could not convert " + metadataKey + ": " + e.getMessage()); } } } else { try { // Convert to a string value for Tika metadataToEmbed.add(metadataKey, DefaultTypeConverter.INSTANCE.convert(String.class, value)); } catch (TypeConversionException e) { logger.info("Could not convert " + metadataKey + ": " + e.getMessage()); } } } InputStream inputStream = getInputStream(reader); OutputStream outputStream = writer.getContentOutputStream(); embedder.embed(metadataToEmbed, inputStream, outputStream, null); } private String getMetadataValue(Metadata metadata, String key) { if (metadata.isMultiValued(key)) { String[] parts = metadata.getValues(key); // use Set to prevent duplicates Set value = new LinkedHashSet(parts.length); for (int i = 0; i < parts.length; i++) { value.add(parts[i]); } String valueStr = value.toString(); // remove leading/trailing braces [] return valueStr.substring(1, valueStr.length() - 1); } else { return metadata.get(key); } } /** * Exif metadata for size also returns the string "pixels" * after the number value , this function will * stop at the 
first non digit character found in the text * @param sizeText string text * @return the size value */ protected String extractSize(String sizeText) { StringBuilder sizeValue = new StringBuilder(); for(char c : sizeText.toCharArray()) { if(Character.isDigit(c)) { sizeValue.append(c); } else { break; } } return sizeValue.toString(); } /** * This content handler will capture entries from within * the header of the Tika content XHTML, but ignore the * rest. */ protected static class HeadContentHandler extends ContentHandlerDecorator { /** * XHTML XPath parser. */ private static final XPathParser PARSER = new XPathParser("xhtml", XHTMLContentHandler.XHTML); /** * The XPath matcher used to select the XHTML body contents. */ private static final Matcher MATCHER = PARSER.parse("/xhtml:html/xhtml:head/descendant:node()"); /** * Creates a content handler that passes all XHTML body events to the * given underlying content handler. * * @param handler content handler */ protected HeadContentHandler(ContentHandler handler) { super(new MatchingContentHandler(handler, MATCHER)); } } /** * This content handler will grab all tags and attributes, * and record the textual content of the last seen one * of them. * Normally only used with {@link HeadContentHandler} */ protected static class MapCaptureContentHandler implements ContentHandler { protected Map tags = new HashMap(); private StringBuffer text; public void characters(char[] ch, int start, int len) { if(text != null) { text.append(ch, start, len); } } public void endElement(String namespace, String localname, String qname) { if(text != null && text.length() > 0) { tags.put(qname, text.toString()); } text = null; } public void startElement(String namespace, String localname, String qname, Attributes attrs) { for(int i=0; i