/* * Copyright (C) 2005-2010 Alfresco Software Limited. * * This file is part of Alfresco * * Alfresco is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Alfresco is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with Alfresco. If not, see . */ package org.alfresco.repo.content.metadata; import java.util.ArrayList; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.tika.config.TikaConfig; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.Parser; /** * A Metadata Extractor which makes use of the Apache * Tika auto-detection to select the best parser * to extract the metadata from your document. * This will be used for all files which Tika can * handle, but where no other more explicit * extractor is defined. *
 *   author:                 --      cm:author
 *   title:                  --      cm:title
 *   subject:                --      cm:description
 *   created:                --      cm:created
 *   comments:
 *   

 *   geo:lat:                --      cm:latitude
 *   geo:long:               --      cm:longitude
*
 * @since 3.4
 * @author Nick Burch
 */
public class TikaAutoMetadataExtracter extends TikaPoweredMetadataExtracter
{
    protected static Log logger = LogFactory.getLog(TikaAutoMetadataExtracter.class);

    // NOTE: parser, config and SUPPORTED_MIMETYPES are static, so they are
    //  shared by every instance of this class and are replaced each time
    //  a new instance is constructed (the most recent TikaConfig wins).
    private static AutoDetectParser parser;
    private static TikaConfig config;

    /**
     * The mime types (canonical names plus their aliases) handled by the
     *  parsers of the most recently supplied Tika configuration.
     * This is <code>null</code> until the first instance has been constructed.
     */
    public static ArrayList<String> SUPPORTED_MIMETYPES;

    /**
     * Creates the auto-detect parser for the given Tika configuration and
     *  works out the list of mime types which that parser can handle.
     * As a side effect, the shared parser, configuration and
     *  {@link #SUPPORTED_MIMETYPES} are replaced.
     *
     * @param tikaConfig the Tika configuration to build the parser from
     * @return the supported mime types, as the canonical type of each parser
     *          followed by the aliases of that type
     */
    private static ArrayList<String> buildMimeTypes(TikaConfig tikaConfig)
    {
        AutoDetectParser autoDetectParser = new AutoDetectParser(tikaConfig);

        // Build into a local list, so that the shared static list is never
        //  visible to other code while it is only partly populated
        ArrayList<String> mimeTypes = new ArrayList<String>();
        for (MediaType mt : autoDetectParser.getParsers().keySet())
        {
            // Add the canonical mime type
            mimeTypes.add(mt.toString());

            // And add any aliases of the mime type too - Alfresco uses some
            //  non canonical forms of various mimetypes, so we need all of them
            for (MediaType alias : tikaConfig.getMediaTypeRegistry().getAliases(mt))
            {
                mimeTypes.add(alias.toString());
            }
        }

        // Only replace the shared state once everything was built successfully
        config = tikaConfig;
        parser = autoDetectParser;
        SUPPORTED_MIMETYPES = mimeTypes;
        return mimeTypes;
    }

    /**
     * Creates the extracter, registering it for every mime type that
     *  the parsers in the given Tika configuration support.
     *
     * @param tikaConfig the Tika configuration to take the parsers from
     */
    public TikaAutoMetadataExtracter(TikaConfig tikaConfig)
    {
        super(buildMimeTypes(tikaConfig));
    }

    /**
     * Does auto-detection to select the best Tika
     *  Parser.
     *
     * @return the shared auto-detect parser, built from the Tika
     *          configuration most recently passed to the constructor
     */
    @Override
    protected Parser getParser()
    {
        return parser;
    }
}