/*
 * Copyright (C) 2005-2010 Alfresco Software Limited.
 *
 * This file is part of Alfresco
 *
 * Alfresco is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Alfresco is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Alfresco. If not, see .
 */
package org.alfresco.repo.content.metadata;
import java.util.ArrayList;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
/**
 * A Metadata Extractor which makes use of the Apache
 *  Tika auto-detection to select the best parser
 *  to extract the metadata from your document.
 * This will be used for all files which Tika can
 *  handle, but where no other more explicit
 *  extractor is defined. 
 * 
 *   author:                 --      cm:author
 *   title:                  --      cm:title
 *   subject:                --      cm:description
 *   created:                --      cm:created
 *   comments:
 *   geo:lat:                --      cm:latitude
 *   
geo:long:               --      cm:longitude
 * 
 * 
 * @since 3.4
 * @author Nick Burch
 */
public class TikaAutoMetadataExtracter extends TikaPoweredMetadataExtracter
{
    protected static Log logger = LogFactory.getLog(TikaAutoMetadataExtracter.class);
    private static AutoDetectParser parser;
    private static TikaConfig config;
    public static ArrayList SUPPORTED_MIMETYPES;
    private static ArrayList buildMimeTypes(TikaConfig tikaConfig)
    {
       config = tikaConfig;
       parser = new AutoDetectParser(config);
       SUPPORTED_MIMETYPES = new ArrayList();
       for(MediaType mt : parser.getParsers().keySet()) 
       {
          // Add the canonical mime type
          SUPPORTED_MIMETYPES.add( mt.toString() );
          
          // And add any aliases of the mime type too - Alfresco uses some
          //  non canonical forms of various mimetypes, so we need all of them
          for(MediaType alias : config.getMediaTypeRegistry().getAliases(mt)) 
          {
              SUPPORTED_MIMETYPES.add( alias.toString() );
          }
       }
       return SUPPORTED_MIMETYPES;
    }
    
    public TikaAutoMetadataExtracter(TikaConfig tikaConfig)
    {
       super( buildMimeTypes(tikaConfig) );
    }
    
    /**
     * Does auto-detection to select the best Tika
     *  Parser.
     */
    @Override
    protected Parser getParser() 
    {
       return parser;
    }
}