/*
 * #%L
 * Alfresco Repository
 * %%
 * Copyright (C) 2005 - 2016 Alfresco Software Limited
 * %%
 * This file is part of the Alfresco software. 
 * If the software was purchased under a paid Alfresco license, the terms of 
 * the paid license agreement will prevail.  Otherwise, the software is 
 * provided under the following open source license terms:
 * 
 * Alfresco is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Alfresco is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with Alfresco. If not, see .
 * #L%
 */
/*
 * Copyright (C) 2005 Jesper Steen Møller
 *
 * This file is part of Alfresco
 *
 * Alfresco is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Alfresco is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Alfresco. If not, see .
 */
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Map;
import org.alfresco.repo.content.MimetypeMap;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
/**
 * Office file format Metadata Extracter.  This extracter uses the POI library to extract
 * the following:
 * 
 *   author:             --      cm:author
 *   title:              --      cm:title
 *   subject:            --      cm:description
 *   createDateTime:     --      cm:created
 *   lastSaveDateTime:   --      cm:modified
 *   comments:
 *   editTime:
 *   format:
 *   keywords:
 *   lastAuthor:
 *   lastPrinted:
 *   osVersion:
 *   thumbnail:
 *   pageCount:
 *   wordCount:
 * 
 * 
 * Uses Apache Tika
 *
 * @author Derek Hulley
 * @author Nick Burch
 */
public class OfficeMetadataExtracter extends TikaPoweredMetadataExtracter
{
    public static final String KEY_CREATE_DATETIME = "createDateTime";
    public static final String KEY_LAST_SAVE_DATETIME = "lastSaveDateTime";
    public static final String KEY_EDIT_TIME = "editTime";
    public static final String KEY_FORMAT = "format";
    public static final String KEY_KEYWORDS = "keywords";
    public static final String KEY_LAST_AUTHOR = "lastAuthor";
    public static final String KEY_LAST_PRINTED = "lastPrinted";
    public static final String KEY_OS_VERSION = "osVersion"; // TODO
    public static final String KEY_THUMBNAIL = "thumbnail"; // TODO
    public static final String KEY_PAGE_COUNT = "pageCount";
    public static final String KEY_PARAGRAPH_COUNT = "paragraphCount";
    public static final String KEY_WORD_COUNT = "wordCount";
    
    public static ArrayList SUPPORTED_MIMETYPES = buildSupportedMimetypes(
          new String[] {
              MimetypeMap.MIMETYPE_WORD,
              MimetypeMap.MIMETYPE_EXCEL,
              MimetypeMap.MIMETYPE_PPT,
              MimetypeMap.MIMETYPE_VISIO,
              MimetypeMap.MIMETYPE_VISIO_2013 },
          new OfficeParser()
    );
    static {
       // Outlook has it's own one!
       SUPPORTED_MIMETYPES.remove(MimetypeMap.MIMETYPE_OUTLOOK_MSG);
    }
    public OfficeMetadataExtracter()
    {
        super(SUPPORTED_MIMETYPES);
    }
    
    @Override
    protected Parser getParser() 
    {
      return new OfficeParser();
    }
    @SuppressWarnings("deprecation")
    @Override
    protected Map extractSpecific(Metadata metadata,
         Map properties, Map headers) 
    {
       putRawValue(KEY_CREATE_DATETIME, metadata.get(Metadata.CREATION_DATE), properties); 
       putRawValue(KEY_LAST_SAVE_DATETIME, metadata.get(Metadata.LAST_SAVED), properties);
       putRawValue(KEY_EDIT_TIME, metadata.get(Metadata.EDIT_TIME), properties);
       putRawValue(KEY_FORMAT, metadata.get(Metadata.FORMAT), properties);
       putRawValue(KEY_KEYWORDS, metadata.get(Metadata.KEYWORDS), properties);
       putRawValue(KEY_LAST_AUTHOR, metadata.get(Metadata.LAST_AUTHOR), properties);
       putRawValue(KEY_LAST_PRINTED, metadata.get(Metadata.LAST_PRINTED), properties);
//       putRawValue(KEY_OS_VERSION, metadata.get(Metadata.OS_VERSION), properties);
//       putRawValue(KEY_THUMBNAIL, metadata.get(Metadata.THUMBNAIL), properties);
       putRawValue(KEY_PAGE_COUNT, metadata.get(Metadata.PAGE_COUNT), properties);
       putRawValue(KEY_PARAGRAPH_COUNT, metadata.get(Metadata.PARAGRAPH_COUNT), properties);
       putRawValue(KEY_WORD_COUNT, metadata.get(Metadata.WORD_COUNT), properties);
       return properties;
    }
}