mirror of
				https://github.com/Alfresco/alfresco-community-repo.git
				synced 2025-10-29 15:21:53 +00:00 
			
		
		
		
	107541: Merged 5.0.N (5.0.3) to HEAD-BUG-FIX (5.1/Cloud) (PARTIAL MERGE)
      107413: Merged DEV to 5.0.N (5.0.3)
         106858 : MNT-13545: JavaDoc : Inconsistencies between the Java doc and the actual code
            - Cleaning of Javadoc,
   107565: MNT-13545 Fix compilation after merge of Javadoc
git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@107633 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
		
	
		
			
				
	
	
		
			109 lines
		
	
	
		
			3.8 KiB
		
	
	
	
		
			Java
		
	
	
	
	
	
			
		
		
	
	
			109 lines
		
	
	
		
			3.8 KiB
		
	
	
	
		
			Java
		
	
	
	
	
	
| /*
 | |
|  * Copyright (C) 2005-2010 Alfresco Software Limited.
 | |
|  *
 | |
|  * This file is part of Alfresco
 | |
|  *
 | |
|  * Alfresco is free software: you can redistribute it and/or modify
 | |
|  * it under the terms of the GNU Lesser General Public License as published by
 | |
|  * the Free Software Foundation, either version 3 of the License, or
 | |
|  * (at your option) any later version.
 | |
|  *
 | |
|  * Alfresco is distributed in the hope that it will be useful,
 | |
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|  * GNU Lesser General Public License for more details.
 | |
|  *
 | |
|  * You should have received a copy of the GNU Lesser General Public License
 | |
|  * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
 | |
|  */
 | |
| package org.alfresco.repo.content;
 | |
| 
 | |
| import java.io.IOException;
 | |
| import java.io.InputStream;
 | |
| import java.io.PushbackInputStream;
 | |
| import java.util.HashSet;
 | |
| import java.util.Set;
 | |
| 
 | |
| import org.apache.poi.poifs.common.POIFSConstants;
 | |
| import org.apache.poi.util.IOUtils;
 | |
| import org.apache.tika.exception.TikaException;
 | |
| import org.apache.tika.metadata.Metadata;
 | |
| import org.apache.tika.mime.MediaType;
 | |
| import org.apache.tika.parser.ParseContext;
 | |
| import org.apache.tika.parser.Parser;
 | |
| import org.apache.tika.parser.microsoft.OfficeParser;
 | |
| import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
 | |
| import org.xml.sax.ContentHandler;
 | |
| import org.xml.sax.SAXException;
 | |
| 
 | |
| /**
 | |
|  * <a href="http://tika.apache.org/Apache Tika">Apache Tika</a> assumes that
 | |
|  *  you either know exactly what your content is, or that
 | |
|  *  you'll leave it to auto-detection.
 | |
|  * Within Alfresco, we usually do know. However, from time
 | |
|  *  to time, we don't know if we have one of the old or one
 | |
|  *  of the new office files (eg .xls and .xlsx).
 | |
|  * This class allows automatically selects the appropriate
 | |
|  *  old (OLE2) or new (OOXML) Tika parser as required.
 | |
|  *    
 | |
|  * @author Nick Burch
 | |
|  */
 | |
| public class TikaOfficeDetectParser implements Parser {
 | |
|    private Parser ole2Parser = new OfficeParser();
 | |
|    private Parser ooxmlParser = new OOXMLParser();
 | |
| 
 | |
|    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
 | |
|       Set<MediaType> types = new HashSet<MediaType>();
 | |
|       types.addAll(ole2Parser.getSupportedTypes(parseContext));
 | |
|       types.addAll(ooxmlParser.getSupportedTypes(parseContext));
 | |
|       return types;
 | |
|    }
 | |
| 
 | |
|    public void parse(InputStream stream,
 | |
|          ContentHandler handler, Metadata metadata,
 | |
|          ParseContext parseContext) throws IOException, SAXException,
 | |
|          TikaException 
 | |
|    {
 | |
|       byte[] initial4 = new byte[4];
 | |
|       InputStream wrapped;
 | |
|       // Preserve TikaInputStreams as TikaInputStreams as they require less memory to process
 | |
|       if (stream.markSupported())
 | |
|       {
 | |
|          stream.mark(initial4.length);
 | |
|          IOUtils.readFully(stream, initial4);
 | |
|          stream.reset();
 | |
|          wrapped = stream;
 | |
|       }
 | |
|       else
 | |
|       {
 | |
|          PushbackInputStream inp = new PushbackInputStream(stream, 4);
 | |
|          IOUtils.readFully(inp, initial4);
 | |
|          inp.unread(initial4);
 | |
|          wrapped = inp;
 | |
|       }
 | |
|       
 | |
|       // Which is it?
 | |
|       if(initial4[0] == POIFSConstants.OOXML_FILE_HEADER[0] &&
 | |
|          initial4[1] == POIFSConstants.OOXML_FILE_HEADER[1] &&
 | |
|          initial4[2] == POIFSConstants.OOXML_FILE_HEADER[2] &&
 | |
|          initial4[3] == POIFSConstants.OOXML_FILE_HEADER[3])
 | |
|       {
 | |
|          ooxmlParser.parse(wrapped, handler, metadata, parseContext);
 | |
|       }
 | |
|       else
 | |
|       {
 | |
|          ole2Parser.parse(wrapped, handler, metadata, parseContext);
 | |
|       }
 | |
|    }
 | |
| 
 | |
|    /**
 | |
|     * @deprecated This method will be removed in Apache Tika 1.0.
 | |
|     */
 | |
|    public void parse(InputStream stream,
 | |
|          ContentHandler handler, Metadata metadata)
 | |
|          throws IOException, SAXException, TikaException 
 | |
|    {
 | |
|       parse(stream, handler, metadata, new ParseContext());
 | |
|    }
 | |
| }
 |