mirror of
				https://github.com/Alfresco/alfresco-community-repo.git
				synced 2025-10-15 15:02:20 +00:00 
			
		
		
		
	Enable explicit Tika content transform for OOXML files. Allow the Excel transformer (which does CSV as well as text/html) to handle .xlsx as well as .xls. Also update the .doc parser test to ensure that the older Word 6 and Word 95 files are correctly handled too. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20781 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
		
			
				
	
	
		
			96 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			Java
		
	
	
	
	
	
			
		
		
	
	
			96 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			Java
		
	
	
	
	
	
| /*
 | |
|  * Copyright (C) 2005-2010 Alfresco Software Limited.
 | |
|  *
 | |
|  * This file is part of Alfresco
 | |
|  *
 | |
|  * Alfresco is free software: you can redistribute it and/or modify
 | |
|  * it under the terms of the GNU Lesser General Public License as published by
 | |
|  * the Free Software Foundation, either version 3 of the License, or
 | |
|  * (at your option) any later version.
 | |
|  *
 | |
|  * Alfresco is distributed in the hope that it will be useful,
 | |
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|  * GNU Lesser General Public License for more details.
 | |
|  *
 | |
|  * You should have received a copy of the GNU Lesser General Public License
 | |
|  * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
 | |
|  */
 | |
| package org.alfresco.repo.content;
 | |
| 
 | |
| import java.io.IOException;
 | |
| import java.io.InputStream;
 | |
| import java.io.PushbackInputStream;
 | |
| import java.util.HashSet;
 | |
| import java.util.Set;
 | |
| 
 | |
| import org.apache.poi.poifs.common.POIFSConstants;
 | |
| import org.apache.poi.util.IOUtils;
 | |
| import org.apache.tika.exception.TikaException;
 | |
| import org.apache.tika.metadata.Metadata;
 | |
| import org.apache.tika.mime.MediaType;
 | |
| import org.apache.tika.parser.ParseContext;
 | |
| import org.apache.tika.parser.Parser;
 | |
| import org.apache.tika.parser.microsoft.OfficeParser;
 | |
| import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
 | |
| import org.xml.sax.ContentHandler;
 | |
| import org.xml.sax.SAXException;
 | |
| 
 | |
| /**
 | |
|  * {@link http://tika.apache.org/ Apache Tika} assumes that
 | |
|  *  you either know exactly what your content is, or that
 | |
|  *  you'll leave it to auto-detection.
 | |
|  * Within Alfresco, we usually do know. However, from time
 | |
|  *  to time, we don't know if we have one of the old or one
 | |
|  *  of the new office files (eg .xls and .xlsx).
 | |
|  * This class allows automatically selects the appropriate
 | |
|  *  old (OLE2) or new (OOXML) Tika parser as required.
 | |
|  *    
 | |
|  * @author Nick Burch
 | |
|  */
 | |
| public class TikaOfficeDetectParser implements Parser {
 | |
|    private Parser ole2Parser = new OfficeParser();
 | |
|    private Parser ooxmlParser = new OOXMLParser();
 | |
| 
 | |
|    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
 | |
|       Set<MediaType> types = new HashSet<MediaType>();
 | |
|       types.addAll(ole2Parser.getSupportedTypes(parseContext));
 | |
|       types.addAll(ooxmlParser.getSupportedTypes(parseContext));
 | |
|       return types;
 | |
|    }
 | |
| 
 | |
|    public void parse(InputStream stream,
 | |
|          ContentHandler handler, Metadata metadata,
 | |
|          ParseContext parseContext) throws IOException, SAXException,
 | |
|          TikaException 
 | |
|    {
 | |
|       PushbackInputStream inp = new PushbackInputStream(stream, 4);
 | |
|       byte[] initial4 = new byte[4];
 | |
|       IOUtils.readFully(inp, initial4);
 | |
|       inp.unread(initial4);
 | |
|       
 | |
|       // Which is it?
 | |
|       if(initial4[0] == POIFSConstants.OOXML_FILE_HEADER[0] &&
 | |
|          initial4[1] == POIFSConstants.OOXML_FILE_HEADER[1] &&
 | |
|          initial4[2] == POIFSConstants.OOXML_FILE_HEADER[2] &&
 | |
|          initial4[3] == POIFSConstants.OOXML_FILE_HEADER[3])
 | |
|       {
 | |
|          ooxmlParser.parse(inp, handler, metadata, parseContext);
 | |
|       }
 | |
|       else
 | |
|       {
 | |
|          ole2Parser.parse(inp, handler, metadata, parseContext);
 | |
|       }
 | |
|    }
 | |
| 
 | |
|    /**
 | |
|     * @deprecated This method will be removed in Apache Tika 1.0.
 | |
|     */
 | |
|    public void parse(InputStream stream,
 | |
|          ContentHandler handler, Metadata metadata)
 | |
|          throws IOException, SAXException, TikaException 
 | |
|    {
 | |
|       parse(stream, handler, metadata, new ParseContext());
 | |
|    }
 | |
| }
 |