mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-06-30 18:15:39 +00:00
Enable explicit Tika content transform for OOXML files Allow the Excel transformer (which does CSV as well as text/html) to handle .xlsx as well as .xls Also update the .doc parser test to ensure that the older word 6 and word 95 files are correctly handled too git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20781 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
96 lines
3.4 KiB
Java
96 lines
3.4 KiB
Java
/*
|
|
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
|
*
|
|
* This file is part of Alfresco
|
|
*
|
|
* Alfresco is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Lesser General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Alfresco is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
package org.alfresco.repo.content;
|
|
|
|
import java.io.IOException;
|
|
import java.io.InputStream;
|
|
import java.io.PushbackInputStream;
|
|
import java.util.HashSet;
|
|
import java.util.Set;
|
|
|
|
import org.apache.poi.poifs.common.POIFSConstants;
|
|
import org.apache.poi.util.IOUtils;
|
|
import org.apache.tika.exception.TikaException;
|
|
import org.apache.tika.metadata.Metadata;
|
|
import org.apache.tika.mime.MediaType;
|
|
import org.apache.tika.parser.ParseContext;
|
|
import org.apache.tika.parser.Parser;
|
|
import org.apache.tika.parser.microsoft.OfficeParser;
|
|
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
|
import org.xml.sax.ContentHandler;
|
|
import org.xml.sax.SAXException;
|
|
|
|
/**
|
|
* {@link http://tika.apache.org/ Apache Tika} assumes that
|
|
* you either know exactly what your content is, or that
|
|
* you'll leave it to auto-detection.
|
|
* Within Alfresco, we usually do know. However, from time
|
|
* to time, we don't know if we have one of the old or one
|
|
* of the new office files (eg .xls and .xlsx).
|
|
* This class allows automatically selects the appropriate
|
|
* old (OLE2) or new (OOXML) Tika parser as required.
|
|
*
|
|
* @author Nick Burch
|
|
*/
|
|
public class TikaOfficeDetectParser implements Parser {
|
|
private Parser ole2Parser = new OfficeParser();
|
|
private Parser ooxmlParser = new OOXMLParser();
|
|
|
|
public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
|
|
Set<MediaType> types = new HashSet<MediaType>();
|
|
types.addAll(ole2Parser.getSupportedTypes(parseContext));
|
|
types.addAll(ooxmlParser.getSupportedTypes(parseContext));
|
|
return types;
|
|
}
|
|
|
|
public void parse(InputStream stream,
|
|
ContentHandler handler, Metadata metadata,
|
|
ParseContext parseContext) throws IOException, SAXException,
|
|
TikaException
|
|
{
|
|
PushbackInputStream inp = new PushbackInputStream(stream, 4);
|
|
byte[] initial4 = new byte[4];
|
|
IOUtils.readFully(inp, initial4);
|
|
inp.unread(initial4);
|
|
|
|
// Which is it?
|
|
if(initial4[0] == POIFSConstants.OOXML_FILE_HEADER[0] &&
|
|
initial4[1] == POIFSConstants.OOXML_FILE_HEADER[1] &&
|
|
initial4[2] == POIFSConstants.OOXML_FILE_HEADER[2] &&
|
|
initial4[3] == POIFSConstants.OOXML_FILE_HEADER[3])
|
|
{
|
|
ooxmlParser.parse(inp, handler, metadata, parseContext);
|
|
}
|
|
else
|
|
{
|
|
ole2Parser.parse(inp, handler, metadata, parseContext);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* @deprecated This method will be removed in Apache Tika 1.0.
|
|
*/
|
|
public void parse(InputStream stream,
|
|
ContentHandler handler, Metadata metadata)
|
|
throws IOException, SAXException, TikaException
|
|
{
|
|
parse(stream, handler, metadata, new ParseContext());
|
|
}
|
|
}
|