mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-08-07 17:49:17 +00:00
Merged DEV/SWIFT to HEAD (FTP Tests, Tika and Poi)
26059: ALF-5900 - IMAP creates winmail.dat in attachment folder (Add support for Microsoft Transport Neutral Encapsulation Format.) - added attachment extraction for TNEF documents - goodbye winmail.dat ! 26063: javadoc for imap. 26088: ALF-7408 - addition of commons-net for ftp client library. First test of end to end ftp. Just a simple test of connection now, will be followed by more detailed tests. 26176: ALF-7408 - FTP tests + disabled failing test case for ALF-7618 26180: ALF-7618 - correction of unit test error. 26188: ALF-7618 - added a test of paths 26229: Added back simple '\~.*' pattern 26288: ALF-7676 - Test to stress different user rights. - FTPServerTest.testTwoUserUpdate added for the FTP server. 26304: Corrected spelling name in private class. 26408: adding minimal package infos. 26416: ALF-5082 / ALF-2183 / ALF-4448 - When guessing the mimetype for a file, add the option to supply a ContentReader to enhance the accuracy. Enable this for a few key places that do mimetype guessing, which should avoid issues for files with the wrong extension (either renamed accidentally, or for .TMP) 26433: Re-order the mimetype guess step to ensure that the Content Reader is always valid 26440: Added another test for word 2003 save as. 26441: Test resource for ContentDiskDriver 26446: ALF-5082 - Back out a FileFolderService change to mimetype guessing, which had broken things, pending a better way to do it with ContentWriter 26490: Small change for ContentDiskDriverTest.fileExists. Leaky transaction causing problems in automated build. 26497: ContentDiskDriver - commented out two of the problematic leaky transaction tests. 26503: Add new interface methods + documentation for asking a ContentWriter to guess the mimetype and encoding for you. 
(Code will be migrated from places that currently do this themselves later) 26504: Add an extension interface in the DataModel project for some of the extra ContentReader methods that FileContentReader provides 26505: When ContentWriter.putContent(String) is called with no encoding specified, record what the system default encoding was that was used. (Prevents issues if the system default is ever changed) 26509: When calling Tika to do file detection, if we have a file based reader then give Tika the File rather than an InputStream 26522: More debug logging while debugging ALF-5260 26546: Have one copy of the Tika Config in spring, rather than several places fetching their own copy of the default one (either explicitly or implicitly). 26522: More debug logging while diagnosing ALF-5260 26548: Add another mimetype check - ensures that truncated/corrupt container files which can't be fully processed can still get the container type without failure 26549: Implement the mimetype and encoding guessers on ContentWriter (either immediately or as a listener, as required), and update FileFolderService to make use of this (+test this) 26553: Replace explicit mimetype and encoding guess calls with ContentWriter requests to have the work done 26554: Replace explicit mimetype and encoding guess calls with ContentWriter requests to have the work done 26579: Switch the transformer to use Tika git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@28224 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -29,16 +29,19 @@ import java.nio.channels.Channels;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.channels.ReadableByteChannel;
|
||||
import java.nio.channels.WritableByteChannel;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.alfresco.error.AlfrescoRuntimeException;
|
||||
import org.alfresco.repo.content.encoding.ContentCharsetFinder;
|
||||
import org.alfresco.repo.content.filestore.FileContentWriter;
|
||||
import org.alfresco.service.cmr.repository.ContentAccessor;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentStreamListener;
|
||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||
import org.alfresco.service.cmr.repository.MimetypeService;
|
||||
import org.alfresco.util.TempFileProvider;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
@@ -62,6 +65,8 @@ public abstract class AbstractContentWriter extends AbstractContentAccessor impl
|
||||
private List<ContentStreamListener> listeners;
|
||||
private WritableByteChannel channel;
|
||||
private ContentReader existingContentReader;
|
||||
private MimetypeService mimetypeService;
|
||||
private DoGuessingOnCloseListener guessingOnCloseListener;
|
||||
|
||||
/**
|
||||
* @param contentUrl the content URL
|
||||
@@ -73,6 +78,21 @@ public abstract class AbstractContentWriter extends AbstractContentAccessor impl
|
||||
this.existingContentReader = existingContentReader;
|
||||
|
||||
listeners = new ArrayList<ContentStreamListener>(2);
|
||||
|
||||
// We always register our own listener as the first one
|
||||
// This allows us to perform any guessing (if needed) before
|
||||
// the normal listeners kick in and eg write things to the DB
|
||||
guessingOnCloseListener = new DoGuessingOnCloseListener();
|
||||
listeners.add(guessingOnCloseListener);
|
||||
}
|
||||
|
||||
/**
|
||||
* Supplies the Mimetype Service to be used when guessing
|
||||
* encoding and mimetype information.
|
||||
*/
|
||||
public void setMimetypeService(MimetypeService mimetypeService)
|
||||
{
|
||||
this.mimetypeService = mimetypeService;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -454,7 +474,19 @@ public abstract class AbstractContentWriter extends AbstractContentAccessor impl
|
||||
{
|
||||
// attempt to use the correct encoding
|
||||
String encoding = getEncoding();
|
||||
byte[] bytes = (encoding == null) ? content.getBytes() : content.getBytes(encoding);
|
||||
byte[] bytes;
|
||||
if(encoding == null)
|
||||
{
|
||||
// Use the system default, and record what that was
|
||||
bytes = content.getBytes();
|
||||
setEncoding( System.getProperty("file.encoding") );
|
||||
}
|
||||
else
|
||||
{
|
||||
// Use the encoding that they specified
|
||||
bytes = content.getBytes(encoding);
|
||||
}
|
||||
|
||||
// get the stream
|
||||
OutputStream os = getContentOutputStream();
|
||||
ByteArrayInputStream is = new ByteArrayInputStream(bytes);
|
||||
@@ -469,4 +501,108 @@ public abstract class AbstractContentWriter extends AbstractContentAccessor impl
|
||||
e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* When the content has been written, attempt to guess
|
||||
* the encoding of it.
|
||||
*
|
||||
* @see ContentWriter#guessEncoding()
|
||||
*/
|
||||
public void guessEncoding()
|
||||
{
|
||||
if (mimetypeService == null)
|
||||
{
|
||||
logger.warn("MimetypeService not supplied, but required for content guessing");
|
||||
return;
|
||||
}
|
||||
|
||||
if(isClosed())
|
||||
{
|
||||
// Content written, can do it now
|
||||
doGuessEncoding();
|
||||
}
|
||||
else
|
||||
{
|
||||
// Content not yet written, wait for the
|
||||
// data to be written before doing so
|
||||
guessingOnCloseListener.guessEncoding = true;
|
||||
}
|
||||
}
|
||||
private void doGuessEncoding()
|
||||
{
|
||||
ContentCharsetFinder charsetFinder = mimetypeService.getContentCharsetFinder();
|
||||
|
||||
ContentReader reader = getReader();
|
||||
InputStream is = reader.getContentInputStream();
|
||||
Charset charset = charsetFinder.getCharset(is, getMimetype());
|
||||
try
|
||||
{
|
||||
is.close();
|
||||
}
|
||||
catch(IOException e)
|
||||
{}
|
||||
|
||||
setEncoding(charset.name());
|
||||
}
|
||||
|
||||
/**
|
||||
* When the content has been written, attempt to guess
|
||||
* the mimetype of it, using the filename and contents.
|
||||
*
|
||||
* @see ContentWriter#guessMimetype(String)
|
||||
*/
|
||||
public void guessMimetype(String filename)
|
||||
{
|
||||
if (mimetypeService == null)
|
||||
{
|
||||
logger.warn("MimetypeService not supplied, but required for content guessing");
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
if(isClosed())
|
||||
{
|
||||
// Content written, can do it now
|
||||
doGuessMimetype(filename);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Content not yet written, wait for the
|
||||
// data to be written before doing so
|
||||
guessingOnCloseListener.guessMimetype = true;
|
||||
guessingOnCloseListener.filename = filename;
|
||||
}
|
||||
}
|
||||
private void doGuessMimetype(String filename)
|
||||
{
|
||||
String mimetype = mimetypeService.guessMimetype(
|
||||
filename, getReader()
|
||||
);
|
||||
setMimetype(mimetype);
|
||||
}
|
||||
|
||||
/**
|
||||
* Our own listener that is always the first on the list,
|
||||
* which lets us perform guessing operations when the
|
||||
* content has been written.
|
||||
*/
|
||||
private class DoGuessingOnCloseListener implements ContentStreamListener
|
||||
{
|
||||
private boolean guessEncoding = false;
|
||||
private boolean guessMimetype = false;
|
||||
private String filename = null;
|
||||
|
||||
@Override
|
||||
public void contentStreamClosed() throws ContentIOException
|
||||
{
|
||||
if(guessMimetype)
|
||||
{
|
||||
doGuessMimetype(filename);
|
||||
}
|
||||
if(guessEncoding)
|
||||
{
|
||||
doGuessEncoding();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -50,6 +50,7 @@ import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentService;
|
||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||
import org.alfresco.service.cmr.repository.MimetypeService;
|
||||
import org.alfresco.service.cmr.repository.NoTransformerException;
|
||||
import org.alfresco.service.cmr.repository.NodeRef;
|
||||
import org.alfresco.service.cmr.repository.NodeService;
|
||||
@@ -85,6 +86,7 @@ public class ContentServiceImpl implements ContentService, ApplicationContextAwa
|
||||
private DictionaryService dictionaryService;
|
||||
private NodeService nodeService;
|
||||
private AVMService avmService;
|
||||
private MimetypeService mimetypeService;
|
||||
private RetryingTransactionHelper transactionHelper;
|
||||
private ApplicationContext applicationContext;
|
||||
|
||||
@@ -127,6 +129,11 @@ public class ContentServiceImpl implements ContentService, ApplicationContextAwa
|
||||
this.nodeService = nodeService;
|
||||
}
|
||||
|
||||
public void setMimetypeService(MimetypeService mimetypeService)
|
||||
{
|
||||
this.mimetypeService = mimetypeService;
|
||||
}
|
||||
|
||||
public void setTransformerRegistry(ContentTransformerRegistry transformerRegistry)
|
||||
{
|
||||
this.transformerRegistry = transformerRegistry;
|
||||
@@ -492,6 +499,12 @@ public class ContentServiceImpl implements ContentService, ApplicationContextAwa
|
||||
|
||||
}
|
||||
|
||||
// supply the writer with a copy of the mimetype service if needed
|
||||
if (writer instanceof AbstractContentWriter)
|
||||
{
|
||||
((AbstractContentWriter)writer).setMimetypeService(mimetypeService);
|
||||
}
|
||||
|
||||
// give back to the client
|
||||
return writer;
|
||||
}
|
||||
|
@@ -0,0 +1,109 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2010 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package org.alfresco.repo.content;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.InputStream;
|
||||
import java.net.URL;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.alfresco.repo.content.filestore.FileContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.MimetypeService;
|
||||
import org.alfresco.util.DataModelTestApplicationContextHelper;
|
||||
import org.apache.poi.util.IOUtils;
|
||||
import org.springframework.context.ApplicationContext;
|
||||
|
||||
/**
|
||||
* Content specific tests for MimeTypeMap
|
||||
*
|
||||
* @see org.alfresco.repo.content.MimetypeMap
|
||||
* @see org.alfresco.repo.content.MimetypeMapTest
|
||||
*/
|
||||
public class MimetypeMapContentTest extends TestCase
|
||||
{
|
||||
private static ApplicationContext ctx = DataModelTestApplicationContextHelper.getApplicationContext();
|
||||
|
||||
private MimetypeService mimetypeService;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception
|
||||
{
|
||||
mimetypeService = (MimetypeService)ctx.getBean("mimetypeService");
|
||||
}
|
||||
|
||||
public void testGuessMimetypeForFile() throws Exception
|
||||
{
|
||||
// Correct ones
|
||||
assertEquals(
|
||||
"application/msword",
|
||||
mimetypeService.guessMimetype("something.doc", openQuickTestFile("quick.doc"))
|
||||
);
|
||||
assertEquals(
|
||||
"application/msword",
|
||||
mimetypeService.guessMimetype("SOMETHING.DOC", openQuickTestFile("quick.doc"))
|
||||
);
|
||||
|
||||
// Incorrect ones, Tika spots the mistake
|
||||
assertEquals(
|
||||
"application/msword",
|
||||
mimetypeService.guessMimetype("something.pdf", openQuickTestFile("quick.doc"))
|
||||
);
|
||||
assertEquals(
|
||||
"application/pdf",
|
||||
mimetypeService.guessMimetype("something.doc", openQuickTestFile("quick.pdf"))
|
||||
);
|
||||
|
||||
// Ones where we use a different mimetype to the canonical one
|
||||
assertEquals(
|
||||
"image/bmp", // Officially image/x-ms-bmp
|
||||
mimetypeService.guessMimetype("image.bmp", openQuickTestFile("quick.bmp"))
|
||||
);
|
||||
|
||||
|
||||
// Where the file is corrupted
|
||||
File tmp = File.createTempFile("alfresco", ".tmp");
|
||||
ContentReader reader = openQuickTestFile("quick.doc");
|
||||
InputStream inp = reader.getContentInputStream();
|
||||
byte[] trunc = new byte[512+256];
|
||||
IOUtils.readFully(inp, trunc);
|
||||
inp.close();
|
||||
FileOutputStream out = new FileOutputStream(tmp);
|
||||
out.write(trunc);
|
||||
out.close();
|
||||
ContentReader truncReader = new FileContentReader(tmp);
|
||||
|
||||
// Because the file is truncated, Tika won't be able to process the contents
|
||||
// of the OLE2 structure
|
||||
// So, it'll fall back to just OLE2, but it won't fail
|
||||
assertEquals(
|
||||
"application/x-tika-msoffice",
|
||||
mimetypeService.guessMimetype("something.doc", truncReader)
|
||||
);
|
||||
}
|
||||
|
||||
private ContentReader openQuickTestFile(String filename)
|
||||
{
|
||||
URL url = getClass().getClassLoader().getResource("quick/" + filename);
|
||||
File file = new File(url.getFile());
|
||||
return new FileContentReader(file);
|
||||
}
|
||||
}
|
@@ -45,6 +45,7 @@ import org.apache.commons.logging.LogFactory;
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
public class FileContentReader extends AbstractContentReader
|
||||
implements org.alfresco.service.cmr.repository.FileContentReader
|
||||
{
|
||||
/**
|
||||
* message key for missing content. Parameters are
|
||||
@@ -147,6 +148,9 @@ public class FileContentReader extends AbstractContentReader
|
||||
return file;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Whether the file exists or not
|
||||
*/
|
||||
public boolean exists()
|
||||
{
|
||||
return file.exists();
|
||||
|
@@ -23,6 +23,7 @@ import java.util.ArrayList;
|
||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
@@ -50,22 +51,11 @@ public class ArchiveContentTransformer extends TikaPoweredContentTransformer
|
||||
private static Log logger = LogFactory.getLog(ArchiveContentTransformer.class);
|
||||
|
||||
private boolean includeContents = false;
|
||||
public void setIncludeContents(String includeContents)
|
||||
{
|
||||
// Spring really ought to be able to handle
|
||||
// setting a boolean that might still be
|
||||
// ${foo} (i.e. not overridden in a property).
|
||||
// As we can't do that with spring, we do it...
|
||||
this.includeContents = false;
|
||||
if(includeContents != null && includeContents.length() > 0)
|
||||
{
|
||||
this.includeContents = TransformationOptions.relaxedBooleanTypeConverter.convert(includeContents).booleanValue();
|
||||
}
|
||||
}
|
||||
private TikaConfig tikaConfig;
|
||||
|
||||
/**
|
||||
* We support all the archive mimetypes that the Tika
|
||||
* office parser can handle
|
||||
* package parser can handle
|
||||
*/
|
||||
public static ArrayList<String> SUPPORTED_MIMETYPES;
|
||||
static {
|
||||
@@ -81,6 +71,29 @@ public class ArchiveContentTransformer extends TikaPoweredContentTransformer
|
||||
super(SUPPORTED_MIMETYPES);
|
||||
}
|
||||
|
||||
/**
|
||||
* Injects the TikaConfig to use
|
||||
*
|
||||
* @param tikaConfig The Tika Config to use
|
||||
*/
|
||||
public void setTikaConfig(TikaConfig tikaConfig)
|
||||
{
|
||||
this.tikaConfig = tikaConfig;
|
||||
}
|
||||
|
||||
public void setIncludeContents(String includeContents)
|
||||
{
|
||||
// Spring really ought to be able to handle
|
||||
// setting a boolean that might still be
|
||||
// ${foo} (i.e. not overridden in a property).
|
||||
// As we can't do that with spring, we do it...
|
||||
this.includeContents = false;
|
||||
if(includeContents != null && includeContents.length() > 0)
|
||||
{
|
||||
this.includeContents = TransformationOptions.relaxedBooleanTypeConverter.convert(includeContents).booleanValue();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Parser getParser() {
|
||||
return new PackageParser();
|
||||
@@ -96,9 +109,15 @@ public class ArchiveContentTransformer extends TikaPoweredContentTransformer
|
||||
{
|
||||
recurse = options.getIncludeEmbedded();
|
||||
}
|
||||
|
||||
if(recurse)
|
||||
{
|
||||
context.set(Parser.class, new AutoDetectParser());
|
||||
// Use an auto detect parser to handle the contents
|
||||
if(tikaConfig == null)
|
||||
{
|
||||
tikaConfig = TikaConfig.getDefaultConfig();
|
||||
}
|
||||
context.set(Parser.class, new AutoDetectParser(tikaConfig));
|
||||
}
|
||||
|
||||
return context;
|
||||
|
@@ -18,94 +18,27 @@
|
||||
*/
|
||||
package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.hwpf.OldWordFileFormatException;
|
||||
import org.apache.poi.hwpf.extractor.Word6Extractor;
|
||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||
|
||||
/**
|
||||
* This badly named transformer turns Microsoft Word documents
|
||||
* (Word 6, 95, 97, 2000, 2003) into plain text.
|
||||
*
|
||||
* Doesn't currently use {@link http://tika.apache.org/ Apache Tika} to
|
||||
* do this, pending TIKA-408. When Apache POI 3.7 beta 2 has been
|
||||
* released, we can switch to Tika and then handle Word 6,
|
||||
* Word 95, Word 97, 2000, 2003, 2007 and 2010 formats.
|
||||
*
|
||||
* TODO Switch to Tika in November 2010 once 3.4 is out
|
||||
* (Word 6, 95, 97, 2000, 2003) into plain text, using Apache Tika.
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public class TextMiningContentTransformer extends AbstractContentTransformer2
|
||||
public class TextMiningContentTransformer extends TikaPoweredContentTransformer
|
||||
{
|
||||
public TextMiningContentTransformer()
|
||||
{
|
||||
}
|
||||
|
||||
/**
|
||||
* Currently the only transformation performed is that of text extraction from Word documents.
|
||||
*/
|
||||
public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options)
|
||||
{
|
||||
if (!MimetypeMap.MIMETYPE_WORD.equals(sourceMimetype) ||
|
||||
!MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
|
||||
{
|
||||
// only support DOC -> Text
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
return true;
|
||||
}
|
||||
{
|
||||
super(new String[] {
|
||||
MimetypeMap.MIMETYPE_WORD
|
||||
});
|
||||
}
|
||||
|
||||
public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options)
|
||||
throws Exception
|
||||
{
|
||||
POIOLE2TextExtractor extractor = null;
|
||||
InputStream is = null;
|
||||
String text = null;
|
||||
try
|
||||
{
|
||||
is = reader.getContentInputStream();
|
||||
POIFSFileSystem fs = new POIFSFileSystem(is);
|
||||
try {
|
||||
extractor = new WordExtractor(fs);
|
||||
} catch(OldWordFileFormatException e) {
|
||||
extractor = new Word6Extractor(fs);
|
||||
}
|
||||
text = extractor.getText();
|
||||
}
|
||||
catch (IOException e)
|
||||
{
|
||||
// check if this is an error caused by the fact that the .doc is in fact
|
||||
// one of Word's temp non-documents
|
||||
if (e.getMessage().contains("Unable to read entire header"))
|
||||
{
|
||||
// just assign an empty string
|
||||
text = "";
|
||||
}
|
||||
else
|
||||
{
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (is != null)
|
||||
{
|
||||
is.close();
|
||||
}
|
||||
}
|
||||
// dump the text out. This will close the writer automatically.
|
||||
writer.putContent(text);
|
||||
@Override
|
||||
protected Parser getParser() {
|
||||
return new OfficeParser();
|
||||
}
|
||||
}
|
||||
|
@@ -20,6 +20,7 @@ package org.alfresco.repo.content.transform;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.Parser;
|
||||
@@ -37,6 +38,9 @@ import org.apache.tika.parser.Parser;
|
||||
*/
|
||||
public class TikaAutoContentTransformer extends TikaPoweredContentTransformer
|
||||
{
|
||||
private static AutoDetectParser parser;
|
||||
private static TikaConfig config;
|
||||
|
||||
/**
|
||||
* We support all the mimetypes that the Tika
|
||||
* auto-detect parser can handle, except for
|
||||
@@ -44,10 +48,13 @@ public class TikaAutoContentTransformer extends TikaPoweredContentTransformer
|
||||
* make much sense
|
||||
*/
|
||||
public static ArrayList<String> SUPPORTED_MIMETYPES;
|
||||
static {
|
||||
private static ArrayList<String> buildMimeTypes(TikaConfig tikaConfig)
|
||||
{
|
||||
config = tikaConfig;
|
||||
parser = new AutoDetectParser(config);
|
||||
|
||||
SUPPORTED_MIMETYPES = new ArrayList<String>();
|
||||
AutoDetectParser p = new AutoDetectParser();
|
||||
for(MediaType mt : p.getParsers().keySet()) {
|
||||
for(MediaType mt : parser.getParsers().keySet()) {
|
||||
if(mt.toString().startsWith("application/vnd.oasis.opendocument.formula")) {
|
||||
// TODO Tika support for quick.odf, mimetype=application/vnd.oasis.opendocument.formula
|
||||
// TODO Tika support for quick.otf, mimetype=application/vnd.oasis.opendocument.formula-template
|
||||
@@ -85,11 +92,12 @@ public class TikaAutoContentTransformer extends TikaPoweredContentTransformer
|
||||
SUPPORTED_MIMETYPES.add( mt.toString() );
|
||||
}
|
||||
}
|
||||
return SUPPORTED_MIMETYPES;
|
||||
}
|
||||
|
||||
public TikaAutoContentTransformer()
|
||||
public TikaAutoContentTransformer(TikaConfig tikaConfig)
|
||||
{
|
||||
super(SUPPORTED_MIMETYPES);
|
||||
super( buildMimeTypes(tikaConfig) );
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -100,6 +108,6 @@ public class TikaAutoContentTransformer extends TikaPoweredContentTransformer
|
||||
*/
|
||||
protected Parser getParser()
|
||||
{
|
||||
return new AutoDetectParser();
|
||||
return parser;
|
||||
}
|
||||
}
|
||||
|
@@ -20,6 +20,7 @@ package org.alfresco.repo.content.transform;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.TransformationOptions;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
|
||||
/**
|
||||
* Most of the work for testing the Tika Auto-Detect transformer
|
||||
@@ -38,7 +39,8 @@ public class TikaAutoContentTransformerTest extends TikaPoweredContentTransforme
|
||||
{
|
||||
super.setUp();
|
||||
|
||||
transformer = new TikaAutoContentTransformer();
|
||||
TikaConfig config = (TikaConfig)ctx.getBean("tikaConfig");
|
||||
transformer = new TikaAutoContentTransformer( config );
|
||||
}
|
||||
|
||||
/**
|
||||
|
@@ -79,18 +79,10 @@ public class TikaPoweredContainerExtractor
|
||||
private NodeService nodeService;
|
||||
private ContentService contentService;
|
||||
|
||||
private TikaConfig config;
|
||||
private AutoDetectParser parser;
|
||||
private Detector detector;
|
||||
|
||||
public TikaPoweredContainerExtractor()
|
||||
{
|
||||
TikaConfig config = TikaConfig.getDefaultConfig();
|
||||
detector = new ContainerAwareDetector(
|
||||
config.getMimeRepository()
|
||||
);
|
||||
parser = new AutoDetectParser(detector);
|
||||
}
|
||||
|
||||
/**
|
||||
* Injects the nodeService bean.
|
||||
*
|
||||
@@ -110,6 +102,22 @@ public class TikaPoweredContainerExtractor
|
||||
{
|
||||
this.contentService = contentService;
|
||||
}
|
||||
|
||||
/**
|
||||
* Injects the TikaConfig to use
|
||||
*
|
||||
* @param tikaConfig The Tika Config to use
|
||||
*/
|
||||
public void setTikaConfig(TikaConfig tikaConfig)
|
||||
{
|
||||
this.config = tikaConfig;
|
||||
|
||||
// Setup the detector and parser
|
||||
detector = new ContainerAwareDetector(
|
||||
config.getMimeRepository()
|
||||
);
|
||||
parser = new AutoDetectParser(detector);
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts out all the entries from the container
|
||||
@@ -277,6 +285,9 @@ public class TikaPoweredContainerExtractor
|
||||
<property name="contentService">
|
||||
<ref bean="ContentService" />
|
||||
</property>
|
||||
<property name="tikaConfig">
|
||||
<bean class="org.apache.tika.config.TikaConfig" factory-method="getDefaultConfig" />
|
||||
</property>
|
||||
</bean>
|
||||
<bean id="extractEmbeddedResources" class="org.alfresco.repo.content.transform.TikaPoweredContainerExtractor$ExtractorActionExecutor" parent="action-executer">
|
||||
<property name="tikaPoweredContainerExtractor">
|
||||
|
Reference in New Issue
Block a user