Merged DEV/SWIFT to HEAD (FTP tests, Tika and POI)

26059: ALF-5900 - IMAP creates winmail.dat in attachment folder (Add support for Microsoft Transport Neutral Encapsulation Format.)
          - Added attachment extraction for TNEF documents - goodbye, winmail.dat!
   26063: Javadoc for IMAP.
   26088: ALF-7408 - addition of commons-net as the FTP client library.
          First end-to-end FTP test. Just a simple connection test for now; more detailed tests will follow (a minimal connection sketch appears after this revision list).
   26176: ALF-7408 - FTP tests + disabled failing test case for ALF-7618
   26180: ALF-7618 - correction of unit test error.
   26188: ALF-7618 - added a test of paths
   26229: Added back simple '~.*' pattern
   26288: ALF-7676 - Test to stress different user rights.
          - FTPServerTest.testTwoUserUpdate added for the FTP server.
   26304: Corrected the spelling of a name in a private class.
   26408: Adding minimal package-info files.
   26416: ALF-5082 / ALF-2183 / ALF-4448 - When guessing the mimetype for a file, add the option to supply a ContentReader to enhance the accuracy.
          Enable this for a few key places that do mimetype guessing,
          which should avoid issues for files with the wrong extension (either renamed accidentally, or saved as .TMP); see the guessing sketch after this revision list.
   26433: Re-order the mimetype guess step to ensure that the Content Reader is always valid
   26440: Added another test for Word 2003 'Save As'.
   26441: Test resource for ContentDiskDriver
   26446: ALF-5082 - Back out a FileFolderService change to mimetype guessing, which had broken things, pending a better way to do it with ContentWriter
   26490: Small change to ContentDiskDriverTest.fileExists. A leaky transaction was causing problems in the automated build.
   26497: ContentDiskDriver - commented out two of the problematic leaky transaction tests.
   26503: Add new interface methods + documentation for asking a ContentWriter to guess the mimetype and encoding for you.
          (Code will be migrated from places that currently do this themselves later)
   26504: Add an extension interface in the DataModel project for some of the extra ContentReader methods that FileContentReader provides
   26505: When ContentWriter.putContent(String) is called with no encoding specified, record what the system default encoding was that was used.
          (Prevents issues if the system default is ever changed)
   26509: When calling Tika to do file detection, if we have a file-based reader, give Tika the File rather than an InputStream
   26522: More debug logging while debugging ALF-5260
   26546: Have one copy of the TikaConfig in Spring, rather than several places fetching their own copy of the default one (either explicitly or implicitly).
   26548: Add another mimetype check - ensures that truncated/corrupt container files which can't be fully processed can still get the container type without failure
   26549: Implement the mimetype and encoding guessers on ContentWriter (either immediately or as a listener, as required), and update FileFolderService to make use of this (+ test this)
   26553: Replace explicit mimetype and encoding guess calls with ContentWriter requests to have the work done
   26554: Replace explicit mimetype and encoding guess calls with ContentWriter requests to have the work done
   26579: Switch the transformer to use Tika
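
The FTP work above (r26088-r26288) boils down to driving the FTP server end to end with the commons-net client. Below is a minimal connection sketch, roughly the starting point of FTPServerTest; the host, port and credentials are placeholders, not the values the tests actually use.

import java.io.IOException;

import org.apache.commons.net.ftp.FTPClient;
import org.apache.commons.net.ftp.FTPReply;

/**
 * Illustrative only: connect, log in and disconnect with commons-net.
 * Host, port and credentials are placeholders.
 */
public class FtpConnectionSketch
{
    public static void main(String[] args) throws IOException
    {
        FTPClient ftp = new FTPClient();
        try
        {
            ftp.connect("localhost", 21);
            if (!FTPReply.isPositiveCompletion(ftp.getReplyCode()))
            {
                throw new IOException("FTP server refused the connection");
            }
            if (!ftp.login("admin", "admin"))
            {
                throw new IOException("FTP login failed");
            }
            ftp.logout();
        }
        finally
        {
            if (ftp.isConnected())
            {
                ftp.disconnect();
            }
        }
    }
}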
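
The ALF-5082 change (r26416) lets callers pass a ContentReader alongside the filename when guessing a mimetype, so the bytes can be sniffed when the extension is wrong. A rough sketch of the call, assuming an injected MimetypeService and a hypothetical mis-named file; MimetypeMapContentTest in the diffs below exercises the same overload against the quick test files.

import java.io.File;

import org.alfresco.repo.content.filestore.FileContentReader;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.MimetypeService;

/**
 * Illustrative only: guess the mimetype of a Word document that was
 * accidentally renamed to .tmp. The path is a placeholder.
 */
public class MimetypeGuessSketch
{
    public static String guessMisnamedFile(MimetypeService mimetypeService)
    {
        File file = new File("/tmp/report.tmp");
        ContentReader reader = new FileContentReader(file);

        // With only the filename the guess would follow the bogus extension;
        // passing the reader as well lets Tika inspect the content
        return mimetypeService.guessMimetype("report.tmp", reader);
    }
}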

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@28224 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
Derek Hulley
2011-06-07 07:36:37 +00:00
parent 04aef409a8
commit e118211bd3
50 changed files with 2269 additions and 365 deletions

View File

@@ -29,16 +29,19 @@ import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.repo.content.encoding.ContentCharsetFinder;
import org.alfresco.repo.content.filestore.FileContentWriter;
import org.alfresco.service.cmr.repository.ContentAccessor;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentStreamListener;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.MimetypeService;
import org.alfresco.util.TempFileProvider;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -62,6 +65,8 @@ public abstract class AbstractContentWriter extends AbstractContentAccessor impl
private List<ContentStreamListener> listeners;
private WritableByteChannel channel;
private ContentReader existingContentReader;
private MimetypeService mimetypeService;
private DoGuessingOnCloseListener guessingOnCloseListener;
/**
* @param contentUrl the content URL
@@ -73,6 +78,21 @@ public abstract class AbstractContentWriter extends AbstractContentAccessor impl
this.existingContentReader = existingContentReader;
listeners = new ArrayList<ContentStreamListener>(2);
// We always register our own listener as the first one
// This allows us to perform any guessing (if needed) before
// the normal listeners kick in and eg write things to the DB
guessingOnCloseListener = new DoGuessingOnCloseListener();
listeners.add(guessingOnCloseListener);
}
/**
* Supplies the Mimetype Service to be used when guessing
* encoding and mimetype information.
*/
public void setMimetypeService(MimetypeService mimetypeService)
{
this.mimetypeService = mimetypeService;
}
/**
@@ -454,7 +474,19 @@ public abstract class AbstractContentWriter extends AbstractContentAccessor impl
{
// attempt to use the correct encoding
String encoding = getEncoding();
byte[] bytes = (encoding == null) ? content.getBytes() : content.getBytes(encoding);
byte[] bytes;
if(encoding == null)
{
// Use the system default, and record what that was
bytes = content.getBytes();
setEncoding( System.getProperty("file.encoding") );
}
else
{
// Use the encoding that they specified
bytes = content.getBytes(encoding);
}
// get the stream
OutputStream os = getContentOutputStream();
ByteArrayInputStream is = new ByteArrayInputStream(bytes);
@@ -469,4 +501,108 @@ public abstract class AbstractContentWriter extends AbstractContentAccessor impl
e);
}
}
/**
* When the content has been written, attempt to guess
* the encoding of it.
*
* @see ContentWriter#guessEncoding()
*/
public void guessEncoding()
{
if (mimetypeService == null)
{
logger.warn("MimetypeService not supplied, but required for content guessing");
return;
}
if(isClosed())
{
// Content written, can do it now
doGuessEncoding();
}
else
{
// Content not yet written, wait for the
// data to be written before doing so
guessingOnCloseListener.guessEncoding = true;
}
}
private void doGuessEncoding()
{
ContentCharsetFinder charsetFinder = mimetypeService.getContentCharsetFinder();
ContentReader reader = getReader();
InputStream is = reader.getContentInputStream();
Charset charset = charsetFinder.getCharset(is, getMimetype());
try
{
is.close();
}
catch(IOException e)
{}
setEncoding(charset.name());
}
/**
* When the content has been written, attempt to guess
* the mimetype of it, using the filename and contents.
*
* @see ContentWriter#guessMimetype(String)
*/
public void guessMimetype(String filename)
{
if (mimetypeService == null)
{
logger.warn("MimetypeService not supplied, but required for content guessing");
return;
}
if(isClosed())
{
// Content written, can do it now
doGuessMimetype(filename);
}
else
{
// Content not yet written, wait for the
// data to be written before doing so
guessingOnCloseListener.guessMimetype = true;
guessingOnCloseListener.filename = filename;
}
}
private void doGuessMimetype(String filename)
{
String mimetype = mimetypeService.guessMimetype(
filename, getReader()
);
setMimetype(mimetype);
}
/**
* Our own listener that is always the first on the list,
* which lets us perform guessing operations when the
* content has been written.
*/
private class DoGuessingOnCloseListener implements ContentStreamListener
{
private boolean guessEncoding = false;
private boolean guessMimetype = false;
private String filename = null;
@Override
public void contentStreamClosed() throws ContentIOException
{
if(guessMimetype)
{
doGuessMimetype(filename);
}
if(guessEncoding)
{
doGuessEncoding();
}
}
}
}
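
The guessing calls above are deliberately safe to make before any content is written: if the writer is still open, the work is queued on the internal DoGuessingOnCloseListener and runs when the stream closes. A rough usage sketch, with a hypothetical temp-file destination (in the repository a writer normally comes from ContentService, which now injects the MimetypeService itself, as the ContentServiceImpl change below shows):

import java.io.File;

import org.alfresco.repo.content.filestore.FileContentWriter;
import org.alfresco.service.cmr.repository.MimetypeService;

/**
 * Illustrative only: request mimetype and encoding guessing up front,
 * then write; the guesses run once the underlying stream is closed.
 */
public class WriterGuessingSketch
{
    public static String[] writeAndGuess(MimetypeService mimetypeService, File source)
    {
        FileContentWriter writer = new FileContentWriter(new File("/tmp/content.bin"));
        writer.setMimetypeService(mimetypeService);

        // Queued on the close listener because nothing has been written yet
        writer.guessMimetype(source.getName());
        writer.guessEncoding();

        // Writing and closing the channel triggers the deferred guesses
        writer.putContent(source);

        // A putContent(String) call with no encoding set would additionally
        // record the JVM's file.encoding default, per the change above
        return new String[] { writer.getMimetype(), writer.getEncoding() };
    }
}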

View File

@@ -50,6 +50,7 @@ import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentService;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.MimetypeService;
import org.alfresco.service.cmr.repository.NoTransformerException;
import org.alfresco.service.cmr.repository.NodeRef;
import org.alfresco.service.cmr.repository.NodeService;
@@ -85,6 +86,7 @@ public class ContentServiceImpl implements ContentService, ApplicationContextAwa
private DictionaryService dictionaryService;
private NodeService nodeService;
private AVMService avmService;
private MimetypeService mimetypeService;
private RetryingTransactionHelper transactionHelper;
private ApplicationContext applicationContext;
@@ -127,6 +129,11 @@ public class ContentServiceImpl implements ContentService, ApplicationContextAwa
this.nodeService = nodeService;
}
public void setMimetypeService(MimetypeService mimetypeService)
{
this.mimetypeService = mimetypeService;
}
public void setTransformerRegistry(ContentTransformerRegistry transformerRegistry)
{
this.transformerRegistry = transformerRegistry;
@@ -492,6 +499,12 @@ public class ContentServiceImpl implements ContentService, ApplicationContextAwa
}
// supply the writer with a copy of the mimetype service if needed
if (writer instanceof AbstractContentWriter)
{
((AbstractContentWriter)writer).setMimetypeService(mimetypeService);
}
// give back to the client
return writer;
}
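
With the MimetypeService handed to every AbstractContentWriter that ContentServiceImpl creates, API clients can ask for guessing without any extra setup. A minimal sketch of the resulting call pattern; the NodeRef and upload file are placeholders:

import java.io.File;

import org.alfresco.model.ContentModel;
import org.alfresco.service.cmr.repository.ContentService;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.NodeRef;

/**
 * Illustrative only: write an uploaded file and let the writer guess
 * its mimetype and encoding.
 */
public class NodeWriteSketch
{
    public static void writeUpload(ContentService contentService, NodeRef nodeRef, File upload)
    {
        ContentWriter writer = contentService.getWriter(nodeRef, ContentModel.PROP_CONTENT, true);

        // The MimetypeService injection is an implementation detail of the
        // service; the guess calls themselves are on the ContentWriter
        // interface (r26503), so callers can use them directly
        writer.guessMimetype(upload.getName());
        writer.guessEncoding();
        writer.putContent(upload);
    }
}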

View File

@@ -0,0 +1,109 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import junit.framework.TestCase;
import org.alfresco.repo.content.filestore.FileContentReader;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.MimetypeService;
import org.alfresco.util.DataModelTestApplicationContextHelper;
import org.apache.poi.util.IOUtils;
import org.springframework.context.ApplicationContext;
/**
* Content specific tests for MimeTypeMap
*
* @see org.alfresco.repo.content.MimetypeMap
* @see org.alfresco.repo.content.MimetypeMapTest
*/
public class MimetypeMapContentTest extends TestCase
{
private static ApplicationContext ctx = DataModelTestApplicationContextHelper.getApplicationContext();
private MimetypeService mimetypeService;
@Override
public void setUp() throws Exception
{
mimetypeService = (MimetypeService)ctx.getBean("mimetypeService");
}
public void testGuessMimetypeForFile() throws Exception
{
// Correct ones
assertEquals(
"application/msword",
mimetypeService.guessMimetype("something.doc", openQuickTestFile("quick.doc"))
);
assertEquals(
"application/msword",
mimetypeService.guessMimetype("SOMETHING.DOC", openQuickTestFile("quick.doc"))
);
// Incorrect ones, Tika spots the mistake
assertEquals(
"application/msword",
mimetypeService.guessMimetype("something.pdf", openQuickTestFile("quick.doc"))
);
assertEquals(
"application/pdf",
mimetypeService.guessMimetype("something.doc", openQuickTestFile("quick.pdf"))
);
// Ones where we use a different mimetype to the canonical one
assertEquals(
"image/bmp", // Officially image/x-ms-bmp
mimetypeService.guessMimetype("image.bmp", openQuickTestFile("quick.bmp"))
);
// Where the file is corrupted
File tmp = File.createTempFile("alfresco", ".tmp");
ContentReader reader = openQuickTestFile("quick.doc");
InputStream inp = reader.getContentInputStream();
byte[] trunc = new byte[512+256];
IOUtils.readFully(inp, trunc);
inp.close();
FileOutputStream out = new FileOutputStream(tmp);
out.write(trunc);
out.close();
ContentReader truncReader = new FileContentReader(tmp);
// Because the file is truncated, Tika won't be able to process the contents
// of the OLE2 structure
// So, it'll fall back to just OLE2, but it won't fail
assertEquals(
"application/x-tika-msoffice",
mimetypeService.guessMimetype("something.doc", truncReader)
);
}
private ContentReader openQuickTestFile(String filename)
{
URL url = getClass().getClassLoader().getResource("quick/" + filename);
File file = new File(url.getFile());
return new FileContentReader(file);
}
}

View File

@@ -45,6 +45,7 @@ import org.apache.commons.logging.LogFactory;
* @author Derek Hulley
*/
public class FileContentReader extends AbstractContentReader
implements org.alfresco.service.cmr.repository.FileContentReader
{
/**
* message key for missing content. Parameters are
@@ -147,6 +148,9 @@ public class FileContentReader extends AbstractContentReader
return file;
}
/**
* @return Whether the file exists or not
*/
public boolean exists()
{
return file.exists();

View File

@@ -23,6 +23,7 @@ import java.util.ArrayList;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
@@ -50,22 +51,11 @@ public class ArchiveContentTransformer extends TikaPoweredContentTransformer
private static Log logger = LogFactory.getLog(ArchiveContentTransformer.class);
private boolean includeContents = false;
public void setIncludeContents(String includeContents)
{
// Spring really ought to be able to handle
// setting a boolean that might still be
// ${foo} (i.e. not overridden in a property).
// As we can't do that with spring, we do it...
this.includeContents = false;
if(includeContents != null && includeContents.length() > 0)
{
this.includeContents = TransformationOptions.relaxedBooleanTypeConverter.convert(includeContents).booleanValue();
}
}
private TikaConfig tikaConfig;
/**
* We support all the archive mimetypes that the Tika
* office parser can handle
* package parser can handle
*/
public static ArrayList<String> SUPPORTED_MIMETYPES;
static {
@@ -81,6 +71,29 @@ public class ArchiveContentTransformer extends TikaPoweredContentTransformer
super(SUPPORTED_MIMETYPES);
}
/**
* Injects the TikaConfig to use
*
* @param tikaConfig The Tika Config to use
*/
public void setTikaConfig(TikaConfig tikaConfig)
{
this.tikaConfig = tikaConfig;
}
public void setIncludeContents(String includeContents)
{
// Spring really ought to be able to handle
// setting a boolean that might still be
// ${foo} (i.e. not overridden in a property).
// As we can't do that with spring, we do it...
this.includeContents = false;
if(includeContents != null && includeContents.length() > 0)
{
this.includeContents = TransformationOptions.relaxedBooleanTypeConverter.convert(includeContents).booleanValue();
}
}
@Override
protected Parser getParser() {
return new PackageParser();
@@ -96,9 +109,15 @@ public class ArchiveContentTransformer extends TikaPoweredContentTransformer
{
recurse = options.getIncludeEmbedded();
}
if(recurse)
{
context.set(Parser.class, new AutoDetectParser());
// Use an auto detect parser to handle the contents
if(tikaConfig == null)
{
tikaConfig = TikaConfig.getDefaultConfig();
}
context.set(Parser.class, new AutoDetectParser(tikaConfig));
}
return context;

View File

@@ -18,94 +18,27 @@
*/
package org.alfresco.repo.content.transform;
import java.io.IOException;
import java.io.InputStream;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
/**
* This badly named transformer turns Microsoft Word documents
* (Word 6, 95, 97, 2000, 2003) into plain text.
*
* Doesn't currently use {@link http://tika.apache.org/ Apache Tika} to
* do this, pending TIKA-408. When Apache POI 3.7 beta 2 has been
* released, we can switch to Tika and then handle Word 6,
* Word 95, Word 97, 2000, 2003, 2007 and 2010 formats.
*
* TODO Switch to Tika in November 2010 once 3.4 is out
* (Word 6, 95, 97, 2000, 2003) into plain text, using Apache Tika.
*
* @author Nick Burch
*/
public class TextMiningContentTransformer extends AbstractContentTransformer2
public class TextMiningContentTransformer extends TikaPoweredContentTransformer
{
public TextMiningContentTransformer()
{
}
/**
* Currently the only transformation performed is that of text extraction from Word documents.
*/
public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options)
{
if (!MimetypeMap.MIMETYPE_WORD.equals(sourceMimetype) ||
!MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
{
// only support DOC -> Text
return false;
}
else
{
return true;
}
{
super(new String[] {
MimetypeMap.MIMETYPE_WORD
});
}
public void transformInternal(ContentReader reader, ContentWriter writer, TransformationOptions options)
throws Exception
{
POIOLE2TextExtractor extractor = null;
InputStream is = null;
String text = null;
try
{
is = reader.getContentInputStream();
POIFSFileSystem fs = new POIFSFileSystem(is);
try {
extractor = new WordExtractor(fs);
} catch(OldWordFileFormatException e) {
extractor = new Word6Extractor(fs);
}
text = extractor.getText();
}
catch (IOException e)
{
// check if this is an error caused by the fact that the .doc is in fact
// one of Word's temp non-documents
if (e.getMessage().contains("Unable to read entire header"))
{
// just assign an empty string
text = "";
}
else
{
throw e;
}
}
finally
{
if (is != null)
{
is.close();
}
}
// dump the text out. This will close the writer automatically.
writer.putContent(text);
@Override
protected Parser getParser() {
return new OfficeParser();
}
}

View File

@@ -20,6 +20,7 @@ package org.alfresco.repo.content.transform;
import java.util.ArrayList;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
@@ -37,6 +38,9 @@ import org.apache.tika.parser.Parser;
*/
public class TikaAutoContentTransformer extends TikaPoweredContentTransformer
{
private static AutoDetectParser parser;
private static TikaConfig config;
/**
* We support all the mimetypes that the Tika
* auto-detect parser can handle, except for
@@ -44,10 +48,13 @@ public class TikaAutoContentTransformer extends TikaPoweredContentTransformer
* make much sense
*/
public static ArrayList<String> SUPPORTED_MIMETYPES;
static {
private static ArrayList<String> buildMimeTypes(TikaConfig tikaConfig)
{
config = tikaConfig;
parser = new AutoDetectParser(config);
SUPPORTED_MIMETYPES = new ArrayList<String>();
AutoDetectParser p = new AutoDetectParser();
for(MediaType mt : p.getParsers().keySet()) {
for(MediaType mt : parser.getParsers().keySet()) {
if(mt.toString().startsWith("application/vnd.oasis.opendocument.formula")) {
// TODO Tika support for quick.odf, mimetype=application/vnd.oasis.opendocument.formula
// TODO Tika support for quick.otf, mimetype=application/vnd.oasis.opendocument.formula-template
@@ -85,11 +92,12 @@ public class TikaAutoContentTransformer extends TikaPoweredContentTransformer
SUPPORTED_MIMETYPES.add( mt.toString() );
}
}
return SUPPORTED_MIMETYPES;
}
public TikaAutoContentTransformer()
public TikaAutoContentTransformer(TikaConfig tikaConfig)
{
super(SUPPORTED_MIMETYPES);
super( buildMimeTypes(tikaConfig) );
}
/**
@@ -100,6 +108,6 @@ public class TikaAutoContentTransformer extends TikaPoweredContentTransformer
*/
protected Parser getParser()
{
return new AutoDetectParser();
return parser;
}
}
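
Both transformers above now take their TikaConfig from outside instead of building a default one internally, which is the point of r26546: one config instance, created once and shared by every Tika-backed bean (the tikaConfig Spring property at the end of this commit does the same job declaratively). A direct-construction sketch for illustration only; in the repository this wiring is done by Spring:

import org.alfresco.repo.content.transform.ArchiveContentTransformer;
import org.alfresco.repo.content.transform.TikaAutoContentTransformer;
import org.apache.tika.config.TikaConfig;

/**
 * Illustrative only: build one TikaConfig and hand the same instance
 * to each Tika-powered transformer.
 */
public class SharedTikaConfigSketch
{
    public static TikaAutoContentTransformer wire()
    {
        TikaConfig sharedConfig = TikaConfig.getDefaultConfig();

        ArchiveContentTransformer archive = new ArchiveContentTransformer();
        archive.setTikaConfig(sharedConfig);

        // The auto-detect transformer takes the same shared config via its constructor
        return new TikaAutoContentTransformer(sharedConfig);
    }
}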

View File

@@ -20,6 +20,7 @@ package org.alfresco.repo.content.transform;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.tika.config.TikaConfig;
/**
* Most of the work for testing the Tika Auto-Detect transformer
@@ -38,7 +39,8 @@ public class TikaAutoContentTransformerTest extends TikaPoweredContentTransforme
{
super.setUp();
transformer = new TikaAutoContentTransformer();
TikaConfig config = (TikaConfig)ctx.getBean("tikaConfig");
transformer = new TikaAutoContentTransformer( config );
}
/**

View File

@@ -79,18 +79,10 @@ public class TikaPoweredContainerExtractor
private NodeService nodeService;
private ContentService contentService;
private TikaConfig config;
private AutoDetectParser parser;
private Detector detector;
public TikaPoweredContainerExtractor()
{
TikaConfig config = TikaConfig.getDefaultConfig();
detector = new ContainerAwareDetector(
config.getMimeRepository()
);
parser = new AutoDetectParser(detector);
}
/**
* Injects the nodeService bean.
*
@@ -110,6 +102,22 @@ public class TikaPoweredContainerExtractor
{
this.contentService = contentService;
}
/**
* Injects the TikaConfig to use
*
* @param tikaConfig The Tika Config to use
*/
public void setTikaConfig(TikaConfig tikaConfig)
{
this.config = tikaConfig;
// Setup the detector and parser
detector = new ContainerAwareDetector(
config.getMimeRepository()
);
parser = new AutoDetectParser(detector);
}
/**
* Extracts out all the entries from the container
@@ -277,6 +285,9 @@ public class TikaPoweredContainerExtractor
<property name="contentService">
<ref bean="ContentService" />
</property>
<property name="tikaConfig">
<bean class="org.apache.tika.config.TikaConfig" factory-method="getDefaultConfig" />
</property>
</bean>
<bean id="extractEmbeddedResources" class="org.alfresco.repo.content.transform.TikaPoweredContainerExtractor$ExtractorActionExecutor" parent="action-executer">
<property name="tikaPoweredContainerExtractor">