();
+ OfficeParser p = new OfficeParser();
+ for(MediaType mt : p.getSupportedTypes(null)) {
+ if(mt.toString().equals(MimetypeMap.MIMETYPE_EXCEL))
+ {
+ // Skip, handled elsewhere
+ continue;
+ }
+ // Tika can probably do some useful text
+ SUPPORTED_MIMETYPES.add( mt.toString() );
+ }
+ }
+
+ public PoiContentTransformer() {
+ super(SUPPORTED_MIMETYPES);
+ }
+
+ @Override
+ protected Parser getParser() {
+ return new OfficeParser();
+ }
+}
diff --git a/source/java/org/alfresco/repo/content/transform/PoiContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/PoiContentTransformerTest.java
new file mode 100644
index 0000000000..28e49c261a
--- /dev/null
+++ b/source/java/org/alfresco/repo/content/transform/PoiContentTransformerTest.java
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2005-2010 Alfresco Software Limited.
+ *
+ * This file is part of Alfresco
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.alfresco.repo.content.transform;
+
+import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.service.cmr.repository.TransformationOptions;
+
+/**
+ * @see org.alfresco.repo.content.transform.PoiContentTransformer
+ *
+ * @author Nick Burch
+ */
+public class PoiContentTransformerTest extends AbstractContentTransformerTest
+{
+ private ContentTransformer transformer;
+
+ @Override
+ public void setUp() throws Exception
+ {
+ super.setUp();
+
+ transformer = new PoiContentTransformer();
+ }
+
+ /**
+ * @return the single transformer created in setUp, whatever mimetypes are requested
+ */
+ protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
+ {
+ return transformer;
+ }
+
+ public void testIsTransformable() throws Exception
+ {
+ assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_WORD, new TransformationOptions()));
+ assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
+ assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
+ assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
+
+ assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_PPT, new TransformationOptions()));
+ assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PPT, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
+ assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PPT, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
+ assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_PPT, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
+
+ assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_OUTLOOK_MSG, new TransformationOptions()));
+ assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OUTLOOK_MSG, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
+ assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OUTLOOK_MSG, MimetypeMap.MIMETYPE_HTML, new TransformationOptions()));
+ assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_OUTLOOK_MSG, MimetypeMap.MIMETYPE_XML, new TransformationOptions()));
+
+ // Excel must be reported as not transformable: PoiContentTransformer skips that mimetype ("handled elsewhere")
+ assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_EXCEL, MimetypeMap.MIMETYPE_TEXT_PLAIN, new TransformationOptions()));
+ }
+}
diff --git a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java
index 5bc453e5c0..b62fb77d3e 100644
--- a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformer.java
@@ -42,8 +42,6 @@ import org.xml.sax.SAXException;
* Xml or Text (space or comma separated)
* Handles all sheets in the file.
*
- * TODO CSV Support
- *
* @author Nick Burch
* @author Derek Hulley
*/
@@ -103,7 +101,6 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
* A wrapper around the normal Tika BodyContentHandler,
* which causes things to be CSV encoded rather than
* tab separated
- * TODO Get rid of the extra tabs that crop up
*/
protected static class CsvContentHandler extends BodyContentHandler {
private static final char[] comma = new char[]{ ',' };
@@ -116,6 +113,16 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
super(output);
}
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length)
+ throws SAXException {
+ if(length == 1 && ch[start] == '\t') {
+ // Ignore a lone tab, as tabs mess up the CSV output (the event's text begins at ch[start], not ch[0])
+ } else {
+ super.ignorableWhitespace(ch, start, length);
+ }
+ }
+
@Override
public void characters(char[] ch, int start, int length)
throws SAXException {
@@ -150,32 +157,28 @@ public class PoiHssfContentTransformer extends TikaPoweredContentTransformer
public void startElement(String uri, String localName, String name,
Attributes atts) throws SAXException {
if(localName.equals("td")) {
- localName = "span";
- name = "span";
-
inCell = true;
if(needsComma) {
super.characters(comma, 0, 1);
needsComma = true;
}
+ } else {
+ super.startElement(uri, localName, name, atts);
}
- super.startElement(uri, localName, name, atts);
}
@Override
public void endElement(String uri, String localName, String name)
throws SAXException {
if(localName.equals("td")) {
- localName = "span";
- name = "span";
-
needsComma = true;
inCell = false;
+ } else {
+ if(localName.equals("tr")) {
+ needsComma = false;
+ }
+ super.endElement(uri, localName, name);
}
- if(localName.equals("tr")) {
- needsComma = false;
- }
- super.endElement(uri, localName, name);
}
}
}
diff --git a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java
index 651c5c8bf3..dcc7d47fb7 100644
--- a/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java
+++ b/source/java/org/alfresco/repo/content/transform/PoiHssfContentTransformerTest.java
@@ -79,14 +79,26 @@ public class PoiHssfContentTransformerTest extends TikaPoweredContentTransformer
ContentReader targetReader = targetWriter.getReader();
String checkContent = targetReader.getContentString();
- System.err.println(checkContent);
+
+ additionalContentCheck(
+ MimetypeMap.MIMETYPE_EXCEL,
+ MimetypeMap.MIMETYPE_TEXT_CSV,
+ checkContent
+ );
}
@Override
protected void additionalContentCheck(String sourceMimetype,
String targetMimetype, String contents) {
if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_CSV)) {
- System.err.println(contents);
+ assertTrue(
+ "Content not properly CSV'd",
+ contents.contains("1,2,2")
+ );
+ assertTrue(
+ "Content not properly CSV'd",
+ contents.contains("\"The\",\"quick\",\"brown\",\"fox\"")
+ );
} else {
super.additionalContentCheck(sourceMimetype, targetMimetype, contents);
}
diff --git a/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformer.java b/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformer.java
index 98af61b069..0872a85d88 100644
--- a/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformer.java
+++ b/source/java/org/alfresco/repo/content/transform/TextMiningContentTransformer.java
@@ -32,6 +32,11 @@ import org.textmining.extraction.word.WordTextExtractorFactory;
* Makes use of the {@link http://www.textmining.org/ TextMining} library to
* perform conversions from MSWord documents to text.
*
+ * Doesn't currently use {@link http://tika.apache.org/ Apache Tika} to
+ * do this, as Tika can't handle Word 6 or Word 95 documents, only
+ * Word 97, 2000, 2003, 2007 and 2010.
+ * Once Tika does support these older formats, we can switch to it.
+ *
* @author Derek Hulley
*/
public class TextMiningContentTransformer extends AbstractContentTransformer2
diff --git a/source/test-resources/quick/quick6.doc b/source/test-resources/quick/quick6.doc
new file mode 100644
index 0000000000..a614a0783f
Binary files /dev/null and b/source/test-resources/quick/quick6.doc differ
diff --git a/source/test-resources/quick/quick95.doc b/source/test-resources/quick/quick95.doc
new file mode 100644
index 0000000000..a614a0783f
Binary files /dev/null and b/source/test-resources/quick/quick95.doc differ