Add a rendition option to only output the body contents, rather than full html, when doing .docx -> .html transformations

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@22557 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-09-15 14:40:14 +00:00
parent 8ec73092c8
commit c5ace69b66
2 changed files with 87 additions and 63 deletions

View File

@@ -24,6 +24,7 @@ import java.io.InputStream;
import java.io.Serializable; import java.io.Serializable;
import java.io.StringWriter; import java.io.StringWriter;
import java.io.Writer; import java.io.Writer;
import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Map; import java.util.Map;
@@ -36,7 +37,10 @@ import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamResult;
import org.alfresco.model.ContentModel; import org.alfresco.model.ContentModel;
import org.alfresco.repo.action.ParameterDefinitionImpl;
import org.alfresco.repo.rendition.RenditionLocation; import org.alfresco.repo.rendition.RenditionLocation;
import org.alfresco.service.cmr.action.ParameterDefinition;
import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
import org.alfresco.service.cmr.dictionary.DictionaryService; import org.alfresco.service.cmr.dictionary.DictionaryService;
import org.alfresco.service.cmr.rendition.RenditionServiceException; import org.alfresco.service.cmr.rendition.RenditionServiceException;
import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.ContentReader;
@@ -46,18 +50,16 @@ import org.alfresco.service.cmr.repository.NodeRef;
import org.alfresco.service.namespace.QName; import org.alfresco.service.namespace.QName;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.ContainerAwareDetector;
import org.apache.tika.exception.TikaException; import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser; import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.xml.sax.Attributes; import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler; import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl; import org.xml.sax.helpers.AttributesImpl;
@@ -79,6 +81,13 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
{ {
private static Log logger = LogFactory.getLog(HTMLRenderingEngine.class); private static Log logger = LogFactory.getLog(HTMLRenderingEngine.class);
/**
* This optional parameter, when set to true, causes only the
* contents of the HTML body to be written out as the rendition.
* By default, the whole of the HTML document is used.
*/
public static final String PARAM_BODY_CONTENTS_ONLY = "bodyContentsOnly";
/* /*
* Action constants * Action constants
*/ */
@@ -95,6 +104,16 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
this.dictionaryService = dictionaryService; this.dictionaryService = dictionaryService;
} }
/**
 * Extends the parameter definitions inherited from the superclass with
 * the {@link #PARAM_BODY_CONTENTS_ONLY} boolean parameter.
 *
 * @return the superclass's parameter definitions, with the body-only
 *         parameter definition appended
 */
@Override
protected Collection<ParameterDefinition> getParameterDefinitions()
{
Collection<ParameterDefinition> definitions = super.getParameterDefinitions();

// Third constructor argument is false; the constant's own documentation
// describes this parameter as optional.
ParameterDefinition bodyOnlyDefinition = new ParameterDefinitionImpl(
PARAM_BODY_CONTENTS_ONLY,
DataTypeDefinition.BOOLEAN,
false,
getParamDisplayLabel(PARAM_BODY_CONTENTS_ONLY));
definitions.add(bodyOnlyDefinition);

return definitions;
}
/* /*
* (non-Javadoc) * (non-Javadoc)
* @see org.alfresco.repo.rendition.executer.AbstractRenderingEngine#render(org.alfresco.repo.rendition.executer.AbstractRenderingEngine.RenderingContext) * @see org.alfresco.repo.rendition.executer.AbstractRenderingEngine#render(org.alfresco.repo.rendition.executer.AbstractRenderingEngine.RenderingContext)
@@ -229,8 +248,9 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
* Builds a Tika-compatible SAX content handler, which will * Builds a Tika-compatible SAX content handler, which will
* be used to generate+capture the XHTML * be used to generate+capture the XHTML
*/ */
private ContentHandler buildContentHandler(Writer output) private ContentHandler buildContentHandler(Writer output, RenderingContext context)
{ {
// Create the main transformer
SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory factory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance(); SAXTransformerFactory.newInstance();
TransformerHandler handler; TransformerHandler handler;
@@ -245,7 +265,19 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
handler.setResult(new StreamResult(output)); handler.setResult(new StreamResult(output));
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
return handler; // Change the image links as they go past
ContentHandler contentHandler = new TikaImageRewritingContentHandler(
handler, getImagesDirectoryName(context)
);
// If required, wrap it to only return the body
boolean bodyOnly = context.getParamWithDefault(PARAM_BODY_CONTENTS_ONLY, false);
if(bodyOnly) {
contentHandler = new BodyContentHandler(contentHandler);
}
// All done
return contentHandler;
} }
/** /**
@@ -257,10 +289,7 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
// Setup things to parse with // Setup things to parse with
StringWriter sw = new StringWriter(); StringWriter sw = new StringWriter();
ContentHandler handler = new TikaImageRewritingContentHandler( ContentHandler handler = buildContentHandler(sw, context);
buildContentHandler(sw),
getImagesDirectoryName(context)
);
// Tell Tika what we're dealing with // Tell Tika what we're dealing with
Metadata metadata = new Metadata(); Metadata metadata = new Metadata();
@@ -290,10 +319,24 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
throw new RenditionServiceException("Tika HTML Conversion Failed", e); throw new RenditionServiceException("Tika HTML Conversion Failed", e);
} }
// Take the generated markup out of the writer as a string
String html = sw.toString();
// If we're doing body-only, remove all the html namespaces
// that will otherwise clutter up the document
boolean bodyOnly = context.getParamWithDefault(PARAM_BODY_CONTENTS_ONLY, false);
if(bodyOnly) {
html = html.replaceAll("<p xmlns=\"http://www.w3.org/1999/xhtml\"","<p");
// In a java.util.regex replacement string a captured group is written $1.
// A backslash there only escapes the following character, so "\\1" would
// emit a literal '1' and rewrite every heading's opening tag to <h1.
html = html.replaceAll("<h(\\d) xmlns=\"http://www.w3.org/1999/xhtml\"","<h$1");
html = html.replaceAll("<div xmlns=\"http://www.w3.org/1999/xhtml\"","<div");
html = html.replaceAll("<table xmlns=\"http://www.w3.org/1999/xhtml\"","<table");
// Drop the carriage-return character references left in the output
html = html.replaceAll("&#13;","");
}
// Save it // Save it
ContentWriter contentWriter = context.makeContentWriter(); ContentWriter contentWriter = context.makeContentWriter();
contentWriter.setMimetype("text/html"); contentWriter.setMimetype("text/html");
contentWriter.putContent( sw.toString() ); contentWriter.putContent( html );
} }
@@ -387,12 +430,11 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
* A content handler that re-writes image src attributes, * A content handler that re-writes image src attributes,
* and passes everything else on to the real one. * and passes everything else on to the real one.
*/ */
private class TikaImageRewritingContentHandler implements ContentHandler { private class TikaImageRewritingContentHandler extends ContentHandlerDecorator {
private ContentHandler handler;
private String imageFolder; private String imageFolder;
private TikaImageRewritingContentHandler(ContentHandler handler, String imageFolder) { private TikaImageRewritingContentHandler(ContentHandler handler, String imageFolder) {
this.handler = handler; super(handler);
this.imageFolder = imageFolder; this.imageFolder = imageFolder;
} }
@@ -419,58 +461,11 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
} }
} }
} }
handler.startElement(uri, localName, qName, attrs); super.startElement(uri, localName, qName, attrs);
} else { } else {
// For any other tag, pass through as-is // For any other tag, pass through as-is
handler.startElement(uri, localName, qName, origAttrs); super.startElement(uri, localName, qName, origAttrs);
} }
} }
@Override
public void characters(char[] ch, int start, int length)
throws SAXException {
handler.characters(ch, start, length);
}
@Override
public void ignorableWhitespace(char[] ch, int start, int length)
throws SAXException {
handler.ignorableWhitespace(ch, start, length);
}
@Override
public void endDocument() throws SAXException {
handler.endDocument();
}
@Override
public void endElement(String uri, String localName, String qName)
throws SAXException {
handler.endElement(uri, localName, qName);
}
@Override
public void endPrefixMapping(String prefix) throws SAXException {
handler.endPrefixMapping(prefix);
}
@Override
public void processingInstruction(String target, String data)
throws SAXException {
handler.processingInstruction(target, data);
}
@Override
public void setDocumentLocator(Locator locator) {
handler.setDocumentLocator(locator);
}
@Override
public void skippedEntity(String name) throws SAXException {
handler.skippedEntity(name);
}
@Override
public void startDocument() throws SAXException {
handler.startDocument();
}
@Override
public void startPrefixMapping(String prefix, String uri)
throws SAXException {
handler.startPrefixMapping(prefix, uri);
}
} }
} }

View File

@@ -194,6 +194,35 @@ public class HTMLRenderingEngineTest extends BaseAlfrescoSpringTest
); );
String html = reader.getContentString(); String html = reader.getContentString();
assertEquals("<?xml", html.substring(0, 5)); assertEquals("<?xml", html.substring(0, 5));
assertTrue("HTML wrong:\n"+html, html.contains("<html"));
assertTrue("HTML wrong:\n"+html, html.contains("<head>"));
assertTrue("HTML wrong:\n"+html, html.contains("<body>"));
assertTrue("HTML wrong:\n"+html, html.contains("<p>The quick brown fox"));
// Now do a body-only one, check that we still got the
// contents, but not the html surround
def.setParameterValue(
HTMLRenderingEngine.PARAM_BODY_CONTENTS_ONLY, Boolean.TRUE
);
rendition = renditionService.render(sourceDoc, def);
assertNotNull(rendition);
htmlNode = rendition.getChildRef();
assertEquals(true, nodeService.exists(htmlNode));
reader = contentService.getReader(
htmlNode, ContentModel.PROP_CONTENT
);
html = reader.getContentString();
assertEquals("<?xml", html.substring(0, 5));
assertFalse("Body wrong:\n"+html, html.contains("<html"));
assertFalse("Body wrong:\n"+html, html.contains("<head>"));
assertFalse("Body wrong:\n"+html, html.contains("<body>"));
assertTrue("HTML wrong:\n"+html, html.contains("<p>The quick brown fox"));
assertTrue("HTML wrong:\n"+html, html.contains("</p>"));
} }
/** /**