mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-08-07 17:49:17 +00:00
Add a rendition option to only output the body contents, rather than full html, when doing .docx -> .html transformations
git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@22557 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -24,6 +24,7 @@ import java.io.InputStream;
|
|||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
import java.io.Writer;
|
import java.io.Writer;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
@@ -36,7 +37,10 @@ import javax.xml.transform.sax.TransformerHandler;
|
|||||||
import javax.xml.transform.stream.StreamResult;
|
import javax.xml.transform.stream.StreamResult;
|
||||||
|
|
||||||
import org.alfresco.model.ContentModel;
|
import org.alfresco.model.ContentModel;
|
||||||
|
import org.alfresco.repo.action.ParameterDefinitionImpl;
|
||||||
import org.alfresco.repo.rendition.RenditionLocation;
|
import org.alfresco.repo.rendition.RenditionLocation;
|
||||||
|
import org.alfresco.service.cmr.action.ParameterDefinition;
|
||||||
|
import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
|
||||||
import org.alfresco.service.cmr.dictionary.DictionaryService;
|
import org.alfresco.service.cmr.dictionary.DictionaryService;
|
||||||
import org.alfresco.service.cmr.rendition.RenditionServiceException;
|
import org.alfresco.service.cmr.rendition.RenditionServiceException;
|
||||||
import org.alfresco.service.cmr.repository.ContentReader;
|
import org.alfresco.service.cmr.repository.ContentReader;
|
||||||
@@ -46,18 +50,16 @@ import org.alfresco.service.cmr.repository.NodeRef;
|
|||||||
import org.alfresco.service.namespace.QName;
|
import org.alfresco.service.namespace.QName;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.tika.config.TikaConfig;
|
|
||||||
import org.apache.tika.detect.ContainerAwareDetector;
|
|
||||||
import org.apache.tika.exception.TikaException;
|
import org.apache.tika.exception.TikaException;
|
||||||
import org.apache.tika.io.TikaInputStream;
|
|
||||||
import org.apache.tika.metadata.Metadata;
|
import org.apache.tika.metadata.Metadata;
|
||||||
import org.apache.tika.mime.MediaType;
|
import org.apache.tika.mime.MediaType;
|
||||||
import org.apache.tika.parser.AutoDetectParser;
|
import org.apache.tika.parser.AutoDetectParser;
|
||||||
import org.apache.tika.parser.ParseContext;
|
import org.apache.tika.parser.ParseContext;
|
||||||
import org.apache.tika.parser.Parser;
|
import org.apache.tika.parser.Parser;
|
||||||
|
import org.apache.tika.sax.BodyContentHandler;
|
||||||
|
import org.apache.tika.sax.ContentHandlerDecorator;
|
||||||
import org.xml.sax.Attributes;
|
import org.xml.sax.Attributes;
|
||||||
import org.xml.sax.ContentHandler;
|
import org.xml.sax.ContentHandler;
|
||||||
import org.xml.sax.Locator;
|
|
||||||
import org.xml.sax.SAXException;
|
import org.xml.sax.SAXException;
|
||||||
import org.xml.sax.helpers.AttributesImpl;
|
import org.xml.sax.helpers.AttributesImpl;
|
||||||
|
|
||||||
@@ -79,6 +81,13 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
|
|||||||
{
|
{
|
||||||
private static Log logger = LogFactory.getLog(HTMLRenderingEngine.class);
|
private static Log logger = LogFactory.getLog(HTMLRenderingEngine.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This optional parameter, when set to true, causes only the
|
||||||
|
* contents of the HTML body to be written out as the rendition.
|
||||||
|
* By default, the whole of the HTML document is used.
|
||||||
|
*/
|
||||||
|
public static final String PARAM_BODY_CONTENTS_ONLY = "bodyContentsOnly";
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Action constants
|
* Action constants
|
||||||
*/
|
*/
|
||||||
@@ -95,6 +104,16 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
|
|||||||
this.dictionaryService = dictionaryService;
|
this.dictionaryService = dictionaryService;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Collection<ParameterDefinition> getParameterDefinitions() {
|
||||||
|
Collection<ParameterDefinition> paramList = super.getParameterDefinitions();
|
||||||
|
paramList.add(new ParameterDefinitionImpl(PARAM_BODY_CONTENTS_ONLY, DataTypeDefinition.BOOLEAN, false,
|
||||||
|
getParamDisplayLabel(PARAM_BODY_CONTENTS_ONLY)));
|
||||||
|
return paramList;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* (non-Javadoc)
|
* (non-Javadoc)
|
||||||
* @see org.alfresco.repo.rendition.executer.AbstractRenderingEngine#render(org.alfresco.repo.rendition.executer.AbstractRenderingEngine.RenderingContext)
|
* @see org.alfresco.repo.rendition.executer.AbstractRenderingEngine#render(org.alfresco.repo.rendition.executer.AbstractRenderingEngine.RenderingContext)
|
||||||
@@ -229,8 +248,9 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
|
|||||||
* Builds a Tika-compatible SAX content handler, which will
|
* Builds a Tika-compatible SAX content handler, which will
|
||||||
* be used to generate+capture the XHTML
|
* be used to generate+capture the XHTML
|
||||||
*/
|
*/
|
||||||
private ContentHandler buildContentHandler(Writer output)
|
private ContentHandler buildContentHandler(Writer output, RenderingContext context)
|
||||||
{
|
{
|
||||||
|
// Create the main transformer
|
||||||
SAXTransformerFactory factory = (SAXTransformerFactory)
|
SAXTransformerFactory factory = (SAXTransformerFactory)
|
||||||
SAXTransformerFactory.newInstance();
|
SAXTransformerFactory.newInstance();
|
||||||
TransformerHandler handler;
|
TransformerHandler handler;
|
||||||
@@ -245,7 +265,19 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
|
|||||||
handler.setResult(new StreamResult(output));
|
handler.setResult(new StreamResult(output));
|
||||||
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
|
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
|
||||||
|
|
||||||
return handler;
|
// Change the image links as they go past
|
||||||
|
ContentHandler contentHandler = new TikaImageRewritingContentHandler(
|
||||||
|
handler, getImagesDirectoryName(context)
|
||||||
|
);
|
||||||
|
|
||||||
|
// If required, wrap it to only return the body
|
||||||
|
boolean bodyOnly = context.getParamWithDefault(PARAM_BODY_CONTENTS_ONLY, false);
|
||||||
|
if(bodyOnly) {
|
||||||
|
contentHandler = new BodyContentHandler(contentHandler);
|
||||||
|
}
|
||||||
|
|
||||||
|
// All done
|
||||||
|
return contentHandler;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -257,10 +289,7 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
|
|||||||
|
|
||||||
// Setup things to parse with
|
// Setup things to parse with
|
||||||
StringWriter sw = new StringWriter();
|
StringWriter sw = new StringWriter();
|
||||||
ContentHandler handler = new TikaImageRewritingContentHandler(
|
ContentHandler handler = buildContentHandler(sw, context);
|
||||||
buildContentHandler(sw),
|
|
||||||
getImagesDirectoryName(context)
|
|
||||||
);
|
|
||||||
|
|
||||||
// Tell Tika what we're dealing with
|
// Tell Tika what we're dealing with
|
||||||
Metadata metadata = new Metadata();
|
Metadata metadata = new Metadata();
|
||||||
@@ -290,10 +319,24 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
|
|||||||
throw new RenditionServiceException("Tika HTML Conversion Failed", e);
|
throw new RenditionServiceException("Tika HTML Conversion Failed", e);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// As a string
|
||||||
|
String html = sw.toString();
|
||||||
|
|
||||||
|
// If we're doing body-only, remove all the html namespaces
|
||||||
|
// that will otherwise clutter up the document
|
||||||
|
boolean bodyOnly = context.getParamWithDefault(PARAM_BODY_CONTENTS_ONLY, false);
|
||||||
|
if(bodyOnly) {
|
||||||
|
html = html.replaceAll("<p xmlns=\"http://www.w3.org/1999/xhtml\"","<p");
|
||||||
|
// Java replacement strings reference capture groups as $1; "\\1" is an escaped literal '1' and would turn every heading open tag into <h1
html = html.replaceAll("<h(\\d) xmlns=\"http://www.w3.org/1999/xhtml\"","<h$1");
|
||||||
|
html = html.replaceAll("<div xmlns=\"http://www.w3.org/1999/xhtml\"","<div");
|
||||||
|
html = html.replaceAll("<table xmlns=\"http://www.w3.org/1999/xhtml\"","<table");
|
||||||
|
// NOTE(review): this pattern was displayed as a single space, which would strip every space from the text; presumably it is the carriage-return character reference "&#13;" - confirm against the repository
html = html.replaceAll("&#13;","");
|
||||||
|
}
|
||||||
|
|
||||||
// Save it
|
// Save it
|
||||||
ContentWriter contentWriter = context.makeContentWriter();
|
ContentWriter contentWriter = context.makeContentWriter();
|
||||||
contentWriter.setMimetype("text/html");
|
contentWriter.setMimetype("text/html");
|
||||||
contentWriter.putContent( sw.toString() );
|
contentWriter.putContent( html );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -387,12 +430,11 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
|
|||||||
* A content handler that re-writes image src attributes,
|
* A content handler that re-writes image src attributes,
|
||||||
* and passes everything else on to the real one.
|
* and passes everything else on to the real one.
|
||||||
*/
|
*/
|
||||||
private class TikaImageRewritingContentHandler implements ContentHandler {
|
private class TikaImageRewritingContentHandler extends ContentHandlerDecorator {
|
||||||
private ContentHandler handler;
|
|
||||||
private String imageFolder;
|
private String imageFolder;
|
||||||
|
|
||||||
private TikaImageRewritingContentHandler(ContentHandler handler, String imageFolder) {
|
private TikaImageRewritingContentHandler(ContentHandler handler, String imageFolder) {
|
||||||
this.handler = handler;
|
super(handler);
|
||||||
this.imageFolder = imageFolder;
|
this.imageFolder = imageFolder;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -419,58 +461,11 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
handler.startElement(uri, localName, qName, attrs);
|
super.startElement(uri, localName, qName, attrs);
|
||||||
} else {
|
} else {
|
||||||
// For any other tag, pass through as-is
|
// For any other tag, pass through as-is
|
||||||
handler.startElement(uri, localName, qName, origAttrs);
|
super.startElement(uri, localName, qName, origAttrs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public void characters(char[] ch, int start, int length)
|
|
||||||
throws SAXException {
|
|
||||||
handler.characters(ch, start, length);
|
|
||||||
}
|
|
||||||
@Override
|
|
||||||
public void ignorableWhitespace(char[] ch, int start, int length)
|
|
||||||
throws SAXException {
|
|
||||||
handler.ignorableWhitespace(ch, start, length);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void endDocument() throws SAXException {
|
|
||||||
handler.endDocument();
|
|
||||||
}
|
|
||||||
@Override
|
|
||||||
public void endElement(String uri, String localName, String qName)
|
|
||||||
throws SAXException {
|
|
||||||
handler.endElement(uri, localName, qName);
|
|
||||||
}
|
|
||||||
@Override
|
|
||||||
public void endPrefixMapping(String prefix) throws SAXException {
|
|
||||||
handler.endPrefixMapping(prefix);
|
|
||||||
}
|
|
||||||
@Override
|
|
||||||
public void processingInstruction(String target, String data)
|
|
||||||
throws SAXException {
|
|
||||||
handler.processingInstruction(target, data);
|
|
||||||
}
|
|
||||||
@Override
|
|
||||||
public void setDocumentLocator(Locator locator) {
|
|
||||||
handler.setDocumentLocator(locator);
|
|
||||||
}
|
|
||||||
@Override
|
|
||||||
public void skippedEntity(String name) throws SAXException {
|
|
||||||
handler.skippedEntity(name);
|
|
||||||
}
|
|
||||||
@Override
|
|
||||||
public void startDocument() throws SAXException {
|
|
||||||
handler.startDocument();
|
|
||||||
}
|
|
||||||
@Override
|
|
||||||
public void startPrefixMapping(String prefix, String uri)
|
|
||||||
throws SAXException {
|
|
||||||
handler.startPrefixMapping(prefix, uri);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
@@ -194,6 +194,35 @@ public class HTMLRenderingEngineTest extends BaseAlfrescoSpringTest
|
|||||||
);
|
);
|
||||||
String html = reader.getContentString();
|
String html = reader.getContentString();
|
||||||
assertEquals("<?xml", html.substring(0, 5));
|
assertEquals("<?xml", html.substring(0, 5));
|
||||||
|
assertTrue("HTML wrong:\n"+html, html.contains("<html"));
|
||||||
|
assertTrue("HTML wrong:\n"+html, html.contains("<head>"));
|
||||||
|
assertTrue("HTML wrong:\n"+html, html.contains("<body>"));
|
||||||
|
|
||||||
|
assertTrue("HTML wrong:\n"+html, html.contains("<p>The quick brown fox"));
|
||||||
|
|
||||||
|
|
||||||
|
// Now do a body-only one, check that we still got the
|
||||||
|
// contents, but not the html surround
|
||||||
|
def.setParameterValue(
|
||||||
|
HTMLRenderingEngine.PARAM_BODY_CONTENTS_ONLY, Boolean.TRUE
|
||||||
|
);
|
||||||
|
rendition = renditionService.render(sourceDoc, def);
|
||||||
|
assertNotNull(rendition);
|
||||||
|
|
||||||
|
htmlNode = rendition.getChildRef();
|
||||||
|
assertEquals(true, nodeService.exists(htmlNode));
|
||||||
|
|
||||||
|
reader = contentService.getReader(
|
||||||
|
htmlNode, ContentModel.PROP_CONTENT
|
||||||
|
);
|
||||||
|
html = reader.getContentString();
|
||||||
|
assertEquals("<?xml", html.substring(0, 5));
|
||||||
|
assertFalse("Body wrong:\n"+html, html.contains("<html"));
|
||||||
|
assertFalse("Body wrong:\n"+html, html.contains("<head>"));
|
||||||
|
assertFalse("Body wrong:\n"+html, html.contains("<body>"));
|
||||||
|
|
||||||
|
assertTrue("HTML wrong:\n"+html, html.contains("<p>The quick brown fox"));
|
||||||
|
assertTrue("HTML wrong:\n"+html, html.contains("</p>"));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
Reference in New Issue
Block a user