From 1311f124b654b8f437af63b5abacb8ad07dcea9b Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Wed, 15 Sep 2010 11:04:14 +0000 Subject: [PATCH] Update the Tika-powered .docx -> html converter to re-write the img src urls from embedded ones to repo-expanded ones git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@22546 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261 --- .../executer/HTMLRenderingEngine.java | 152 ++++++++++++++++-- .../executer/HTMLRenderingEngineTest.java | 17 +- 2 files changed, 152 insertions(+), 17 deletions(-) diff --git a/source/java/org/alfresco/repo/rendition/executer/HTMLRenderingEngine.java b/source/java/org/alfresco/repo/rendition/executer/HTMLRenderingEngine.java index 1c3519ed85..3241019223 100644 --- a/source/java/org/alfresco/repo/rendition/executer/HTMLRenderingEngine.java +++ b/source/java/org/alfresco/repo/rendition/executer/HTMLRenderingEngine.java @@ -46,14 +46,20 @@ import org.alfresco.service.cmr.repository.NodeRef; import org.alfresco.service.namespace.QName; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.detect.ContainerAwareDetector; import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; +import org.xml.sax.Locator; import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; /** * This class provides a way to turn documents supported by the @@ -100,7 +106,7 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine String sourceMimeType = contentReader.getMimetype(); String targetMimeType = "text/html"; - // Check that Tika supports it + // Check that Tika supports the supplied file AutoDetectParser p = new AutoDetectParser(); MediaType sourceMediaType = MediaType.parse(sourceMimeType); if(! p.getParsers().containsKey(sourceMediaType)) @@ -116,6 +122,25 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine generateHTML(p, context); } + /** + * What name should be used for the images directory? + */ + private String getImagesDirectoryName(RenderingContext context) + { + // Based on the name of the source node, which will + // also largely be the name of the html node + String folderName = nodeService.getProperty( + context.getSourceNode(), + ContentModel.PROP_NAME + ).toString(); + if(folderName.lastIndexOf('.') > -1) + { + folderName = folderName.substring(0, folderName.lastIndexOf('.')); + } + folderName = folderName + "_files"; + return folderName; + } + /** * Creates a directory to store the images in. * The directory will be a sibling of the rendered @@ -131,15 +156,7 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine NodeRef parent = location.getParentRef(); // Figure out what to call it, based on the HTML node - String folderName = nodeService.getProperty( - context.getSourceNode(), - ContentModel.PROP_NAME - ).toString(); - if(folderName.lastIndexOf('.') > -1) - { - folderName = folderName.substring(0, folderName.lastIndexOf('.')); - } - folderName = folderName + "_files"; + String folderName = getImagesDirectoryName(context); // It is already there? // (eg from when the rendition is being re-run) @@ -237,10 +254,28 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine */ private void generateHTML(Parser p, RenderingContext context) { + ContentReader contentReader = context.makeContentReader(); + // Setup things to parse with - Metadata metadata = new Metadata(); StringWriter sw = new StringWriter(); - ContentHandler handler = buildContentHandler(sw); + ContentHandler handler = new TikaImageRewritingContentHandler( + buildContentHandler(sw), + getImagesDirectoryName(context) + ); + + // Tell Tika what we're dealing with + Metadata metadata = new Metadata(); + metadata.set( + Metadata.CONTENT_TYPE, + contentReader.getMimetype() + ); + metadata.set( + Metadata.RESOURCE_NAME_KEY, + nodeService.getProperty( + context.getSourceNode(), + ContentModel.PROP_NAME + ).toString() + ); // Our parse context needs to extract images ParseContext parseContext = new ParseContext(); @@ -249,7 +284,7 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine // Parse try { p.parse( - context.makeContentReader().getContentInputStream(), + contentReader.getContentInputStream(), handler, metadata, parseContext ); } catch(Exception e) { @@ -347,4 +382,95 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine createEmbeddedImage(imgFolder, (count==1), filename, type, stream, renderingContext); } } + + /** + * A content handler that re-writes image src attributes, + * and passes everything else on to the real one. + */ + private class TikaImageRewritingContentHandler implements ContentHandler { + private ContentHandler handler; + private String imageFolder; + + private TikaImageRewritingContentHandler(ContentHandler handler, String imageFolder) { + this.handler = handler; + this.imageFolder = imageFolder; + } + + @Override + public void startElement(String uri, String localName, String qName, + Attributes origAttrs) throws SAXException { + // If we have an image tag, re-write the src attribute + // if required + if("img".equals(localName)) { + AttributesImpl attrs; + if(origAttrs instanceof AttributesImpl) { + attrs = (AttributesImpl)origAttrs; + } else { + attrs = new AttributesImpl(origAttrs); + } + + for(int i=0; i