From cb7e60f7a7e13ef5eff099459061bd4da09c748f Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Mon, 13 Sep 2010 14:24:32 +0000 Subject: [PATCH] Add support for real image extraction to the html document rendering engine, and unit tests git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@22459 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261 --- .../executer/HTMLRenderingEngine.java | 200 +++++++++++++----- .../executer/HTMLRenderingEngineTest.java | 100 ++++++++- 2 files changed, 241 insertions(+), 59 deletions(-) diff --git a/source/java/org/alfresco/repo/rendition/executer/HTMLRenderingEngine.java b/source/java/org/alfresco/repo/rendition/executer/HTMLRenderingEngine.java index ea207ee617..1c3519ed85 100644 --- a/source/java/org/alfresco/repo/rendition/executer/HTMLRenderingEngine.java +++ b/source/java/org/alfresco/repo/rendition/executer/HTMLRenderingEngine.java @@ -19,11 +19,15 @@ package org.alfresco.repo.rendition.executer; +import java.io.IOException; +import java.io.InputStream; import java.io.Serializable; import java.io.StringWriter; import java.io.Writer; import java.util.HashMap; +import java.util.HashSet; import java.util.Map; +import java.util.Set; import javax.xml.transform.OutputKeys; import javax.xml.transform.TransformerConfigurationException; @@ -42,12 +46,14 @@ import org.alfresco.service.cmr.repository.NodeRef; import org.alfresco.service.namespace.QName; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * This class provides a way to turn documents supported by the @@ -106,62 +112,8 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine } // Make the HTML Version using Tika + // This will also extract out any images as found generateHTML(p, context); - - // Extract out any images - // TODO - boolean hasImages = true; // TODO - if(hasImages) - { - Map properties = new HashMap(); - NodeRef imgFolder = null; - - // Extract into it - boolean donePrimary = false; - for(String fakeContent : new String[] {"Test1","Test2"}) - { - if(imgFolder == null) - imgFolder = createImagesDirectory(context); - - // Create the node if needed - NodeRef img = nodeService.getChildByName( - imgFolder, ContentModel.ASSOC_CONTAINS, fakeContent - ); - if(img == null) - { - properties.clear(); - properties.put(ContentModel.PROP_NAME, fakeContent); - img = nodeService.createNode( - imgFolder, - ContentModel.ASSOC_CONTAINS, - QName.createQName(fakeContent), - ContentModel.TYPE_CONTENT, - properties - ).getChildRef(); - } - - // If we can, associate it with the rendered HTML, so - // that they're properly linked - QName assocType = SECONDARY_IMAGE; - if(!donePrimary) - { - assocType = PRIMARY_IMAGE; - donePrimary = true; - } - if(dictionaryService.getAssociation(assocType) != null) - { - nodeService.createAssociation( - context.getDestinationNode(), img, assocType - ); - } - - // Put the image into the node - ContentWriter writer = contentService.getWriter( - img, ContentModel.PROP_CONTENT, true - ); - writer.putContent(fakeContent); - } - } } /** @@ -211,6 +163,52 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine return imgFolder; } + private NodeRef createEmbeddedImage(NodeRef imgFolder, boolean primary, + String filename, String contentType, InputStream imageSource, + RenderingContext context) + { + // Create the node if needed + NodeRef img = nodeService.getChildByName( + imgFolder, ContentModel.ASSOC_CONTAINS, filename + ); + if(img == null) + { + Map properties = new HashMap(); + properties.put(ContentModel.PROP_NAME, filename); + img = nodeService.createNode( + imgFolder, + ContentModel.ASSOC_CONTAINS, + QName.createQName(filename), + ContentModel.TYPE_CONTENT, + properties + ).getChildRef(); + } + + // If we can, associate it with the rendered HTML, so + // that they're properly linked + QName assocType = SECONDARY_IMAGE; + if(primary) + { + assocType = PRIMARY_IMAGE; + } + if(dictionaryService.getAssociation(assocType) != null) + { + nodeService.createAssociation( + context.getDestinationNode(), img, assocType + ); + } + + // Put the image into the node + ContentWriter writer = contentService.getWriter( + img, ContentModel.PROP_CONTENT, true + ); + writer.setMimetype(contentType); + writer.putContent(imageSource); + + // All done + return img; + } + /** * Builds a Tika-compatible SAX content handler, which will * be used to generate+capture the XHTML @@ -241,10 +239,13 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine { // Setup things to parse with Metadata metadata = new Metadata(); - ParseContext parseContext = new ParseContext(); StringWriter sw = new StringWriter(); ContentHandler handler = buildContentHandler(sw); + // Our parse context needs to extract images + ParseContext parseContext = new ParseContext(); + parseContext.set(Parser.class, new TikaImageExtractingParser(context)); + // Parse try { p.parse( @@ -259,4 +260,91 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine ContentWriter contentWriter = context.makeContentWriter(); contentWriter.putContent( sw.toString() ); } + + + /** + * A nested Tika parser which extracts out any + * images as they come past. + */ + @SuppressWarnings("serial") + private class TikaImageExtractingParser implements Parser { + private Set types; + + private RenderingContext renderingContext; + private NodeRef imgFolder = null; + private int count = 0; + + private TikaImageExtractingParser(RenderingContext renderingContext) { + this.renderingContext = renderingContext; + + // Our expected types + types = new HashSet(); + types.add(MediaType.image("bmp")); + types.add(MediaType.image("gif")); + types.add(MediaType.image("jpg")); + types.add(MediaType.image("jpeg")); + types.add(MediaType.image("png")); + types.add(MediaType.image("tiff")); + } + + @Override + public Set getSupportedTypes(ParseContext context) { + return types; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + // Is it a supported image? + String filename = metadata.get(Metadata.RESOURCE_NAME_KEY); + String type = metadata.get(Metadata.CONTENT_TYPE); + boolean accept = false; + + if(type != null) { + for(MediaType mt : types) { + if(mt.toString().equals(type)) { + accept = true; + } + } + } + if(filename != null) { + for(MediaType mt : types) { + String ext = "." + mt.getSubtype(); + if(filename.endsWith(ext)) { + accept = true; + } + } + } + + if(!accept) + return; + + handleImage(stream, filename, type); + } + + @Override + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata) throws IOException, SAXException, TikaException { + parse(stream, handler, metadata, new ParseContext()); + } + + private void handleImage(InputStream stream, String filename, String type) { + count++; + + // Do we already have the folder? If not, create it + if(imgFolder == null) { + imgFolder = createImagesDirectory(renderingContext); + } + + // Give it a sensible name if needed + if(filename == null) { + filename = "image-" + count + "."; + filename += type.substring(type.indexOf('/')+1); + } + + // Save the image + createEmbeddedImage(imgFolder, (count==1), filename, type, stream, renderingContext); + } + } } \ No newline at end of file diff --git a/source/java/org/alfresco/repo/rendition/executer/HTMLRenderingEngineTest.java b/source/java/org/alfresco/repo/rendition/executer/HTMLRenderingEngineTest.java index a3470bdf6c..7ff3f2efdf 100644 --- a/source/java/org/alfresco/repo/rendition/executer/HTMLRenderingEngineTest.java +++ b/source/java/org/alfresco/repo/rendition/executer/HTMLRenderingEngineTest.java @@ -29,6 +29,7 @@ import org.alfresco.model.ContentModel; import org.alfresco.repo.content.transform.AbstractContentTransformerTest; import org.alfresco.repo.model.Repository; import org.alfresco.repo.security.authentication.AuthenticationUtil; +import org.alfresco.service.cmr.dictionary.DictionaryService; import org.alfresco.service.cmr.rendition.RenditionDefinition; import org.alfresco.service.cmr.rendition.RenditionService; import org.alfresco.service.cmr.repository.ChildAssociationRef; @@ -51,6 +52,7 @@ public class HTMLRenderingEngineTest extends BaseAlfrescoSpringTest { private final static Log log = LogFactory.getLog(HTMLRenderingEngineTest.class); private NodeRef companyHome; + private DictionaryService dictionaryService; private RenditionService renditionService; private Repository repositoryHelper; @@ -75,6 +77,7 @@ public class HTMLRenderingEngineTest extends BaseAlfrescoSpringTest this.contentService = (ContentService) this.applicationContext.getBean("ContentService"); this.renditionService = (RenditionService) this.applicationContext.getBean("RenditionService"); this.repositoryHelper = (Repository) this.applicationContext.getBean("repositoryHelper"); + this.dictionaryService = (DictionaryService) this.applicationContext.getBean("dictionaryService"); this.companyHome = repositoryHelper.getCompanyHome(); createTargetFolder(); @@ -241,8 +244,13 @@ public class HTMLRenderingEngineTest extends BaseAlfrescoSpringTest // Check we didn't get an image folder, only the html int numItems = nodeService.getChildAssocs(targetFolder).size(); - // TODO - Enable this when proper folder stuff is in place -// assertEquals(numItemsStart+1, numItems); + assertEquals(numItemsStart+1, numItems); + + // Check that the html lacks img tags + assertEquals( + "Unexpected img tag in html:\n" + html, + false, html.contains("