Add support for real image extraction to the html document rendering engine, and unit tests

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@22459 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-09-13 14:24:32 +00:00
parent b20bac870c
commit cb7e60f7a7
2 changed files with 241 additions and 59 deletions

View File

@@ -19,11 +19,15 @@
package org.alfresco.repo.rendition.executer;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.io.StringWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
@@ -42,12 +46,14 @@ import org.alfresco.service.cmr.repository.NodeRef;
import org.alfresco.service.namespace.QName;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* This class provides a way to turn documents supported by the
@@ -106,62 +112,8 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
}
// Make the HTML Version using Tika
// This will also extract out any images as found
generateHTML(p, context);
// Extract out any images
// TODO
boolean hasImages = true; // TODO
if(hasImages)
{
Map<QName,Serializable> properties = new HashMap<QName,Serializable>();
NodeRef imgFolder = null;
// Extract into it
boolean donePrimary = false;
for(String fakeContent : new String[] {"Test1","Test2"})
{
if(imgFolder == null)
imgFolder = createImagesDirectory(context);
// Create the node if needed
NodeRef img = nodeService.getChildByName(
imgFolder, ContentModel.ASSOC_CONTAINS, fakeContent
);
if(img == null)
{
properties.clear();
properties.put(ContentModel.PROP_NAME, fakeContent);
img = nodeService.createNode(
imgFolder,
ContentModel.ASSOC_CONTAINS,
QName.createQName(fakeContent),
ContentModel.TYPE_CONTENT,
properties
).getChildRef();
}
// If we can, associate it with the rendered HTML, so
// that they're properly linked
QName assocType = SECONDARY_IMAGE;
if(!donePrimary)
{
assocType = PRIMARY_IMAGE;
donePrimary = true;
}
if(dictionaryService.getAssociation(assocType) != null)
{
nodeService.createAssociation(
context.getDestinationNode(), img, assocType
);
}
// Put the image into the node
ContentWriter writer = contentService.getWriter(
img, ContentModel.PROP_CONTENT, true
);
writer.putContent(fakeContent);
}
}
}
/**
@@ -211,6 +163,52 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
return imgFolder;
}
/**
 * Stores one embedded image as a content node within the images
 * folder, and links it to the rendered HTML node when the relevant
 * association type is known to the dictionary.
 *
 * @param imgFolder the folder node which holds the extracted images
 * @param primary <code>true</code> to link the image using
 *  {@link #PRIMARY_IMAGE}, <code>false</code> for {@link #SECONDARY_IMAGE}
 * @param filename the name of the image node, also used to look up
 *  an existing node in the folder
 * @param contentType the mimetype to record against the image content
 * @param imageSource the stream the image bytes are read from
 * @param context the rendering context, supplying the destination node
 * @return the node which now holds the image
 */
private NodeRef createEmbeddedImage(NodeRef imgFolder, boolean primary,
        String filename, String contentType, InputStream imageSource,
        RenderingContext context)
{
    // Look for a node of this name first, and only create a
    //  new one if the folder doesn't have it yet
    NodeRef imageNode = nodeService.getChildByName(
            imgFolder, ContentModel.ASSOC_CONTAINS, filename
    );
    if(imageNode == null)
    {
        Map<QName,Serializable> nodeProperties = new HashMap<QName,Serializable>();
        nodeProperties.put(ContentModel.PROP_NAME, filename);

        imageNode = nodeService.createNode(
                imgFolder,
                ContentModel.ASSOC_CONTAINS,
                QName.createQName(filename),
                ContentModel.TYPE_CONTENT,
                nodeProperties
        ).getChildRef();
    }

    // Link the image to the rendered HTML, but only when the
    //  association type is actually defined in the dictionary
    QName linkType = primary ? PRIMARY_IMAGE : SECONDARY_IMAGE;
    if(dictionaryService.getAssociation(linkType) != null)
    {
        nodeService.createAssociation(
                context.getDestinationNode(), imageNode, linkType
        );
    }

    // Stream the image bytes into the node's content property
    ContentWriter imageWriter = contentService.getWriter(
            imageNode, ContentModel.PROP_CONTENT, true
    );
    imageWriter.setMimetype(contentType);
    imageWriter.putContent(imageSource);

    // Hand back the node we wrote to
    return imageNode;
}
/**
* Builds a Tika-compatible SAX content handler, which will
* be used to generate+capture the XHTML
@@ -241,10 +239,13 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
{
// Setup things to parse with
Metadata metadata = new Metadata();
ParseContext parseContext = new ParseContext();
StringWriter sw = new StringWriter();
ContentHandler handler = buildContentHandler(sw);
// Our parse context needs to extract images
ParseContext parseContext = new ParseContext();
parseContext.set(Parser.class, new TikaImageExtractingParser(context));
// Parse
try {
p.parse(
@@ -259,4 +260,91 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
ContentWriter contentWriter = context.makeContentWriter();
contentWriter.putContent( sw.toString() );
}
/**
 * A nested Tika parser which extracts out any
 * images as they come past.
 * <p>
 * A resource is treated as an image when either its content type
 * or its filename extension (compared case-insensitively) matches
 * one of the supported image types. Anything else is ignored.
 * Accepted images are saved into the images directory, which is
 * only created once the first image turns up.
 */
@SuppressWarnings("serial")
private class TikaImageExtractingParser implements Parser {
    /** The image types we're willing to extract */
    private final Set<MediaType> types;

    /** The rendering we're extracting images on behalf of */
    private final RenderingContext renderingContext;
    /** Where the images go, created on first use */
    private NodeRef imgFolder = null;
    /** How many images have been handled so far */
    private int count = 0;

    private TikaImageExtractingParser(RenderingContext renderingContext) {
        this.renderingContext = renderingContext;

        // Our expected types
        types = new HashSet<MediaType>();
        types.add(MediaType.image("bmp"));
        types.add(MediaType.image("gif"));
        types.add(MediaType.image("jpg"));
        types.add(MediaType.image("jpeg"));
        types.add(MediaType.image("png"));
        types.add(MediaType.image("tiff"));
    }

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return types;
    }

    /**
     * Saves the resource if it looks like a supported image, and
     *  silently skips it otherwise. The content handler and the
     *  parse context are not used.
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context) throws IOException,
            SAXException, TikaException {
        // Is it a supported image?
        String filename = metadata.get(Metadata.RESOURCE_NAME_KEY);
        String type = metadata.get(Metadata.CONTENT_TYPE);

        if(!isSupportedType(type) && !hasSupportedExtension(filename)) {
            return;
        }

        handleImage(stream, filename, type);
    }

    @Override
    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata) throws IOException, SAXException, TikaException {
        parse(stream, handler, metadata, new ParseContext());
    }

    /**
     * Does the content type exactly match one of our image types?
     */
    private boolean isSupportedType(String type) {
        if(type == null) {
            return false;
        }
        for(MediaType mt : types) {
            if(mt.toString().equals(type)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Does the filename end with the extension of one of our image
     *  types? The check ignores case, so that names such as
     *  IMAGE1.PNG are picked up as well as image1.png
     */
    private boolean hasSupportedExtension(String filename) {
        if(filename == null) {
            return false;
        }
        for(MediaType mt : types) {
            String ext = "." + mt.getSubtype();
            // A negative offset (name shorter than the extension)
            //  simply makes regionMatches return false
            int offset = filename.length() - ext.length();
            if(filename.regionMatches(true, offset, ext, 0, ext.length())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Saves the image, with the first one seen being the primary.
     * Note that the type may be null if the image was accepted
     *  based on its filename alone.
     */
    private void handleImage(InputStream stream, String filename, String type) {
        count++;

        // Do we already have the folder? If not, create it
        if(imgFolder == null) {
            imgFolder = createImagesDirectory(renderingContext);
        }

        // Give it a sensible name if needed
        // (With no filename, it can only have been accepted on
        //  its content type, so the type is available to use)
        if(filename == null) {
            filename = "image-" + count + ".";
            filename += type.substring(type.indexOf('/')+1);
        }

        // Save the image
        createEmbeddedImage(imgFolder, (count==1), filename, type, stream, renderingContext);
    }
}
}

View File

@@ -29,6 +29,7 @@ import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
import org.alfresco.repo.model.Repository;
import org.alfresco.repo.security.authentication.AuthenticationUtil;
import org.alfresco.service.cmr.dictionary.DictionaryService;
import org.alfresco.service.cmr.rendition.RenditionDefinition;
import org.alfresco.service.cmr.rendition.RenditionService;
import org.alfresco.service.cmr.repository.ChildAssociationRef;
@@ -51,6 +52,7 @@ public class HTMLRenderingEngineTest extends BaseAlfrescoSpringTest
{
private final static Log log = LogFactory.getLog(HTMLRenderingEngineTest.class);
private NodeRef companyHome;
private DictionaryService dictionaryService;
private RenditionService renditionService;
private Repository repositoryHelper;
@@ -75,6 +77,7 @@ public class HTMLRenderingEngineTest extends BaseAlfrescoSpringTest
this.contentService = (ContentService) this.applicationContext.getBean("ContentService");
this.renditionService = (RenditionService) this.applicationContext.getBean("RenditionService");
this.repositoryHelper = (Repository) this.applicationContext.getBean("repositoryHelper");
this.dictionaryService = (DictionaryService) this.applicationContext.getBean("dictionaryService");
this.companyHome = repositoryHelper.getCompanyHome();
createTargetFolder();
@@ -241,8 +244,13 @@ public class HTMLRenderingEngineTest extends BaseAlfrescoSpringTest
// Check we didn't get an image folder, only the html
int numItems = nodeService.getChildAssocs(targetFolder).size();
// TODO - Enable this when proper folder stuff is in place
// assertEquals(numItemsStart+1, numItems);
assertEquals(numItemsStart+1, numItems);
// Check that the html lacks img tags
assertEquals(
"Unexpected img tag in html:\n" + html,
false, html.contains("<img")
);
// Check we didn't get any images
for(ChildAssociationRef ref : nodeService.getChildAssocs(htmlNode))
@@ -264,7 +272,93 @@ public class HTMLRenderingEngineTest extends BaseAlfrescoSpringTest
*/
public void testDocWithOneImages() throws Exception
{
    // A single definition is used for every document, with the
    //  rendition named after the source and put in the target folder
    RenditionDefinition definition = renditionService.createRenditionDefinition(
            QName.createQName("Test"), HTMLRenderingEngine.NAME);
    definition.setParameterValue(
            RenditionService.PARAM_DESTINATION_PATH_TEMPLATE,
            targetFolderPath + "/${name}.html"
    );

    String[] documents = new String[] {"quickImg1.doc","quickImg1.docx"};
    for(int i=0; i<documents.length; i++)
    {
        String name = documents[i];
        String baseName = name.substring(0, name.lastIndexOf('.'));
        String expectedHtmlName = baseName + ".html";
        String expectedFolderName = baseName + "_files";

        // Render it, noting how full the target folder was beforehand
        sourceDoc = createForDoc(name);
        int itemsBefore = nodeService.getChildAssocs(targetFolder).size();

        ChildAssociationRef rendition = renditionService.render(sourceDoc, definition);
        assertNotNull(rendition);

        // The rendered node must exist
        NodeRef htmlNode = rendition.getChildRef();
        assertEquals(true, nodeService.exists(htmlNode));

        // It must be named after the source document
        assertEquals(
                expectedHtmlName,
                nodeService.getProperty(htmlNode, ContentModel.PROP_NAME)
        );

        // It must live directly in the target folder
        NodeRef actualParent = nodeService.getPrimaryParent(htmlNode).getParentRef();
        assertEquals(
                "Should have been in " + targetFolderPath + " but was in" +
                nodeService.getPath(htmlNode),
                targetFolder,
                actualParent
        );

        // Its contents must start with an xml declaration
        ContentReader htmlReader = contentService.getReader(
                htmlNode, ContentModel.PROP_CONTENT
        );
        String html = htmlReader.getContentString();
        assertEquals("<?xml", html.substring(0, 5));

        // The check for img tags in the html is currently disabled
//        assertEquals(
//              "Couldn't find img tag in html:\n" + html,
//              true, html.contains("<img")
//        );

        // Two new items are expected - the html and the image folder
        int itemsAfter = nodeService.getChildAssocs(targetFolder).size();
        assertEquals(itemsBefore+2, itemsAfter);

        // Track down the image folder by its name
        NodeRef imgFolder = null;
        for(ChildAssociationRef child : nodeService.getChildAssocs(targetFolder)) {
            NodeRef candidate = child.getChildRef();
            if(nodeService.getProperty(candidate, ContentModel.PROP_NAME).equals(expectedFolderName)) {
                imgFolder = candidate;
            }
        }
        assertNotNull("Couldn't find new folder named " + expectedFolderName, imgFolder);

        // There should be exactly one image in there
        assertEquals(1, nodeService.getChildAssocs(imgFolder).size());

        // Where the model has the image associations, there should
        //  be a primary image but no secondary ones
        if(dictionaryService.getAssociation(HTMLRenderingEngine.PRIMARY_IMAGE) != null)
        {
            boolean sawPrimary = false;
            boolean sawSecondary = false;
            for(ChildAssociationRef child : nodeService.getChildAssocs(htmlNode))
            {
                QName assocType = child.getTypeQName();
                sawPrimary |= assocType.equals(HTMLRenderingEngine.PRIMARY_IMAGE);
                sawSecondary |= assocType.equals(HTMLRenderingEngine.SECONDARY_IMAGE);
            }
            assertEquals(true, sawPrimary);
            assertEquals(false, sawSecondary);
        }

        // Clean up ready for the next document
        tidyUpSourceDoc();
    }
}
/**