package org.alfresco.repo.rendition.executer; import java.io.IOException; import java.io.InputStream; import java.io.Serializable; import java.io.StringWriter; import java.io.Writer; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import javax.xml.transform.OutputKeys; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; import org.alfresco.model.ContentModel; import org.alfresco.repo.action.ParameterDefinitionImpl; import org.alfresco.repo.rendition.RenditionLocation; import org.alfresco.service.cmr.action.ParameterDefinition; import org.alfresco.service.cmr.dictionary.DataTypeDefinition; import org.alfresco.service.cmr.rendition.RenditionServiceException; import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.ContentService; import org.alfresco.service.cmr.repository.ContentWriter; import org.alfresco.service.cmr.repository.NodeRef; import org.alfresco.service.namespace.QName; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerDecorator; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; /** * This class provides a way to turn documents supported by the * {@link ContentService} standard transformers into basic, clean * HTML. *
* The HTML that is produced probably isn't going to be suitable * for direct web publishing, as it's likely going to be too * basic. Instead, it should be simple and clean HTML, suitable * for being the basis of some web-friendly HTML once edited * / further transformed. * * @author Nick Burch * @since 3.4 */ public class HTMLRenderingEngine extends AbstractRenderingEngine { private static Log logger = LogFactory.getLog(HTMLRenderingEngine.class); private TikaConfig tikaConfig; /** * This optional parameter, when set to true, causes only the * contents of the HTML body to be written out as the rendition. * By default, the whole of the HTML document is used. */ public static final String PARAM_BODY_CONTENTS_ONLY = "bodyContentsOnly"; /** * This optional parameter, when set to true, causes any embedded * images to be written into the same folder as the html, with * a name prefix. * By default, images are placed into a sub-folder. */ public static final String PARAM_IMAGES_SAME_FOLDER = "imagesSameFolder"; /* * Action constants */ public static final String NAME = "htmlRenderingEngine"; @Override protected Collection types;
// Rendering context handed to the constructor, which assigns it to this field.
private RenderingContext renderingContext;
// Starts out null. NOTE(review): presumably the folder node that extracted
// images are written into, populated on first use - confirm against the
// usages, which are not visible here.
private NodeRef imgFolder = null;
// Starts at zero. NOTE(review): looks like a running count of the images
// handled so far - confirm against the usages, which are not visible here.
private int count = 0;
private TikaImageExtractingParser(RenderingContext renderingContext) {
this.renderingContext = renderingContext;
// Our expected types
types = new HashSet