ALF-16841: Converting files without a title to HTML generates invalid HTML

- Added a unit test which confirms the issue
- Wrapped the HTML content handler in Tika's new ExpandedTitleContentHandler, so that an empty title is written as <title></title> rather than as a self-closing <title/>


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@44819 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Ray Gauss 2012-12-18 19:13:57 +00:00
parent 80344e8cdf
commit ff9a4ba6cc
2 changed files with 6 additions and 0 deletions

View File

@@ -41,6 +41,7 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.xml.sax.ContentHandler;
/**
@@ -135,6 +136,7 @@ public abstract class TikaPoweredContentTransformer extends AbstractContentTrans
if(MimetypeMap.MIMETYPE_HTML.equals(targetMimeType))
{
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
return new ExpandedTitleContentHandler(handler);
}
else if(MimetypeMap.MIMETYPE_XHTML.equals(targetMimeType) ||
MimetypeMap.MIMETYPE_XML.equals(targetMimeType))

View File

@@ -80,6 +80,10 @@ public abstract class TikaPoweredContentTransformerTest extends AbstractContentT
"HTML footer not found",
contents.contains("</html>")
);
assertTrue(
"Expanded HTML title not found",
contents.contains("</title>")
);
}
else if(targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_PLAIN))
{