Fix/mnt 25089 html transformations with ootb aio create extra whitespace (#1079)

This commit is contained in:
KushalBanik
2025-06-03 13:23:33 +05:30
committed by GitHub
parent 0c534f1081
commit cb9d070c9c
14 changed files with 1496 additions and 1324 deletions

View File

@@ -40,6 +40,7 @@ The following externalized T-engines properties are available:
| FILE_STORE_URL | File store URL. | http://localhost:8099/alfresco/api/-default-/private/sfs/versions/1/file |
| TRANSFORM_ENGINE_REQUEST_QUEUE | T-Engine queue used for async requests. | org.alfresco.transform.engine.misc.acs |
| MISC_PDFBOX_DEFAULT_FONT | Default font used by PdfBox | NotoSans-Regular |
| MISC_HTML_COLLAPSE | HTML collapsing option for HTML to TXT transformation | true |
## Libreoffice
| Property | Description | Default value |
@@ -99,3 +100,4 @@ The following externalized T-engines properties are available:
| IMAGEMAGICK_CODERS | Path to Imagemagick custom coders. | |
| IMAGEMAGICK_CONFIG | Path to Imagemagick custom config. | |
| MISC_PDFBOX_DEFAULT_FONT | Default font used by PdfBox | NotoSans-Regular |
| MISC_HTML_COLLAPSE | HTML collapsing option for HTML to TXT transformation, explicitly for the Misc engine | true |

View File

@@ -27,3 +27,5 @@ transform:
misc:
pdfBox:
defaultFont: ${MISC_PDFBOX_DEFAULT_FONT:NotoSans-Regular}
htmlOptions:
collapseHtml: ${MISC_HTML_COLLAPSE:true}

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* Copyright (C) 2005 - 2025 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -26,9 +26,18 @@
*/
package org.alfresco.transform.aio;
import org.alfresco.transform.base.AbstractBaseTest;
import org.alfresco.transform.base.TransformController;
import org.alfresco.transform.config.TransformConfig;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.alfresco.transform.base.TransformControllerTest.getLogMessagesFor;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
import static org.alfresco.transform.common.RequestParamMap.*;
import java.nio.file.Files;
import java.util.StringJoiner;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
@@ -36,20 +45,12 @@ import org.springframework.http.ResponseEntity;
import org.springframework.mock.web.MockMultipartFile;
import org.springframework.test.web.servlet.request.MockHttpServletRequestBuilder;
import java.nio.file.Files;
import java.util.StringJoiner;
import static org.alfresco.transform.base.TransformControllerTest.getLogMessagesFor;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
import static org.alfresco.transform.common.RequestParamMap.CONFIG_VERSION_DEFAULT;
import static org.alfresco.transform.common.RequestParamMap.CONFIG_VERSION_LATEST;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import org.alfresco.transform.base.AbstractBaseTest;
import org.alfresco.transform.base.TransformController;
import org.alfresco.transform.config.TransformConfig;
/**
* Test All-In-One.
* Test All-In-One
*/
public class AIOTest extends AbstractBaseTest
{
@@ -66,7 +67,7 @@ public class AIOTest extends AbstractBaseTest
expectedOptions = null;
expectedSourceSuffix = null;
sourceFileBytes = readTestFile(sourceExtension);
expectedTargetFileBytes = Files.readAllBytes(getTestFile("quick2." + targetExtension, true).toPath());
expectedTargetFileBytes = Files.readAllBytes(getTestFile("quick3." + targetExtension, true).toPath());
sourceFile = new MockMultipartFile("file", "quick." + sourceExtension, sourceMimetype, sourceFileBytes);
}
@@ -76,7 +77,8 @@ public class AIOTest extends AbstractBaseTest
{
return super.mockMvcRequest(url, sourceFile, params)
.param("targetMimetype", targetMimetype)
.param("sourceMimetype", sourceMimetype);
.param("sourceMimetype", sourceMimetype)
.param(HTML_COLLAPSE, "true");
}
@Test

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* Copyright (C) 2005 - 2025 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -26,12 +26,14 @@
*/
package org.alfresco.transform.aio;
import com.google.common.collect.ImmutableSet;
import org.alfresco.transform.tika.TikaTest;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.alfresco.transform.base.html.OptionsHelper.getOptionNames;
import static org.junit.jupiter.api.Assertions.assertEquals;
import com.google.common.collect.ImmutableSet;
import org.junit.jupiter.api.Test;
import org.alfresco.transform.tika.TikaTest;
/**
* Test Tika functionality in All-In-One.
@@ -73,8 +75,8 @@ public class AIOTikaTest extends TikaTest
"thumbnail",
"width",
"pdfFont",
"pdfFontSize"
),
"pdfFontSize",
"collapseHtml"),
getOptionNames(controller.transformConfig(0).getBody().getTransformOptions()));
}
}

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* Copyright (C) 2005 - 2025 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -26,21 +26,22 @@
*/
package org.alfresco.transform.misc;
import com.google.common.collect.ImmutableMap;
import org.alfresco.transform.base.TransformEngine;
import org.alfresco.transform.base.probes.ProbeTransform;
import org.alfresco.transform.config.reader.TransformConfigResourceReader;
import org.alfresco.transform.config.TransformConfig;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import java.util.Map;
import static org.alfresco.transform.base.logging.StandardMessages.COMMUNITY_LICENCE;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
import static org.alfresco.transform.common.RequestParamMap.SOURCE_ENCODING;
import java.util.Map;
import com.google.common.collect.ImmutableMap;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.alfresco.transform.base.TransformEngine;
import org.alfresco.transform.base.probes.ProbeTransform;
import org.alfresco.transform.config.TransformConfig;
import org.alfresco.transform.config.reader.TransformConfigResourceReader;
@Component
public class MiscTransformEngine implements TransformEngine
{
@@ -74,6 +75,6 @@ public class MiscTransformEngine implements TransformEngine
public ProbeTransform getProbeTransform()
{
return new ProbeTransform("probe.html", MIMETYPE_HTML, MIMETYPE_TEXT_PLAIN, transformOptions,
119, 30, 150, 1024, 60 * 2 + 1, 60 * 2);
107, 30, 150, 1024, 60 * 2 + 1, 60 * 2);
}
}

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* Copyright (C) 2005 - 2025 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -26,14 +26,8 @@
*/
package org.alfresco.transform.misc.transformers;
import org.alfresco.transform.base.TransformManager;
import org.alfresco.transform.base.util.CustomTransformerFileAdaptor;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.util.ParserException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import static org.alfresco.transform.common.RequestParamMap.HTML_COLLAPSE;
import static org.alfresco.transform.common.RequestParamMap.SOURCE_ENCODING;
import java.io.BufferedWriter;
import java.io.File;
@@ -45,27 +39,30 @@ import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.Map;
import static org.alfresco.transform.common.RequestParamMap.SOURCE_ENCODING;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.util.ParserException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import org.alfresco.transform.base.TransformManager;
import org.alfresco.transform.base.util.CustomTransformerFileAdaptor;
/**
* Content transformer which wraps the HTML Parser library for
* parsing HTML content.
* Content transformer which wraps the HTML Parser library for parsing HTML content.
*
* <p>
* This code is based on a class of the same name originally implemented in alfresco-repository.
* </p>
*
* <p>
* Since HTML Parser was updated from v1.6 to v2.1, META tags
* defining an encoding for the content via http-equiv=Content-Type
* will ONLY be respected if the encoding of the content item
* itself is set to ISO-8859-1.
* Since HTML Parser was updated from v1.6 to v2.1, META tags defining an encoding for the content via http-equiv=Content-Type will ONLY be respected if the encoding of the content item itself is set to ISO-8859-1.
* </p>
*
* <p>
* Tika Note - could be converted to use the Tika HTML parser,
* but we'd potentially need a custom text handler to replicate
* the current settings around links and non-breaking spaces.
* Tika Note - could be converted to use the Tika HTML parser, but we'd potentially need a custom text handler to replicate the current settings around links and non-breaking spaces.
* </p>
*
* @author Derek Hulley
@@ -80,6 +77,9 @@ public class HtmlParserContentTransformer implements CustomTransformerFileAdapto
private static final Logger logger = LoggerFactory.getLogger(
HtmlParserContentTransformer.class);
@Value("${transform.core.misc.htmlOptions.collapseHtml:true}")
private String collapseOptionDefault;
@Override
public String getTransformerName()
{
@@ -93,6 +93,23 @@ public class HtmlParserContentTransformer implements CustomTransformerFileAdapto
{
String sourceEncoding = transformOptions.get(SOURCE_ENCODING);
checkEncodingParameter(sourceEncoding, SOURCE_ENCODING);
boolean collapse;
var collapseOption = transformOptions.get(HTML_COLLAPSE);
// If the collapse option is set, use it, otherwise use the default value
if (collapseOption != null && (collapseOption.trim().equalsIgnoreCase("true") || collapseOption.trim().equalsIgnoreCase("false")))
{
collapse = Boolean.parseBoolean(collapseOption);
}
else
{
// Use the default value from the configuration
collapse = collapseOptionDefault == null || Boolean.parseBoolean(collapseOptionDefault);
if (logger.isDebugEnabled())
{
logger.debug("Using default html collapse option: " + collapseOptionDefault);
}
}
if (logger.isDebugEnabled())
{
@@ -101,7 +118,7 @@ public class HtmlParserContentTransformer implements CustomTransformerFileAdapto
// Create the extractor
EncodingAwareStringBean extractor = new EncodingAwareStringBean();
extractor.setCollapse(false);
extractor.setCollapse(collapse);
extractor.setLinks(false);
extractor.setReplaceNonBreakingSpaces(false);
extractor.setURL(sourceFile, sourceEncoding);
@@ -138,24 +155,19 @@ public class HtmlParserContentTransformer implements CustomTransformerFileAdapto
* This code is based on a class of the same name, originally implemented in alfresco-repository.
* </p>
*
* A version of {@link StringBean} which allows control of the
* encoding in the underlying HTML Parser.
* Unfortunately, StringBean doesn't allow easy over-riding of
* this, so we have to duplicate some code to control this.
* This allows us to correctly handle HTML files where the encoding
* is specified against the content property (rather than in the
* HTML Head Meta), see ALF-10466 for details.
* A version of {@link StringBean} which allows control of the encoding in the underlying HTML Parser. Unfortunately, StringBean doesn't allow easy over-riding of this, so we have to duplicate some code to control this. This allows us to correctly handle HTML files where the encoding is specified against the content property (rather than in the HTML Head Meta), see ALF-10466 for details.
*/
public static class EncodingAwareStringBean extends StringBean
{
private static final long serialVersionUID = -9033414360428669553L;
/**
* Sets the File to extract strings from, and the encoding
* it's in (if known to Alfresco)
* Sets the File to extract strings from, and the encoding it's in (if known to Alfresco)
*
* @param file The File that text should be fetched from.
* @param encoding The encoding of the input
* @param file
* The File that text should be fetched from.
* @param encoding
* The encoding of the input
*/
public void setURL(File file, String encoding)
{

View File

@@ -5,3 +5,5 @@ transform:
misc:
pdfBox:
defaultFont: ${MISC_PDFBOX_DEFAULT_FONT:NotoSans-Regular}
htmlOptions:
collapseHtml: ${MISC_HTML_COLLAPSE:true}

View File

@@ -1,5 +1,8 @@
{
"transformOptions": {
"htmlOptions": [
{"value": {"name": "collapseHtml"}}
],
"textToPdfOptions": [
{"value": {"name": "pageLimit"}},
{"value": {"name": "pdfFont"}},
@@ -24,8 +27,7 @@
"supportedSourceAndTargetList": [
{"sourceMediaType": "text/html", "targetMediaType": "text/plain"}
],
"transformOptions": [
]
"transformOptions": ["htmlOptions"]
},
{
"transformerName": "string",

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* Copyright (C) 2005 - 2025 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -26,7 +26,30 @@
*/
package org.alfresco.transform.misc;
import org.alfresco.transform.base.AbstractBaseTest;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.header;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IWORK_KEYNOTE;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IWORK_NUMBERS;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_PDF;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_RFC822;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
import static org.alfresco.transform.common.RequestParamMap.ENDPOINT_TRANSFORM;
import static org.alfresco.transform.common.RequestParamMap.HTML_COLLAPSE;
import static org.alfresco.transform.common.RequestParamMap.SOURCE_MIMETYPE;
import static org.alfresco.transform.common.RequestParamMap.TARGET_MIMETYPE;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.nio.file.Files;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.junit.jupiter.api.BeforeEach;
@@ -35,35 +58,15 @@ import org.springframework.mock.web.MockMultipartFile;
import org.springframework.test.web.servlet.MvcResult;
import org.springframework.test.web.servlet.request.MockHttpServletRequestBuilder;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.nio.file.Files;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IWORK_KEYNOTE;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IWORK_NUMBERS;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENXML_WORDPROCESSING;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_PDF;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_RFC822;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
import static org.alfresco.transform.common.RequestParamMap.ENDPOINT_TRANSFORM;
import static org.alfresco.transform.common.RequestParamMap.SOURCE_MIMETYPE;
import static org.alfresco.transform.common.RequestParamMap.TARGET_MIMETYPE;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.header;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
import org.alfresco.transform.base.AbstractBaseTest;
/**
* Test Misc. Includes calling the 3rd party libraries.
*/
public class MiscTest extends AbstractBaseTest
{
protected final String sourceEncoding = "UTF-8";
protected final String targetEncoding = "UTF-8";
protected static final String sourceEncoding = "UTF-8";
protected static final String targetEncoding = "UTF-8";
protected final String targetMimetype = MIMETYPE_TEXT_PLAIN;
@BeforeEach
@@ -75,7 +78,7 @@ public class MiscTest extends AbstractBaseTest
expectedOptions = null;
expectedSourceSuffix = null;
sourceFileBytes = readTestFile(sourceExtension);
expectedTargetFileBytes = Files.readAllBytes(getTestFile("quick2." + targetExtension, true).toPath());
expectedTargetFileBytes = Files.readAllBytes(getTestFile("quick3." + targetExtension, true).toPath());
sourceFile = new MockMultipartFile("file", "quick." + sourceExtension, sourceMimetype, sourceFileBytes);
}
@@ -85,7 +88,8 @@ public class MiscTest extends AbstractBaseTest
final MockHttpServletRequestBuilder builder = super.mockMvcRequest(url, sourceFile, params)
.param("sourceEncoding", sourceEncoding)
.param("targetMimetype", targetMimetype)
.param("sourceMimetype", sourceMimetype);
.param("sourceMimetype", sourceMimetype)
.param(HTML_COLLAPSE, "true");
// Only the 'string' transformer should have the targetEncoding.
if (!"message/rfc822".equals(sourceMimetype) && !"text/html".equals(sourceMimetype))
@@ -204,8 +208,7 @@ public class MiscTest extends AbstractBaseTest
@Test
public void testExtractMetadataRFC822() throws Exception
{
String expected =
"{"+
String expected = "{" +
"\"{http://www.alfresco.org/model/content/1.0}addressee\":\"Nevin Nollop <nevin.nollop@gmail.com>\"," +
"\"{http://www.alfresco.org/model/content/1.0}addressees\":\"Nevin Nollop <nevinn@alfresco.com>\"," +
"\"{http://www.alfresco.org/model/content/1.0}description\":\"The quick brown fox jumps over the lazy dog\"," +
@@ -240,16 +243,14 @@ public class MiscTest extends AbstractBaseTest
public void testExtractMetadataOptionRFC822() throws Exception
{
// {"messageSubject":["{http://www.alfresco.org/model/imap/1.0}messageSubject","{http://www.alfresco.org/model/content/1.0}subjectline","{http://www.alfresco.org/model/content/1.0}description","{http://www.alfresco.org/model/content/1.0}title"],"Thread-Index":["{http://www.alfresco.org/model/imap/1.0}threadIndex"],"messageTo":["{http://www.alfresco.org/model/imap/1.0}messageTo","{http://www.alfresco.org/model/content/1.0}addressee"],"messageSent":["{http://www.alfresco.org/model/content/1.0}sentdate","{http://www.alfresco.org/model/imap/1.0}dateSent"],"Message-ID":["{http://www.alfresco.org/model/imap/1.0}messageId"],"messageCc":["{http://www.alfresco.org/model/imap/1.0}messageCc","{http://www.alfresco.org/model/content/1.0}addressees"],"messageReceived":["{http://www.alfresco.org/model/imap/1.0}dateReceived"],"messageFrom":["{http://www.alfresco.org/model/imap/1.0}messageFrom","{http://www.alfresco.org/model/content/1.0}originator"]}
String extractMapping =
"{\"messageSubject\":[" +
String extractMapping = "{\"messageSubject\":[" +
"\"{http://www.alfresco.org/model/imap/1.0}messageSubject\"," +
"\"{http://www.alfresco.org/model/content/1.0}title\"]," +
"\"Thread-Index\":[" +
"\"{http://www.alfresco.org/model/imap/1.0}threadIndex\"]," +
"\"messageFrom\":[" +
"\"{http://www.alfresco.org/model/dod5015/1.0}dodProp1\"]}\n";
String expected =
"{\"{http://www.alfresco.org/model/content/1.0}title\":\"The quick brown fox jumps over the lazy dog\","+
String expected = "{\"{http://www.alfresco.org/model/content/1.0}title\":\"The quick brown fox jumps over the lazy dog\"," +
"\"{http://www.alfresco.org/model/dod5015/1.0}dodProp1\":\"Nevin Nollop <nevin.nollop@alfresco.com>\"," +
"\"{http://www.alfresco.org/model/imap/1.0}messageSubject\":\"The quick brown fox jumps over the lazy dog\"}";
MvcResult result = sendRequest("eml",
@@ -297,7 +298,7 @@ public class MiscTest extends AbstractBaseTest
"<p>" + TEXT_P2 + "</p>" + NEWLINE +
"<p>" + TEXT_P3 + "</p>" + NEWLINE;
String partC = "</body></html>";
final String expected = TITLE + NEWLINE + TEXT_P1 + NEWLINE + TEXT_P2 + NEWLINE + TEXT_P3 + NEWLINE;
final String expected = TITLE + " " + TEXT_P1 + " " + TEXT_P2 + " " + TEXT_P3;
MvcResult result = sendRequest("html",
"UTF-8",
@@ -428,13 +429,13 @@ public class MiscTest extends AbstractBaseTest
// @Test
// TODO Doesn't work with java 11, enable when fixed
public void testOOXML() throws Exception
{
MvcResult result = sendRequest("docx", null, MIMETYPE_OPENXML_WORDPROCESSING,
"jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("docx"));
assertTrue(result.getResponse().getContentAsByteArray().length > 0L,
"Expected image content but content is empty.");
}
// public void testOOXML() throws Exception
// {
// MvcResult result = sendRequest("docx", null, MIMETYPE_OPENXML_WORDPROCESSING,
// "jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("docx"));
// assertTrue(result.getResponse().getContentAsByteArray().length > 0L,
// "Expected image content but content is empty.");
// }
private MvcResult sendRequest(String sourceExtension,
String sourceEncoding,
@@ -449,8 +450,7 @@ public class MiscTest extends AbstractBaseTest
final MockMultipartFile sourceFile = new MockMultipartFile("file",
"test_file." + sourceExtension, sourceMimetype, content);
final MockHttpServletRequestBuilder requestBuilder = super
.mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile)
final MockHttpServletRequestBuilder requestBuilder = super.mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile)
.param(TARGET_MIMETYPE, targetMimetype)
.param(SOURCE_MIMETYPE, sourceMimetype);

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* Copyright (C) 2005 - 2025 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -26,7 +26,11 @@
*/
package org.alfresco.transform.misc.transformers;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.fail;
import static org.alfresco.transform.common.RequestParamMap.HTML_COLLAPSE;
import static org.alfresco.transform.common.RequestParamMap.SOURCE_ENCODING;
import java.io.File;
import java.io.FileOutputStream;
@@ -35,38 +39,35 @@ import java.nio.file.Files;
import java.util.HashMap;
import java.util.Map;
import static org.alfresco.transform.common.RequestParamMap.SOURCE_ENCODING;
import static org.junit.jupiter.api.Assertions.assertEquals;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
public class HtmlParserContentTransformerTest
{
private static final String SOURCE_MIMETYPE = "text/html";
private static final String TARGET_MIMETYPE = "text/plain";
HtmlParserContentTransformer transformer = new HtmlParserContentTransformer();
/**
* Checks that we correctly handle text in different encodings,
* no matter if the encoding is specified on the Content Property
* or in a meta tag within the HTML itself. (ALF-10466)
* Checks that we correctly handle text in different encodings, no matter if the encoding is specified on the Content Property or in a meta tag within the HTML itself. (ALF-10466)
*
* On Windows, org.htmlparser.beans.StringBean.carriageReturn() appends a new system dependent new line
* so we must be careful when checking the returned text
* On Windows, org.htmlparser.beans.StringBean.carriageReturn() appends a new system dependent new line so we must be careful when checking the returned text
*/
@Test
public void testEncodingHandling() throws Exception
{
final String NEWLINE = System.getProperty("line.separator");
final String TITLE = "Testing!";
final String TEXT_P1 = "This is some text in English";
final String TEXT_P2 = "This is more text in English";
final String TEXT_P3 = "C'est en Fran\u00e7ais et Espa\u00f1ol";
String partA = "<html><head><title>" + TITLE + "</title></head>" + NEWLINE;
String partB = "<body><p>" + TEXT_P1 + "</p>" + NEWLINE +
"<p>" + TEXT_P2 + "</p>" + NEWLINE +
"<p>" + TEXT_P3 + "</p>" + NEWLINE;
final HtmlParserContentTransformer transformer = new HtmlParserContentTransformer();
final String newline = System.getProperty("line.separator");
final String title = "Testing!";
final String textp1 = "This is some text in English";
final String textp2 = "This is more text in English";
final String textp3 = "C'est en Fran\u00e7ais et Espa\u00f1ol";
String partA = "<html><head><title>" + title + "</title></head>" + newline;
String partB = "<body><p>" + textp1 + "</p>" + newline +
"<p>" + textp2 + "</p>" + newline +
"<p>" + textp3 + "</p>" + newline;
String partC = "</body></html>";
final String expected = TITLE + NEWLINE + TEXT_P1 + NEWLINE + TEXT_P2 + NEWLINE + TEXT_P3 + NEWLINE;
final String expected = title + newline + textp1 + newline + textp2 + newline + textp3;
File tmpS = null;
File tmpD = null;
@@ -81,6 +82,7 @@ public class HtmlParserContentTransformerTest
Map<String, String> parameters = new HashMap<>();
parameters.put(SOURCE_ENCODING, "ISO-8859-1");
parameters.put(HTML_COLLAPSE, String.valueOf(true));
transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD, null);
assertEquals(expected, readFromFile(tmpD, "UTF-8"));
@@ -94,6 +96,7 @@ public class HtmlParserContentTransformerTest
tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
parameters = new HashMap<>();
parameters.put(SOURCE_ENCODING, "UTF-8");
parameters.put(HTML_COLLAPSE, String.valueOf(true));
transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD, null);
assertEquals(expected, readFromFile(tmpD, "UTF-8"));
tmpS.delete();
@@ -105,6 +108,7 @@ public class HtmlParserContentTransformerTest
tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
parameters = new HashMap<>();
parameters.put(HTML_COLLAPSE, String.valueOf(true));
parameters.put(SOURCE_ENCODING, "UTF-16");
transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD, null);
assertEquals(expected, readFromFile(tmpD, "UTF-8"));
@@ -131,6 +135,7 @@ public class HtmlParserContentTransformerTest
parameters = new HashMap<>();
parameters.put(SOURCE_ENCODING, "ISO-8859-1");
parameters.put(HTML_COLLAPSE, String.valueOf(true));
transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD, null);
assertEquals(expected, readFromFile(tmpD, "UTF-8"));
tmpS.delete();
@@ -140,23 +145,156 @@ public class HtmlParserContentTransformerTest
// because without that the parser won't know about the
// 2 byte format so won't be able to identify the meta tag
}
catch (Exception e)
{
fail("Test Failed: " + e.getMessage()); // fail the test if any exception occurs
}
finally
{
if (tmpS != null && tmpS.exists()) tmpS.delete();
if (tmpD != null && tmpD.exists()) tmpD.delete();
if (tmpS != null && tmpS.exists())
{
tmpS.delete();
}
if (tmpD != null && tmpD.exists())
{
tmpD.delete();
}
}
}
private void writeToFile(File file, String content, String encoding) throws Exception
/**
 * Verifies the transformer output for both values of the collapse option. With collapsing disabled the extracted text ends with a trailing line separator after the last paragraph; with collapsing enabled that trailing separator is not present.
 *
 * @param shouldCollapse
 *            value passed to the transformer as the {@code HTML_COLLAPSE} transform option
 */
@ParameterizedTest
@ValueSource(booleans = {true, false})
public void testTransformerWithDifferentCollapsingMethods(boolean shouldCollapse)
{
    final HtmlParserContentTransformer transformer = new HtmlParserContentTransformer();
    final String newline = System.getProperty("line.separator");
    final String title = "Testing!";
    final String textp1 = "This is some text in English";
    final String textp2 = "This is more text in English";
    final String textp3 = "C'est en Fran\u00e7ais et Espa\u00f1ol";
    String partA = "<html><head><title>" + title + "</title></head>" + newline;
    String partB = "<body><p>" + textp1 + "</p>" + newline +
            "<p>" + textp2 + "</p>" + newline +
            "<p>" + textp3 + "</p>" + newline;
    String partC = "</body></html>";
    // A trailing line separator is only expected when collapsing is switched off
    final String expected = title + newline + textp1 + newline + textp2 + newline + textp3 + (shouldCollapse ? "" : newline);

    File tmpS = null;
    File tmpD = null;
    try
    {
        tmpS = File.createTempFile("AlfrescoTestSource_", ".html");
        writeToFile(tmpS, partA + partB + partC, "UTF-8");
        tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");

        Map<String, String> parameters = new HashMap<>();
        parameters.put(SOURCE_ENCODING, "UTF-8");
        parameters.put(HTML_COLLAPSE, String.valueOf(shouldCollapse));
        transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD, null);
        assertEquals(expected, readFromFile(tmpD, "UTF-8"));
    }
    catch (Exception e)
    {
        // Keep the original exception as the cause so its stack trace is reported with the failure
        fail("Test Failed: " + e.getMessage(), e);
    }
    finally
    {
        // Single clean-up point for the temporary files, reached on success and on failure
        if (tmpS != null && tmpS.exists())
        {
            tmpS.delete();
        }
        if (tmpD != null && tmpD.exists())
        {
            tmpD.delete();
        }
    }
}
/**
 * Verifies that values of the collapse option which are neither "true" nor "false" do not make the transformation fail. The expected text has no trailing line separator, i.e. the same output the collapsing case produces.
 *
 * @param booleanValues
 *            a single invalid value passed as the {@code HTML_COLLAPSE} transform option
 */
@ParameterizedTest
@ValueSource(strings = {"cat", "dog", "", "1234abcd", "@#$%"})
public void testTransformerWithWrongBooleanValues(String booleanValues)
{
    final HtmlParserContentTransformer transformer = new HtmlParserContentTransformer();
    final String newline = System.getProperty("line.separator");
    final String title = "Testing!";
    final String textp1 = "This is some text in English";
    final String textp2 = "This is more text in English";
    final String textp3 = "C'est en Fran\u00e7ais et Espa\u00f1ol";
    String partA = "<html><head><title>" + title + "</title></head>" + newline;
    String partB = "<body><p>" + textp1 + "</p>" + newline +
            "<p>" + textp2 + "</p>" + newline +
            "<p>" + textp3 + "</p>" + newline;
    String partC = "</body></html>";
    final String expected = title + newline + textp1 + newline + textp2 + newline + textp3;

    File tmpS = null;
    File tmpD = null;
    try
    {
        tmpS = File.createTempFile("AlfrescoTestSource_", ".html");
        writeToFile(tmpS, partA + partB + partC, "UTF-8");
        tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");

        Map<String, String> parameters = new HashMap<>();
        parameters.put(SOURCE_ENCODING, "UTF-8");
        parameters.put(HTML_COLLAPSE, booleanValues);
        transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD, null);
        assertEquals(expected, readFromFile(tmpD, "UTF-8"));
    }
    catch (Exception e)
    {
        // Keep the original exception as the cause so its stack trace is reported with the failure
        fail("Test Failed: " + e.getMessage(), e);
    }
    finally
    {
        // Single clean-up point for the temporary files, reached on success and on failure
        if (tmpS != null && tmpS.exists())
        {
            tmpS.delete();
        }
        if (tmpD != null && tmpD.exists())
        {
            tmpD.delete();
        }
    }
}
/**
 * Writes the given text to a file using the named character encoding. Any exception raised while opening or writing the file fails the current test.
 *
 * @param file
 *            The file to write to.
 * @param content
 *            The text to write.
 * @param encoding
 *            Name of the charset used to encode the text.
 */
private void writeToFile(File file, String content, String encoding)
{
    try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
    {
        ow.append(content);
    }
    catch (Exception e)
    {
        // Pass the exception as the cause so the original stack trace is reported with the failure
        fail("Failed to write to file: " + e.getMessage(), e);
    }
}
private String readFromFile(File file, final String encoding) throws Exception
private String readFromFile(File file, final String encoding)
{
try
{
return new String(Files.readAllBytes(file.toPath()), encoding);
}
catch (Exception e)
{
fail("Failed to read from file: " + e.getMessage());
return null; // Return null if there is an error reading the file
}
}
}

View File

@@ -1,5 +1,8 @@
{
"transformOptions": {
"htmlOptions": [
{"value": {"name": "collapseHtml"}}
],
"textToPdfOptions": [
{"value": {"name": "pageLimit"}}
],
@@ -17,6 +20,7 @@
{"sourceMediaType": "text/html", "targetMediaType": "text/plain"}
],
"transformOptions": [
"htmlOptions"
]
},
{

View File

@@ -0,0 +1,2 @@
The quick brown fox jumps over the lazy dog
The quick brown fox jumps over the lazy dog

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* Copyright (C) 2005 - 2025 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -26,31 +26,21 @@
*/
package org.alfresco.transform.tika;
import com.google.common.collect.ImmutableSet;
import org.alfresco.transform.base.AbstractBaseTest;
import org.alfresco.transform.base.executors.RuntimeExec;
import org.alfresco.transform.base.model.FileRefEntity;
import org.alfresco.transform.base.model.FileRefResponse;
import org.alfresco.transform.client.model.TransformReply;
import org.alfresco.transform.client.model.TransformRequest;
import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mock;
import org.springframework.core.io.FileSystemResource;
import org.springframework.core.io.Resource;
import org.springframework.http.HttpHeaders;
import org.springframework.http.ResponseEntity;
import org.springframework.mock.web.MockMultipartFile;
import org.springframework.test.web.servlet.MvcResult;
import org.springframework.test.web.servlet.request.MockHttpServletRequestBuilder;
import org.springframework.test.web.servlet.request.MockMvcRequestBuilders;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.UUID;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.when;
import static org.springframework.http.HttpHeaders.ACCEPT;
import static org.springframework.http.HttpHeaders.CONTENT_DISPOSITION;
import static org.springframework.http.HttpHeaders.CONTENT_TYPE;
import static org.springframework.http.HttpStatus.CREATED;
import static org.springframework.http.HttpStatus.INTERNAL_SERVER_ERROR;
import static org.springframework.http.HttpStatus.OK;
import static org.springframework.http.MediaType.APPLICATION_JSON_VALUE;
import static org.springframework.http.MediaType.APPLICATION_PDF_VALUE;
import static org.springframework.http.MediaType.TEXT_PLAIN_VALUE;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.header;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
import static org.alfresco.transform.base.html.OptionsHelper.getOptionNames;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
@@ -89,21 +79,33 @@ import static org.alfresco.transform.tika.transformers.Tika.XHTML;
import static org.alfresco.transform.tika.transformers.Tika.XLSX;
import static org.alfresco.transform.tika.transformers.Tika.XML;
import static org.alfresco.transform.tika.transformers.Tika.ZIP;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.when;
import static org.springframework.http.HttpHeaders.ACCEPT;
import static org.springframework.http.HttpHeaders.CONTENT_DISPOSITION;
import static org.springframework.http.HttpHeaders.CONTENT_TYPE;
import static org.springframework.http.HttpStatus.CREATED;
import static org.springframework.http.HttpStatus.INTERNAL_SERVER_ERROR;
import static org.springframework.http.HttpStatus.OK;
import static org.springframework.http.MediaType.APPLICATION_JSON_VALUE;
import static org.springframework.http.MediaType.APPLICATION_PDF_VALUE;
import static org.springframework.http.MediaType.TEXT_PLAIN_VALUE;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.header;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.UUID;
import com.google.common.collect.ImmutableSet;
import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mock;
import org.springframework.core.io.FileSystemResource;
import org.springframework.core.io.Resource;
import org.springframework.http.HttpHeaders;
import org.springframework.http.ResponseEntity;
import org.springframework.mock.web.MockMultipartFile;
import org.springframework.test.web.servlet.MvcResult;
import org.springframework.test.web.servlet.request.MockHttpServletRequestBuilder;
import org.springframework.test.web.servlet.request.MockMvcRequestBuilders;
import org.alfresco.transform.base.AbstractBaseTest;
import org.alfresco.transform.base.executors.RuntimeExec;
import org.alfresco.transform.base.model.FileRefEntity;
import org.alfresco.transform.base.model.FileRefResponse;
import org.alfresco.transform.client.model.TransformReply;
import org.alfresco.transform.client.model.TransformRequest;
/**
* Test Tika.
@@ -377,14 +379,12 @@ public class TikaTest extends AbstractBaseTest
{
mockTransformCommand(XLSX, XLSX, MIMETYPE_OPENXML_SPREADSHEET, false);
String metadata =
"{\"{http://www.alfresco.org/model/content/1.0}author\":\"author1\"," +
String metadata = "{\"{http://www.alfresco.org/model/content/1.0}author\":\"author1\"," +
"\"{http://www.alfresco.org/model/content/1.0}title\":\"title1\"," +
"\"{http://www.alfresco.org/model/content/1.0}description\":[\"desc1\",\"desc2\"]," +
"\"{http://www.alfresco.org/model/content/1.0}created\":\"created1\"}";
MockHttpServletRequestBuilder requestBuilder =
super.mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile,
MockHttpServletRequestBuilder requestBuilder = super.mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile,
"targetExtension", XLSX,
"metadata", metadata,
"targetMimetype", MIMETYPE_METADATA_EMBED,
@@ -393,8 +393,8 @@ public class TikaTest extends AbstractBaseTest
MvcResult result = mockMvc.perform(requestBuilder)
.andExpect(status().is(OK.value()))
.andExpect(header().string("Content-Disposition",
"attachment; filename*=UTF-8''transform." + targetExtension)).
andReturn();
"attachment; filename*=UTF-8''transform." + targetExtension))
.andReturn();
byte[] bytes = result.getResponse().getContentAsByteArray();
ByteArrayInputStream inputStream = new ByteArrayInputStream(bytes);

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Model
* %%
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* Copyright (C) 2005 - 2025 Alfresco Software Limited
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
@@ -69,6 +69,9 @@ public interface RequestParamMap
String PDF_FONT = "pdfFont";
String PDF_FONT_SIZE = "pdfFontSize";
// Html parameter names for the transform config
String HTML_COLLAPSE = "collapseHtml";
// Parameters interpreted by the TransformController
String DIRECT_ACCESS_URL = "directAccessUrl";