Fix/mnt 25089 html transformations with ootb aio create extra whitespace (#1079)

2025-07-31 17:38:33 +00:00 · 2025-06-03 13:23:33 +05:30
parent 0c534f1081
commit cb9d070c9c
14 changed files with 1496 additions and 1324 deletions
--- a/engines/misc/src/main/java/org/alfresco/transform/misc/MiscTransformEngine.java
+++ b/engines/misc/src/main/java/org/alfresco/transform/misc/MiscTransformEngine.java
@@ -2,7 +2,7 @@
 * #%L
 * Alfresco Transform Core
 * %%
- * Copyright (C) 2005 - 2022 Alfresco Software Limited
+ * Copyright (C) 2005 - 2025 Alfresco Software Limited
 * %%
 * This file is part of the Alfresco software.
 * -
@@ -26,21 +26,22 @@
 */
 package org.alfresco.transform.misc;

-import com.google.common.collect.ImmutableMap;
-import org.alfresco.transform.base.TransformEngine;
-import org.alfresco.transform.base.probes.ProbeTransform;
-import org.alfresco.transform.config.reader.TransformConfigResourceReader;
-import org.alfresco.transform.config.TransformConfig;
-import org.springframework.beans.factory.annotation.Autowired;
-import org.springframework.stereotype.Component;
-
-import java.util.Map;
-
 import static org.alfresco.transform.base.logging.StandardMessages.COMMUNITY_LICENCE;
 import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
 import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
 import static org.alfresco.transform.common.RequestParamMap.SOURCE_ENCODING;

+import java.util.Map;
+
+import com.google.common.collect.ImmutableMap;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+
+import org.alfresco.transform.base.TransformEngine;
+import org.alfresco.transform.base.probes.ProbeTransform;
+import org.alfresco.transform.config.TransformConfig;
+import org.alfresco.transform.config.reader.TransformConfigResourceReader;
+
@Component
 public class MiscTransformEngine implements TransformEngine
 {
@@ -74,6 +75,6 @@ public class MiscTransformEngine implements TransformEngine
    public ProbeTransform getProbeTransform()
    {
        return new ProbeTransform("probe.html", MIMETYPE_HTML, MIMETYPE_TEXT_PLAIN, transformOptions,
-                119, 30, 150, 1024, 60 * 2 + 1, 60 * 2);
+                107, 30, 150, 1024, 60 * 2 + 1, 60 * 2); // NOTE(review): first argument drops 119 -> 107 in this change; presumably the expected probe output size now that whitespace is collapsed by default - confirm
    }
 }
--- a/engines/misc/src/main/java/org/alfresco/transform/misc/transformers/HtmlParserContentTransformer.java
+++ b/engines/misc/src/main/java/org/alfresco/transform/misc/transformers/HtmlParserContentTransformer.java
@@ -1,203 +1,215 @@
-/*
- * #%L
- * Alfresco Transform Core
- * %%
- * Copyright (C) 2005 - 2022 Alfresco Software Limited
- * %%
- * This file is part of the Alfresco software.
- * -
- * If the software was purchased under a paid Alfresco license, the terms of
- * the paid license agreement will prevail.  Otherwise, the software is
- * provided under the following open source license terms:
- * -
- * Alfresco is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- * -
- * Alfresco is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- * -
- * You should have received a copy of the GNU Lesser General Public License
- * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
- * #L%
- */
-package org.alfresco.transform.misc.transformers;
-
-import org.alfresco.transform.base.TransformManager;
-import org.alfresco.transform.base.util.CustomTransformerFileAdaptor;
-import org.htmlparser.Parser;
-import org.htmlparser.beans.StringBean;
-import org.htmlparser.util.ParserException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.springframework.stereotype.Component;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.net.URLConnection;
-import java.nio.charset.Charset;
-import java.nio.charset.IllegalCharsetNameException;
-import java.util.Map;
-
-import static org.alfresco.transform.common.RequestParamMap.SOURCE_ENCODING;
-
-/**
- * Content transformer which wraps the HTML Parser library for
- * parsing HTML content.
- *
- * <p>
- * This code is based on a class of the same name originally implemented in alfresco-repository.
- * </p>
- *
- * <p>
- * Since HTML Parser was updated from v1.6 to v2.1, META tags
- * defining an encoding for the content via http-equiv=Content-Type
- * will ONLY be respected if the encoding of the content item
- * itself is set to ISO-8859-1.
- * </p>
- *
- * <p>
- * Tika Note - could be converted to use the Tika HTML parser,
- * but we'd potentially need a custom text handler to replicate
- * the current settings around links and non-breaking spaces.
- * </p>
- *
- * @author Derek Hulley
- * @author eknizat
- * @see <a href="http://htmlparser.sourceforge.net/">http://htmlparser.sourceforge.net</a>
- * @see org.htmlparser.beans.StringBean
- * @see <a href="http://sourceforge.net/tracker/?func=detail&aid=1644504&group_id=24399&atid=381401">HTML Parser</a>
- */
-@Component
-public class HtmlParserContentTransformer implements CustomTransformerFileAdaptor
-{
-    private static final Logger logger = LoggerFactory.getLogger(
-        HtmlParserContentTransformer.class);
-
-    @Override
-    public String getTransformerName()
-    {
-        return "html";
-    }
-
-    @Override
-    public void transform(final String sourceMimetype, final String targetMimetype,
-                          final Map<String, String> transformOptions,
-                          final File sourceFile, final File targetFile, TransformManager transformManager) throws Exception
-    {
-        String sourceEncoding = transformOptions.get(SOURCE_ENCODING);
-        checkEncodingParameter(sourceEncoding, SOURCE_ENCODING);
-
-        if (logger.isDebugEnabled())
-        {
-            logger.debug("Performing HTML to text transform with sourceEncoding=" + sourceEncoding);
-        }
-
-        // Create the extractor
-        EncodingAwareStringBean extractor = new EncodingAwareStringBean();
-        extractor.setCollapse(false);
-        extractor.setLinks(false);
-        extractor.setReplaceNonBreakingSpaces(false);
-        extractor.setURL(sourceFile, sourceEncoding);
-        // get the text
-        String text = extractor.getStrings();
-
-        // write it to the writer
-        try (Writer writer = new BufferedWriter(
-            new OutputStreamWriter(new FileOutputStream(targetFile))))
-        {
-            writer.write(text);
-        }
-    }
-
-    private void checkEncodingParameter(String encoding, String parameterName)
-    {
-        try
-        {
-            if (encoding != null && !Charset.isSupported(encoding))
-            {
-                throw new IllegalArgumentException(
-                    parameterName + "=" + encoding + " is not supported by the JVM.");
-            }
-        }
-        catch (IllegalCharsetNameException e)
-        {
-            throw new IllegalArgumentException(
-                parameterName + "=" + encoding + " is not a valid encoding.");
-        }
-    }
-
-    /**
-     * <p>
-     * This code is based on a class of the same name, originally implemented in alfresco-repository.
-     * </p>
-     *
-     * A version of {@link StringBean} which allows control of the
-     * encoding in the underlying HTML Parser.
-     * Unfortunately, StringBean doesn't allow easy over-riding of
-     * this, so we have to duplicate some code to control this.
-     * This allows us to correctly handle HTML files where the encoding
-     * is specified against the content property (rather than in the
-     * HTML Head Meta), see ALF-10466 for details.
-     */
-    public static class EncodingAwareStringBean extends StringBean
-    {
-        private static final long serialVersionUID = -9033414360428669553L;
-
-        /**
-         * Sets the File to extract strings from, and the encoding
-         * it's in (if known to Alfresco)
-         *
-         * @param file     The File that text should be fetched from.
-         * @param encoding The encoding of the input
-         */
-        public void setURL(File file, String encoding)
-        {
-            String previousURL = getURL();
-            String newURL = file.getAbsolutePath();
-
-            if (previousURL == null || !newURL.equals(previousURL))
-            {
-                try
-                {
-                    URLConnection conn = getConnection();
-
-                    if (null == mParser)
-                    {
-                        mParser = new Parser(newURL);
-                    }
-                    else
-                    {
-                        mParser.setURL(newURL);
-                    }
-
-                    if (encoding != null)
-                    {
-                        mParser.setEncoding(encoding);
-                    }
-
-                    mPropertySupport.firePropertyChange(StringBean.PROP_URL_PROPERTY, previousURL,
-                        getURL());
-                    mPropertySupport.firePropertyChange(StringBean.PROP_CONNECTION_PROPERTY, conn,
-                        mParser.getConnection());
-                    setStrings();
-                }
-                catch (ParserException pe)
-                {
-                    updateStrings(pe.toString());
-                }
-            }
-        }
-
-        public String getEncoding()
-        {
-            return mParser.getEncoding();
-        }
-    }
-}
+/*
+ * #%L
+ * Alfresco Transform Core
+ * %%
+ * Copyright (C) 2005 - 2025 Alfresco Software Limited
+ * %%
+ * This file is part of the Alfresco software.
+ * -
+ * If the software was purchased under a paid Alfresco license, the terms of
+ * the paid license agreement will prevail.  Otherwise, the software is
+ * provided under the following open source license terms:
+ * -
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * -
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ * -
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ * #L%
+ */
+package org.alfresco.transform.misc.transformers;
+
+import static org.alfresco.transform.common.RequestParamMap.HTML_COLLAPSE;
+import static org.alfresco.transform.common.RequestParamMap.SOURCE_ENCODING;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.net.URLConnection;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.util.Map;
+
+import org.htmlparser.Parser;
+import org.htmlparser.beans.StringBean;
+import org.htmlparser.util.ParserException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Component;
+
+import org.alfresco.transform.base.TransformManager;
+import org.alfresco.transform.base.util.CustomTransformerFileAdaptor;
+
+/**
+ * Content transformer which wraps the HTML Parser library for parsing HTML content.
+ *
+ * <p>
+ * This code is based on a class of the same name originally implemented in alfresco-repository.
+ * </p>
+ *
+ * <p>
+ * Since HTML Parser was updated from v1.6 to v2.1, META tags defining an encoding for the content via http-equiv=Content-Type will ONLY be respected if the encoding of the content item itself is set to ISO-8859-1.
+ * </p>
+ *
+ * <p>
+ * Tika Note - could be converted to use the Tika HTML parser, but we'd potentially need a custom text handler to replicate the current settings around links and non-breaking spaces.
+ * </p>
+ *
+ * @author Derek Hulley
+ * @author eknizat
+ * @see <a href="http://htmlparser.sourceforge.net/">http://htmlparser.sourceforge.net</a>
+ * @see org.htmlparser.beans.StringBean
+ * @see <a href="http://sourceforge.net/tracker/?func=detail&aid=1644504&group_id=24399&atid=381401">HTML Parser</a>
+ */
+@Component
+public class HtmlParserContentTransformer implements CustomTransformerFileAdaptor
+{
+    private static final Logger logger = LoggerFactory.getLogger(
+            HtmlParserContentTransformer.class);
+
+    @Value("${transform.core.misc.htmlOptions.collapseHtml:true}")
+    private String collapseOptionDefault;
+
+    @Override
+    public String getTransformerName()
+    {
+        return "html"; // fixed name of this transformer; NOTE(review): presumably must match the "transformerName" entry in misc_engine_config.json - confirm
+    }
+
+    @Override
+    public void transform(final String sourceMimetype, final String targetMimetype,
+            final Map<String, String> transformOptions,
+            final File sourceFile, final File targetFile, TransformManager transformManager) throws Exception
+    {
+        // Extracts the text of the HTML sourceFile into targetFile, honouring the SOURCE_ENCODING
+        // and HTML_COLLAPSE ("true"/"false", case-insensitive, surrounding whitespace ignored) options.
+        String sourceEncoding = transformOptions.get(SOURCE_ENCODING);
+        checkEncodingParameter(sourceEncoding, SOURCE_ENCODING);
+
+        // Trim once so the value that is validated is the same value that is parsed;
+        // Boolean.parseBoolean(" true ") would otherwise silently yield false.
+        final String rawCollapse = transformOptions.get(HTML_COLLAPSE);
+        final String collapseOption = rawCollapse == null ? null : rawCollapse.trim();
+        final boolean collapse;
+        if ("true".equalsIgnoreCase(collapseOption) || "false".equalsIgnoreCase(collapseOption))
+        {
+            collapse = Boolean.parseBoolean(collapseOption);
+        }
+        else
+        {
+            // Option absent or not a boolean: fall back to the configured default,
+            // which is itself treated as true when it is null.
+            collapse = collapseOptionDefault == null || Boolean.parseBoolean(collapseOptionDefault);
+            logger.debug("Using default html collapse option: {}", collapseOptionDefault);
+        }
+
+        logger.debug("Performing HTML to text transform with sourceEncoding={}", sourceEncoding);
+
+        // Create the extractor
+        EncodingAwareStringBean extractor = new EncodingAwareStringBean();
+        extractor.setCollapse(collapse);
+        extractor.setLinks(false);
+        extractor.setReplaceNonBreakingSpaces(false);
+        extractor.setURL(sourceFile, sourceEncoding);
+        // get the text
+        String text = extractor.getStrings();
+
+        // NOTE(review): no charset is passed to OutputStreamWriter, so the text is
+        // written in the platform default encoding - confirm that this is intended.
+        try (Writer writer = new BufferedWriter(
+                new OutputStreamWriter(new FileOutputStream(targetFile))))
+        {
+            writer.write(text);
+        }
+    }
+
+    // Throws IllegalArgumentException when encoding is non-null and not usable by the JVM; a null encoding passes the check.
+    private void checkEncodingParameter(String encoding, String parameterName)
+    {
+        try
+        {
+            if (encoding != null && !Charset.isSupported(encoding))
+            {
+                throw new IllegalArgumentException(parameterName + "=" + encoding + " is not supported by the JVM.");
+            }
+        }
+        catch (IllegalCharsetNameException e)
+        {
+            // Syntactically illegal charset name: keep the original exception as the cause.
+            throw new IllegalArgumentException(parameterName + "=" + encoding + " is not a valid encoding.", e);
+        }
+    }
+
+    /**
+     * <p>
+     * This code is based on a class of the same name, originally implemented in alfresco-repository.
+     * </p>
+     *
+     * A version of {@link StringBean} which allows control of the encoding in the underlying HTML Parser. Unfortunately, StringBean doesn't allow easy over-riding of this, so we have to duplicate some code to control this. This allows us to correctly handle HTML files where the encoding is specified against the content property (rather than in the HTML Head Meta), see ALF-10466 for details.
+     */
+    public static class EncodingAwareStringBean extends StringBean
+    {
+        private static final long serialVersionUID = -9033414360428669553L;
+
+        /**
+         * Points the underlying parser at the given file, optionally forces its encoding, fires the URL and connection property-change events and then calls setStrings(). Nothing happens when the file's absolute path equals the current URL. A ParserException is not rethrown; its text is passed to updateStrings() instead.
+         *
+         * @param file
+         *            The File that text should be fetched from.
+         * @param encoding
+         *            The encoding of the input, or null to leave the parser's encoding untouched
+         */
+        public void setURL(File file, String encoding)
+        {
+            String previousURL = getURL();
+            String newURL = file.getAbsolutePath();
+
+            if (previousURL == null || !newURL.equals(previousURL)) // skip all work when the path is unchanged
+            {
+                try
+                {
+                    URLConnection conn = getConnection(); // read before the parser is re-pointed; used as the "old" value of the connection event below
+
+                    if (null == mParser)
+                    {
+                        mParser = new Parser(newURL);
+                    }
+                    else
+                    {
+                        mParser.setURL(newURL);
+                    }
+
+                    if (encoding != null) // only override the parser's encoding when the caller supplied one
+                    {
+                        mParser.setEncoding(encoding);
+                    }
+
+                    mPropertySupport.firePropertyChange(StringBean.PROP_URL_PROPERTY, previousURL,
+                            getURL());
+                    mPropertySupport.firePropertyChange(StringBean.PROP_CONNECTION_PROPERTY, conn,
+                            mParser.getConnection());
+                    setStrings();
+                }
+                catch (ParserException pe)
+                {
+                    updateStrings(pe.toString()); // the exception text is handed to updateStrings rather than propagated to the caller
+                }
+            }
+        }
+
+        public String getEncoding()
+        {
+            return mParser.getEncoding(); // delegates to the underlying parser
+        }
+    }
+}
--- a/engines/misc/src/main/resources/application-default.yaml
+++ b/engines/misc/src/main/resources/application-default.yaml
@@ -4,4 +4,6 @@ transform:
  core:
    misc:
      pdfBox:
-        defaultFont: ${MISC_PDFBOX_DEFAULT_FONT:NotoSans-Regular}
+        defaultFont: ${MISC_PDFBOX_DEFAULT_FONT:NotoSans-Regular}
+      htmlOptions:
+        collapseHtml: ${MISC_HTML_COLLAPSE:true}
--- a/engines/misc/src/main/resources/misc_engine_config.json
+++ b/engines/misc/src/main/resources/misc_engine_config.json
@@ -1,5 +1,8 @@
 {
  "transformOptions": {
+    "htmlOptions": [
+      {"value": {"name": "collapseHtml"}}
+    ],
    "textToPdfOptions": [
      {"value": {"name": "pageLimit"}},
      {"value": {"name": "pdfFont"}},
@@ -24,8 +27,7 @@
      "supportedSourceAndTargetList": [
        {"sourceMediaType": "text/html",                                     "targetMediaType": "text/plain"}
      ],
-      "transformOptions": [
-       ]
+      "transformOptions": ["htmlOptions"]
    },
    {
      "transformerName": "string",
--- a/engines/misc/src/test/java/org/alfresco/transform/misc/MiscTest.java
+++ b/engines/misc/src/test/java/org/alfresco/transform/misc/MiscTest.java
--- a/engines/misc/src/test/java/org/alfresco/transform/misc/transformers/HtmlParserContentTransformerTest.java
+++ b/engines/misc/src/test/java/org/alfresco/transform/misc/transformers/HtmlParserContentTransformerTest.java
@@ -1,162 +1,300 @@
-/*
- * #%L
- * Alfresco Transform Core
- * %%
- * Copyright (C) 2005 - 2022 Alfresco Software Limited
- * %%
- * This file is part of the Alfresco software.
- * -
- * If the software was purchased under a paid Alfresco license, the terms of
- * the paid license agreement will prevail.  Otherwise, the software is
- * provided under the following open source license terms:
- * -
- * Alfresco is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- * -
- * Alfresco is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- * -
- * You should have received a copy of the GNU Lesser General Public License
- * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
- * #L%
- */
-package org.alfresco.transform.misc.transformers;
-
-import org.junit.jupiter.api.Test;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.OutputStreamWriter;
-import java.nio.file.Files;
-import java.util.HashMap;
-import java.util.Map;
-
-import static org.alfresco.transform.common.RequestParamMap.SOURCE_ENCODING;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-public class HtmlParserContentTransformerTest
-{
-    private static final String SOURCE_MIMETYPE = "text/html";
-    private static final String TARGET_MIMETYPE = "text/plain";
-
-    HtmlParserContentTransformer transformer = new HtmlParserContentTransformer();
-
-    /**
-     * Checks that we correctly handle text in different encodings,
-     * no matter if the encoding is specified on the Content Property
-     * or in a meta tag within the HTML itself. (ALF-10466)
-     *
-     * On Windows, org.htmlparser.beans.StringBean.carriageReturn() appends a new system dependent new line
-     * so we must be careful when checking the returned text
-     */
-    @Test
-    public void testEncodingHandling() throws Exception
-    {
-        final String NEWLINE = System.getProperty("line.separator");
-        final String TITLE = "Testing!";
-        final String TEXT_P1 = "This is some text in English";
-        final String TEXT_P2 = "This is more text in English";
-        final String TEXT_P3 = "C'est en Fran\u00e7ais et Espa\u00f1ol";
-        String partA = "<html><head><title>" + TITLE + "</title></head>" + NEWLINE;
-        String partB = "<body><p>" + TEXT_P1 + "</p>" + NEWLINE +
-                       "<p>" + TEXT_P2 + "</p>" + NEWLINE +
-                       "<p>" + TEXT_P3 + "</p>" + NEWLINE;
-        String partC = "</body></html>";
-        final String expected = TITLE + NEWLINE + TEXT_P1 + NEWLINE + TEXT_P2 + NEWLINE + TEXT_P3 + NEWLINE;
-
-        File tmpS = null;
-        File tmpD = null;
-
-        try
-        {
-            // Content set to ISO 8859-1
-            tmpS = File.createTempFile("AlfrescoTestSource_", ".html");
-            writeToFile(tmpS, partA + partB + partC, "ISO-8859-1");
-
-            tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
-
-            Map<String, String> parameters = new HashMap<>();
-            parameters.put(SOURCE_ENCODING, "ISO-8859-1");
-            transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD, null);
-
-            assertEquals(expected, readFromFile(tmpD, "UTF-8"));
-            tmpS.delete();
-            tmpD.delete();
-
-            // Content set to UTF-8
-            tmpS = File.createTempFile("AlfrescoTestSource_", ".html");
-            writeToFile(tmpS, partA + partB + partC, "UTF-8");
-
-            tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
-            parameters = new HashMap<>();
-            parameters.put(SOURCE_ENCODING, "UTF-8");
-            transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD, null);
-            assertEquals(expected, readFromFile(tmpD, "UTF-8"));
-            tmpS.delete();
-            tmpD.delete();
-
-            // Content set to UTF-16
-            tmpS = File.createTempFile("AlfrescoTestSource_", ".html");
-            writeToFile(tmpS, partA + partB + partC, "UTF-16");
-
-            tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
-            parameters = new HashMap<>();
-            parameters.put(SOURCE_ENCODING, "UTF-16");
-            transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD, null);
-            assertEquals(expected, readFromFile(tmpD, "UTF-8"));
-            tmpS.delete();
-            tmpD.delete();
-
-            // Note - since HTML Parser 2.0 META tags specifying the
-            // document encoding will ONLY be respected if the original
-            // content type was set to ISO-8859-1.
-            //
-            // This means there is now only one test which we can perform
-            // to ensure that this now-limited overriding of the encoding
-            // takes effect.
-
-            // Content set to ISO 8859-1, meta set to UTF-8
-            tmpS = File.createTempFile("AlfrescoTestSource_", ".html");
-            String str = partA +
-                         "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">" +
-                         partB + partC;
-
-            writeToFile(tmpS, str, "UTF-8");
-
-            tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
-
-            parameters = new HashMap<>();
-            parameters.put(SOURCE_ENCODING, "ISO-8859-1");
-            transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD, null);
-            assertEquals(expected, readFromFile(tmpD, "UTF-8"));
-            tmpS.delete();
-            tmpD.delete();
-
-            // Note - we can't test UTF-16 with only a meta encoding,
-            //  because without that the parser won't know about the
-            //  2 byte format so won't be able to identify the meta tag
-        }
-        finally
-        {
-            if (tmpS != null && tmpS.exists()) tmpS.delete();
-            if (tmpD != null && tmpD.exists()) tmpD.delete();
-        }
-    }
-
-    private void writeToFile(File file, String content, String encoding) throws Exception
-    {
-        try (OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream(file), encoding))
-        {
-            ow.append(content);
-        }
-    }
-
-    private String readFromFile(File file, final String encoding) throws Exception
-    {
-        return new String(Files.readAllBytes(file.toPath()), encoding);
-    }
-}
+/*
+ * #%L
+ * Alfresco Transform Core
+ * %%
+ * Copyright (C) 2005 - 2025 Alfresco Software Limited
+ * %%
+ * This file is part of the Alfresco software.
+ * -
+ * If the software was purchased under a paid Alfresco license, the terms of
+ * the paid license agreement will prevail.  Otherwise, the software is
+ * provided under the following open source license terms:
+ * -
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * -
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ * -
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ * #L%
+ */
+package org.alfresco.transform.misc.transformers;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.fail;
+
+import static org.alfresco.transform.common.RequestParamMap.HTML_COLLAPSE;
+import static org.alfresco.transform.common.RequestParamMap.SOURCE_ENCODING;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.nio.file.Files;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+
+public class HtmlParserContentTransformerTest
+{
+    private static final String SOURCE_MIMETYPE = "text/html";
+    private static final String TARGET_MIMETYPE = "text/plain";
+
+    /**
+     * Checks that we correctly handle text in different encodings, no matter if the encoding is specified on the Content Property or in a meta tag within the HTML itself (ALF-10466). Every scenario runs with HTML_COLLAPSE set to true and expects the same collapsed text.
+     *
+     * On Windows, org.htmlparser.beans.StringBean.carriageReturn() appends a system-dependent new line, so the expected text is built from the "line.separator" system property.
+     */
+    @Test
+    public void testEncodingHandling() throws Exception
+    {
+        final HtmlParserContentTransformer transformer = new HtmlParserContentTransformer();
+        final String newline = System.getProperty("line.separator");
+        final String title = "Testing!";
+        final String textp1 = "This is some text in English";
+        final String textp2 = "This is more text in English";
+        final String textp3 = "C'est en Fran\u00e7ais et Espa\u00f1ol";
+        String partA = "<html><head><title>" + title + "</title></head>" + newline;
+        String partB = "<body><p>" + textp1 + "</p>" + newline +
+                "<p>" + textp2 + "</p>" + newline +
+                "<p>" + textp3 + "</p>" + newline;
+        String partC = "</body></html>";
+        final String expected = title + newline + textp1 + newline + textp2 + newline + textp3;
+
+        File tmpS = null;
+        File tmpD = null;
+
+        try
+        {
+            // Content set to ISO 8859-1
+            tmpS = File.createTempFile("AlfrescoTestSource_", ".html");
+            writeToFile(tmpS, partA + partB + partC, "ISO-8859-1");
+
+            tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
+
+            Map<String, String> parameters = new HashMap<>();
+            parameters.put(SOURCE_ENCODING, "ISO-8859-1");
+            parameters.put(HTML_COLLAPSE, String.valueOf(true));
+            transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD, null);
+
+            assertEquals(expected, readFromFile(tmpD, "UTF-8"));
+            tmpS.delete();
+            tmpD.delete();
+
+            // Content set to UTF-8
+            tmpS = File.createTempFile("AlfrescoTestSource_", ".html");
+            writeToFile(tmpS, partA + partB + partC, "UTF-8");
+
+            tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
+            parameters = new HashMap<>();
+            parameters.put(SOURCE_ENCODING, "UTF-8");
+            parameters.put(HTML_COLLAPSE, String.valueOf(true));
+            transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD, null);
+            assertEquals(expected, readFromFile(tmpD, "UTF-8"));
+            tmpS.delete();
+            tmpD.delete();
+
+            // Content set to UTF-16
+            tmpS = File.createTempFile("AlfrescoTestSource_", ".html");
+            writeToFile(tmpS, partA + partB + partC, "UTF-16");
+
+            tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
+            parameters = new HashMap<>();
+            parameters.put(HTML_COLLAPSE, String.valueOf(true));
+            parameters.put(SOURCE_ENCODING, "UTF-16");
+            transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD, null);
+            assertEquals(expected, readFromFile(tmpD, "UTF-8"));
+            tmpS.delete();
+            tmpD.delete();
+
+            // Note - since HTML Parser 2.0 META tags specifying the
+            // document encoding will ONLY be respected if the original
+            // content type was set to ISO-8859-1.
+            //
+            // This means there is now only one test which we can perform
+            // to ensure that this now-limited overriding of the encoding
+            // takes effect.
+
+            // Content set to ISO 8859-1, meta set to UTF-8
+            tmpS = File.createTempFile("AlfrescoTestSource_", ".html");
+            String str = partA +
+                    "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">" +
+                    partB + partC;
+
+            writeToFile(tmpS, str, "UTF-8");
+
+            tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
+
+            parameters = new HashMap<>();
+            parameters.put(SOURCE_ENCODING, "ISO-8859-1");
+            parameters.put(HTML_COLLAPSE, String.valueOf(true));
+            transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD, null);
+            assertEquals(expected, readFromFile(tmpD, "UTF-8"));
+            tmpS.delete();
+            tmpD.delete();
+
+            // Note - we can't test UTF-16 with only a meta encoding,
+            // because without that the parser won't know about the
+            // 2 byte format so won't be able to identify the meta tag
+        }
+        catch (Exception e)
+        {
+            fail("Test Failed: " + e.getMessage(), e); // pass the exception as the cause so its stack trace is reported
+        }
+        finally
+        {
+            if (tmpS != null && tmpS.exists())
+            {
+                tmpS.delete();
+            }
+            if (tmpD != null && tmpD.exists())
+            {
+                tmpD.delete();
+            }
+        }
+    }
+
+    /**
+     * Verifies that the {@code HTML_COLLAPSE} option controls the trailing line separator of the extracted text.
+     * <p>
+     * The same three-paragraph UTF-8 HTML document is transformed once per parameter value. With collapsing enabled the output is expected to end directly after the last paragraph; with collapsing disabled the output is expected to end with one additional line separator.
+     *
+     * @param shouldCollapse
+     *            value passed to the transformer as the {@code HTML_COLLAPSE} option
+     */
+    @ParameterizedTest
+    @ValueSource(booleans = {true, false})
+    public void testTransformerWithDifferentCollapsingMethods(boolean shouldCollapse)
+    {
+        final HtmlParserContentTransformer transformer = new HtmlParserContentTransformer();
+
+        final String newline = System.lineSeparator();
+        final String title = "Testing!";
+        final String textp1 = "This is some text in English";
+        final String textp2 = "This is more text in English";
+        final String textp3 = "C'est en Fran\u00e7ais et Espa\u00f1ol";
+        final String partA = "<html><head><title>" + title + "</title></head>" + newline;
+        final String partB = "<body><p>" + textp1 + "</p>" + newline +
+                "<p>" + textp2 + "</p>" + newline +
+                "<p>" + textp3 + "</p>" + newline;
+        final String partC = "</body></html>";
+        // A trailing line separator is expected only when collapsing is disabled
+        final String expected = title + newline + textp1 + newline + textp2 + newline + textp3
+                + (shouldCollapse ? "" : newline);
+
+        File tmpS = null;
+        File tmpD = null;
+
+        try
+        {
+            tmpS = File.createTempFile("AlfrescoTestSource_", ".html");
+            writeToFile(tmpS, partA + partB + partC, "UTF-8");
+
+            tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
+            Map<String, String> parameters = new HashMap<>();
+            parameters.put(SOURCE_ENCODING, "UTF-8");
+            parameters.put(HTML_COLLAPSE, String.valueOf(shouldCollapse));
+            transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD, null);
+            assertEquals(expected, readFromFile(tmpD, "UTF-8"));
+        }
+        catch (Exception e)
+        {
+            // Pass the exception as the cause so its type and stack trace appear in the test report
+            fail("Test Failed: " + e.getMessage(), e);
+        }
+        finally
+        {
+            // Single clean-up point: runs on success, on assertion failure and on exception
+            if (tmpS != null && tmpS.exists())
+            {
+                tmpS.delete();
+            }
+            if (tmpD != null && tmpD.exists())
+            {
+                tmpD.delete();
+            }
+        }
+    }
+
+    /**
+     * Verifies that values which are not valid booleans are tolerated for the {@code HTML_COLLAPSE} option.
+     * <p>
+     * The transformation must not throw, and the transformer should fall back to its default collapsing behaviour: the output is expected to end directly after the last paragraph, without a trailing line separator.
+     *
+     * @param booleanValues
+     *            the invalid text passed to the transformer as the {@code HTML_COLLAPSE} option
+     */
+    @ParameterizedTest
+    @ValueSource(strings = {"cat", "dog", "", "1234abcd", "@#$%"})
+    public void testTransformerWithWrongBooleanValues(String booleanValues)
+    {
+        final HtmlParserContentTransformer transformer = new HtmlParserContentTransformer();
+
+        final String newline = System.lineSeparator();
+        final String title = "Testing!";
+        final String textp1 = "This is some text in English";
+        final String textp2 = "This is more text in English";
+        final String textp3 = "C'est en Fran\u00e7ais et Espa\u00f1ol";
+        final String partA = "<html><head><title>" + title + "</title></head>" + newline;
+        final String partB = "<body><p>" + textp1 + "</p>" + newline +
+                "<p>" + textp2 + "</p>" + newline +
+                "<p>" + textp3 + "</p>" + newline;
+        final String partC = "</body></html>";
+        final String expected = title + newline + textp1 + newline + textp2 + newline + textp3;
+
+        File tmpS = null;
+        File tmpD = null;
+
+        try
+        {
+            tmpS = File.createTempFile("AlfrescoTestSource_", ".html");
+            writeToFile(tmpS, partA + partB + partC, "UTF-8");
+
+            tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
+            Map<String, String> parameters = new HashMap<>();
+            parameters.put(SOURCE_ENCODING, "UTF-8");
+            parameters.put(HTML_COLLAPSE, booleanValues);
+            transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD, null);
+            assertEquals(expected, readFromFile(tmpD, "UTF-8"));
+        }
+        catch (Exception e)
+        {
+            // Pass the exception as the cause so its type and stack trace appear in the test report
+            fail("Test Failed: " + e.getMessage(), e);
+        }
+        finally
+        {
+            // Single clean-up point: runs on success, on assertion failure and on exception
+            if (tmpS != null && tmpS.exists())
+            {
+                tmpS.delete();
+            }
+            if (tmpD != null && tmpD.exists())
+            {
+                tmpD.delete();
+            }
+        }
+    }
+
+    /**
+     * Writes {@code content} to {@code file} using the given character encoding, failing the calling test if the write does not succeed.
+     *
+     * @param file
+     *            the file to write; any existing content is overwritten
+     * @param content
+     *            the text to write
+     * @param encoding
+     *            the charset name used to encode {@code content}
+     */
+    private void writeToFile(File file, String content, String encoding)
+    {
+        // The stream is its own resource so it is closed even if the writer cannot be created
+        // (for example when the encoding name is not supported)
+        try (FileOutputStream out = new FileOutputStream(file);
+                OutputStreamWriter ow = new OutputStreamWriter(out, encoding))
+        {
+            ow.append(content);
+        }
+        catch (Exception e)
+        {
+            // Pass the exception as the cause so its type and stack trace appear in the test report
+            fail("Failed to write to file: " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Reads the whole of {@code file} and decodes it with the given character encoding, failing the calling test if the read does not succeed.
+     *
+     * @param file
+     *            the file to read
+     * @param encoding
+     *            the charset name used to decode the file's bytes
+     * @return the decoded file content
+     */
+    private String readFromFile(File file, final String encoding)
+    {
+        try
+        {
+            return new String(Files.readAllBytes(file.toPath()), encoding);
+        }
+        catch (Exception e)
+        {
+            // Pass the exception as the cause so its type and stack trace appear in the test report
+            fail("Failed to read from file: " + e.getMessage(), e);
+            return null; // Never reached at run time because fail(...) throws; required by the compiler
+        }
+    }
+}
--- a/engines/misc/src/test/resources/misc_engine_config.json
+++ b/engines/misc/src/test/resources/misc_engine_config.json
@@ -1,5 +1,8 @@
 {
  "transformOptions": {
+    "htmlOptions": [
+      {"value": {"name": "collapseHtml"}}
+    ],
    "textToPdfOptions": [
      {"value": {"name": "pageLimit"}}
    ],
@@ -17,6 +20,7 @@
        {"sourceMediaType": "text/html",                                     "targetMediaType": "text/plain"}
      ],
      "transformOptions": [
+        "htmlOptions"
      ]
    },
    {
--- a/engines/misc/src/test/resources/quick3.txt
+++ b/engines/misc/src/test/resources/quick3.txt
@@ -0,0 +1,2 @@
+The quick brown fox jumps over the lazy dog
+The quick brown fox jumps over the lazy dog