[MNT-25089] file encoding changed

This commit is contained in:
KushalBanik
2025-05-21 15:50:55 +05:30
parent a4332c0328
commit 3c7094b2d7

View File

@@ -1,203 +1,203 @@
/* /*
* #%L * #%L
* Alfresco Transform Core * Alfresco Transform Core
* %% * %%
* Copyright (C) 2005 - 2022 Alfresco Software Limited * Copyright (C) 2005 - 2022 Alfresco Software Limited
* %% * %%
* This file is part of the Alfresco software. * This file is part of the Alfresco software.
* - * -
* If the software was purchased under a paid Alfresco license, the terms of * If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is * the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms: * provided under the following open source license terms:
* - * -
* Alfresco is free software: you can redistribute it and/or modify * Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by * it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version. * (at your option) any later version.
* - * -
* Alfresco is distributed in the hope that it will be useful, * Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details. * GNU Lesser General Public License for more details.
* - * -
* You should have received a copy of the GNU Lesser General Public License * You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>. * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L% * #L%
*/ */
package org.alfresco.transform.misc.transformers; package org.alfresco.transform.misc.transformers;
import org.alfresco.transform.base.TransformManager; import org.alfresco.transform.base.TransformManager;
import org.alfresco.transform.base.util.CustomTransformerFileAdaptor; import org.alfresco.transform.base.util.CustomTransformerFileAdaptor;
import org.htmlparser.Parser; import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean; import org.htmlparser.beans.StringBean;
import org.htmlparser.util.ParserException; import org.htmlparser.util.ParserException;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import java.io.BufferedWriter; import java.io.BufferedWriter;
import java.io.File; import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.OutputStreamWriter; import java.io.OutputStreamWriter;
import java.io.Writer; import java.io.Writer;
import java.net.URLConnection; import java.net.URLConnection;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.IllegalCharsetNameException;
import java.util.Map; import java.util.Map;
import static org.alfresco.transform.common.RequestParamMap.SOURCE_ENCODING; import static org.alfresco.transform.common.RequestParamMap.SOURCE_ENCODING;
/** /**
* Content transformer which wraps the HTML Parser library for * Content transformer which wraps the HTML Parser library for
* parsing HTML content. * parsing HTML content.
* *
* <p> * <p>
* This code is based on a class of the same name originally implemented in alfresco-repository. * This code is based on a class of the same name originally implemented in alfresco-repository.
* </p> * </p>
* *
* <p> * <p>
* Since HTML Parser was updated from v1.6 to v2.1, META tags * Since HTML Parser was updated from v1.6 to v2.1, META tags
* defining an encoding for the content via http-equiv=Content-Type * defining an encoding for the content via http-equiv=Content-Type
* will ONLY be respected if the encoding of the content item * will ONLY be respected if the encoding of the content item
* itself is set to ISO-8859-1. * itself is set to ISO-8859-1.
* </p> * </p>
* *
* <p> * <p>
* Tika Note - could be converted to use the Tika HTML parser, * Tika Note - could be converted to use the Tika HTML parser,
* but we'd potentially need a custom text handler to replicate * but we'd potentially need a custom text handler to replicate
* the current settings around links and non-breaking spaces. * the current settings around links and non-breaking spaces.
* </p> * </p>
* *
* @author Derek Hulley * @author Derek Hulley
* @author eknizat * @author eknizat
* @see <a href="http://htmlparser.sourceforge.net/">http://htmlparser.sourceforge.net</a> * @see <a href="http://htmlparser.sourceforge.net/">http://htmlparser.sourceforge.net</a>
* @see org.htmlparser.beans.StringBean * @see org.htmlparser.beans.StringBean
* @see <a href="http://sourceforge.net/tracker/?func=detail&aid=1644504&group_id=24399&atid=381401">HTML Parser</a> * @see <a href="http://sourceforge.net/tracker/?func=detail&aid=1644504&group_id=24399&atid=381401">HTML Parser</a>
*/ */
@Component @Component
public class HtmlParserContentTransformer implements CustomTransformerFileAdaptor public class HtmlParserContentTransformer implements CustomTransformerFileAdaptor
{ {
private static final Logger logger = LoggerFactory.getLogger( private static final Logger logger = LoggerFactory.getLogger(
HtmlParserContentTransformer.class); HtmlParserContentTransformer.class);
@Override @Override
public String getTransformerName() public String getTransformerName()
{ {
return "html"; return "html";
} }
@Override @Override
public void transform(final String sourceMimetype, final String targetMimetype, public void transform(final String sourceMimetype, final String targetMimetype,
final Map<String, String> transformOptions, final Map<String, String> transformOptions,
final File sourceFile, final File targetFile, TransformManager transformManager) throws Exception final File sourceFile, final File targetFile, TransformManager transformManager) throws Exception
{ {
String sourceEncoding = transformOptions.get(SOURCE_ENCODING); String sourceEncoding = transformOptions.get(SOURCE_ENCODING);
checkEncodingParameter(sourceEncoding, SOURCE_ENCODING); checkEncodingParameter(sourceEncoding, SOURCE_ENCODING);
if (logger.isDebugEnabled()) if (logger.isDebugEnabled())
{ {
logger.debug("Performing HTML to text transform with sourceEncoding=" + sourceEncoding); logger.debug("Performing HTML to text transform with sourceEncoding=" + sourceEncoding);
} }
// Create the extractor // Create the extractor
EncodingAwareStringBean extractor = new EncodingAwareStringBean(); EncodingAwareStringBean extractor = new EncodingAwareStringBean();
extractor.setCollapse(true); extractor.setCollapse(true);
extractor.setLinks(false); extractor.setLinks(false);
extractor.setReplaceNonBreakingSpaces(false); extractor.setReplaceNonBreakingSpaces(false);
extractor.setURL(sourceFile, sourceEncoding); extractor.setURL(sourceFile, sourceEncoding);
// get the text // get the text
String text = extractor.getStrings(); String text = extractor.getStrings();
// write it to the writer // write it to the writer
try (Writer writer = new BufferedWriter( try (Writer writer = new BufferedWriter(
new OutputStreamWriter(new FileOutputStream(targetFile)))) new OutputStreamWriter(new FileOutputStream(targetFile))))
{ {
writer.write(text); writer.write(text);
} }
} }
private void checkEncodingParameter(String encoding, String parameterName) private void checkEncodingParameter(String encoding, String parameterName)
{ {
try try
{ {
if (encoding != null && !Charset.isSupported(encoding)) if (encoding != null && !Charset.isSupported(encoding))
{ {
throw new IllegalArgumentException( throw new IllegalArgumentException(
parameterName + "=" + encoding + " is not supported by the JVM."); parameterName + "=" + encoding + " is not supported by the JVM.");
} }
} }
catch (IllegalCharsetNameException e) catch (IllegalCharsetNameException e)
{ {
throw new IllegalArgumentException( throw new IllegalArgumentException(
parameterName + "=" + encoding + " is not a valid encoding."); parameterName + "=" + encoding + " is not a valid encoding.");
} }
} }
/** /**
* <p> * <p>
* This code is based on a class of the same name, originally implemented in alfresco-repository. * This code is based on a class of the same name, originally implemented in alfresco-repository.
* </p> * </p>
* *
* A version of {@link StringBean} which allows control of the * A version of {@link StringBean} which allows control of the
* encoding in the underlying HTML Parser. * encoding in the underlying HTML Parser.
* Unfortunately, StringBean doesn't allow easy over-riding of * Unfortunately, StringBean doesn't allow easy over-riding of
* this, so we have to duplicate some code to control this. * this, so we have to duplicate some code to control this.
* This allows us to correctly handle HTML files where the encoding * This allows us to correctly handle HTML files where the encoding
* is specified against the content property (rather than in the * is specified against the content property (rather than in the
* HTML Head Meta), see ALF-10466 for details. * HTML Head Meta), see ALF-10466 for details.
*/ */
public static class EncodingAwareStringBean extends StringBean public static class EncodingAwareStringBean extends StringBean
{ {
private static final long serialVersionUID = -9033414360428669553L; private static final long serialVersionUID = -9033414360428669553L;
/** /**
* Sets the File to extract strings from, and the encoding * Sets the File to extract strings from, and the encoding
* it's in (if known to Alfresco) * it's in (if known to Alfresco)
* *
* @param file The File that text should be fetched from. * @param file The File that text should be fetched from.
* @param encoding The encoding of the input * @param encoding The encoding of the input
*/ */
public void setURL(File file, String encoding) public void setURL(File file, String encoding)
{ {
String previousURL = getURL(); String previousURL = getURL();
String newURL = file.getAbsolutePath(); String newURL = file.getAbsolutePath();
if (previousURL == null || !newURL.equals(previousURL)) if (previousURL == null || !newURL.equals(previousURL))
{ {
try try
{ {
URLConnection conn = getConnection(); URLConnection conn = getConnection();
if (null == mParser) if (null == mParser)
{ {
mParser = new Parser(newURL); mParser = new Parser(newURL);
} }
else else
{ {
mParser.setURL(newURL); mParser.setURL(newURL);
} }
if (encoding != null) if (encoding != null)
{ {
mParser.setEncoding(encoding); mParser.setEncoding(encoding);
} }
mPropertySupport.firePropertyChange(StringBean.PROP_URL_PROPERTY, previousURL, mPropertySupport.firePropertyChange(StringBean.PROP_URL_PROPERTY, previousURL,
getURL()); getURL());
mPropertySupport.firePropertyChange(StringBean.PROP_CONNECTION_PROPERTY, conn, mPropertySupport.firePropertyChange(StringBean.PROP_CONNECTION_PROPERTY, conn,
mParser.getConnection()); mParser.getConnection());
setStrings(); setStrings();
} }
catch (ParserException pe) catch (ParserException pe)
{ {
updateStrings(pe.toString()); updateStrings(pe.toString());
} }
} }
} }
public String getEncoding() public String getEncoding()
{ {
return mParser.getEncoding(); return mParser.getEncoding();
} }
} }
} }