mirror of
https://github.com/Alfresco/alfresco-transform-core.git
synced 2025-10-01 14:41:17 +00:00
[MNT-25089] file encoding changed
This commit is contained in:
@@ -1,203 +1,203 @@
|
|||||||
/*
|
/*
|
||||||
* #%L
|
* #%L
|
||||||
* Alfresco Transform Core
|
* Alfresco Transform Core
|
||||||
* %%
|
* %%
|
||||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||||
* %%
|
* %%
|
||||||
* This file is part of the Alfresco software.
|
* This file is part of the Alfresco software.
|
||||||
* -
|
* -
|
||||||
* If the software was purchased under a paid Alfresco license, the terms of
|
* If the software was purchased under a paid Alfresco license, the terms of
|
||||||
* the paid license agreement will prevail. Otherwise, the software is
|
* the paid license agreement will prevail. Otherwise, the software is
|
||||||
* provided under the following open source license terms:
|
* provided under the following open source license terms:
|
||||||
* -
|
* -
|
||||||
* Alfresco is free software: you can redistribute it and/or modify
|
* Alfresco is free software: you can redistribute it and/or modify
|
||||||
* it under the terms of the GNU Lesser General Public License as published by
|
* it under the terms of the GNU Lesser General Public License as published by
|
||||||
* the Free Software Foundation, either version 3 of the License, or
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
* (at your option) any later version.
|
* (at your option) any later version.
|
||||||
* -
|
* -
|
||||||
* Alfresco is distributed in the hope that it will be useful,
|
* Alfresco is distributed in the hope that it will be useful,
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
* GNU Lesser General Public License for more details.
|
* GNU Lesser General Public License for more details.
|
||||||
* -
|
* -
|
||||||
* You should have received a copy of the GNU Lesser General Public License
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||||
* #L%
|
* #L%
|
||||||
*/
|
*/
|
||||||
package org.alfresco.transform.misc.transformers;
|
package org.alfresco.transform.misc.transformers;
|
||||||
|
|
||||||
import org.alfresco.transform.base.TransformManager;
|
import org.alfresco.transform.base.TransformManager;
|
||||||
import org.alfresco.transform.base.util.CustomTransformerFileAdaptor;
|
import org.alfresco.transform.base.util.CustomTransformerFileAdaptor;
|
||||||
import org.htmlparser.Parser;
|
import org.htmlparser.Parser;
|
||||||
import org.htmlparser.beans.StringBean;
|
import org.htmlparser.beans.StringBean;
|
||||||
import org.htmlparser.util.ParserException;
|
import org.htmlparser.util.ParserException;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
import java.io.BufferedWriter;
|
import java.io.BufferedWriter;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.io.OutputStreamWriter;
|
import java.io.OutputStreamWriter;
|
||||||
import java.io.Writer;
|
import java.io.Writer;
|
||||||
import java.net.URLConnection;
|
import java.net.URLConnection;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.nio.charset.IllegalCharsetNameException;
|
import java.nio.charset.IllegalCharsetNameException;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import static org.alfresco.transform.common.RequestParamMap.SOURCE_ENCODING;
|
import static org.alfresco.transform.common.RequestParamMap.SOURCE_ENCODING;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Content transformer which wraps the HTML Parser library for
|
* Content transformer which wraps the HTML Parser library for
|
||||||
* parsing HTML content.
|
* parsing HTML content.
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* This code is based on a class of the same name originally implemented in alfresco-repository.
|
* This code is based on a class of the same name originally implemented in alfresco-repository.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* Since HTML Parser was updated from v1.6 to v2.1, META tags
|
* Since HTML Parser was updated from v1.6 to v2.1, META tags
|
||||||
* defining an encoding for the content via http-equiv=Content-Type
|
* defining an encoding for the content via http-equiv=Content-Type
|
||||||
* will ONLY be respected if the encoding of the content item
|
* will ONLY be respected if the encoding of the content item
|
||||||
* itself is set to ISO-8859-1.
|
* itself is set to ISO-8859-1.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* Tika Note - could be converted to use the Tika HTML parser,
|
* Tika Note - could be converted to use the Tika HTML parser,
|
||||||
* but we'd potentially need a custom text handler to replicate
|
* but we'd potentially need a custom text handler to replicate
|
||||||
* the current settings around links and non-breaking spaces.
|
* the current settings around links and non-breaking spaces.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* @author Derek Hulley
|
* @author Derek Hulley
|
||||||
* @author eknizat
|
* @author eknizat
|
||||||
* @see <a href="http://htmlparser.sourceforge.net/">http://htmlparser.sourceforge.net</a>
|
* @see <a href="http://htmlparser.sourceforge.net/">http://htmlparser.sourceforge.net</a>
|
||||||
* @see org.htmlparser.beans.StringBean
|
* @see org.htmlparser.beans.StringBean
|
||||||
* @see <a href="http://sourceforge.net/tracker/?func=detail&aid=1644504&group_id=24399&atid=381401">HTML Parser</a>
|
* @see <a href="http://sourceforge.net/tracker/?func=detail&aid=1644504&group_id=24399&atid=381401">HTML Parser</a>
|
||||||
*/
|
*/
|
||||||
@Component
|
@Component
|
||||||
public class HtmlParserContentTransformer implements CustomTransformerFileAdaptor
|
public class HtmlParserContentTransformer implements CustomTransformerFileAdaptor
|
||||||
{
|
{
|
||||||
private static final Logger logger = LoggerFactory.getLogger(
|
private static final Logger logger = LoggerFactory.getLogger(
|
||||||
HtmlParserContentTransformer.class);
|
HtmlParserContentTransformer.class);
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getTransformerName()
|
public String getTransformerName()
|
||||||
{
|
{
|
||||||
return "html";
|
return "html";
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void transform(final String sourceMimetype, final String targetMimetype,
|
public void transform(final String sourceMimetype, final String targetMimetype,
|
||||||
final Map<String, String> transformOptions,
|
final Map<String, String> transformOptions,
|
||||||
final File sourceFile, final File targetFile, TransformManager transformManager) throws Exception
|
final File sourceFile, final File targetFile, TransformManager transformManager) throws Exception
|
||||||
{
|
{
|
||||||
String sourceEncoding = transformOptions.get(SOURCE_ENCODING);
|
String sourceEncoding = transformOptions.get(SOURCE_ENCODING);
|
||||||
checkEncodingParameter(sourceEncoding, SOURCE_ENCODING);
|
checkEncodingParameter(sourceEncoding, SOURCE_ENCODING);
|
||||||
|
|
||||||
if (logger.isDebugEnabled())
|
if (logger.isDebugEnabled())
|
||||||
{
|
{
|
||||||
logger.debug("Performing HTML to text transform with sourceEncoding=" + sourceEncoding);
|
logger.debug("Performing HTML to text transform with sourceEncoding=" + sourceEncoding);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create the extractor
|
// Create the extractor
|
||||||
EncodingAwareStringBean extractor = new EncodingAwareStringBean();
|
EncodingAwareStringBean extractor = new EncodingAwareStringBean();
|
||||||
extractor.setCollapse(true);
|
extractor.setCollapse(true);
|
||||||
extractor.setLinks(false);
|
extractor.setLinks(false);
|
||||||
extractor.setReplaceNonBreakingSpaces(false);
|
extractor.setReplaceNonBreakingSpaces(false);
|
||||||
extractor.setURL(sourceFile, sourceEncoding);
|
extractor.setURL(sourceFile, sourceEncoding);
|
||||||
// get the text
|
// get the text
|
||||||
String text = extractor.getStrings();
|
String text = extractor.getStrings();
|
||||||
|
|
||||||
// write it to the writer
|
// write it to the writer
|
||||||
try (Writer writer = new BufferedWriter(
|
try (Writer writer = new BufferedWriter(
|
||||||
new OutputStreamWriter(new FileOutputStream(targetFile))))
|
new OutputStreamWriter(new FileOutputStream(targetFile))))
|
||||||
{
|
{
|
||||||
writer.write(text);
|
writer.write(text);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void checkEncodingParameter(String encoding, String parameterName)
|
private void checkEncodingParameter(String encoding, String parameterName)
|
||||||
{
|
{
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
if (encoding != null && !Charset.isSupported(encoding))
|
if (encoding != null && !Charset.isSupported(encoding))
|
||||||
{
|
{
|
||||||
throw new IllegalArgumentException(
|
throw new IllegalArgumentException(
|
||||||
parameterName + "=" + encoding + " is not supported by the JVM.");
|
parameterName + "=" + encoding + " is not supported by the JVM.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (IllegalCharsetNameException e)
|
catch (IllegalCharsetNameException e)
|
||||||
{
|
{
|
||||||
throw new IllegalArgumentException(
|
throw new IllegalArgumentException(
|
||||||
parameterName + "=" + encoding + " is not a valid encoding.");
|
parameterName + "=" + encoding + " is not a valid encoding.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>
|
* <p>
|
||||||
* This code is based on a class of the same name, originally implemented in alfresco-repository.
|
* This code is based on a class of the same name, originally implemented in alfresco-repository.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* A version of {@link StringBean} which allows control of the
|
* A version of {@link StringBean} which allows control of the
|
||||||
* encoding in the underlying HTML Parser.
|
* encoding in the underlying HTML Parser.
|
||||||
* Unfortunately, StringBean doesn't allow easy over-riding of
|
* Unfortunately, StringBean doesn't allow easy over-riding of
|
||||||
* this, so we have to duplicate some code to control this.
|
* this, so we have to duplicate some code to control this.
|
||||||
* This allows us to correctly handle HTML files where the encoding
|
* This allows us to correctly handle HTML files where the encoding
|
||||||
* is specified against the content property (rather than in the
|
* is specified against the content property (rather than in the
|
||||||
* HTML Head Meta), see ALF-10466 for details.
|
* HTML Head Meta), see ALF-10466 for details.
|
||||||
*/
|
*/
|
||||||
public static class EncodingAwareStringBean extends StringBean
|
public static class EncodingAwareStringBean extends StringBean
|
||||||
{
|
{
|
||||||
private static final long serialVersionUID = -9033414360428669553L;
|
private static final long serialVersionUID = -9033414360428669553L;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sets the File to extract strings from, and the encoding
|
* Sets the File to extract strings from, and the encoding
|
||||||
* it's in (if known to Alfresco)
|
* it's in (if known to Alfresco)
|
||||||
*
|
*
|
||||||
* @param file The File that text should be fetched from.
|
* @param file The File that text should be fetched from.
|
||||||
* @param encoding The encoding of the input
|
* @param encoding The encoding of the input
|
||||||
*/
|
*/
|
||||||
public void setURL(File file, String encoding)
|
public void setURL(File file, String encoding)
|
||||||
{
|
{
|
||||||
String previousURL = getURL();
|
String previousURL = getURL();
|
||||||
String newURL = file.getAbsolutePath();
|
String newURL = file.getAbsolutePath();
|
||||||
|
|
||||||
if (previousURL == null || !newURL.equals(previousURL))
|
if (previousURL == null || !newURL.equals(previousURL))
|
||||||
{
|
{
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
URLConnection conn = getConnection();
|
URLConnection conn = getConnection();
|
||||||
|
|
||||||
if (null == mParser)
|
if (null == mParser)
|
||||||
{
|
{
|
||||||
mParser = new Parser(newURL);
|
mParser = new Parser(newURL);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
mParser.setURL(newURL);
|
mParser.setURL(newURL);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (encoding != null)
|
if (encoding != null)
|
||||||
{
|
{
|
||||||
mParser.setEncoding(encoding);
|
mParser.setEncoding(encoding);
|
||||||
}
|
}
|
||||||
|
|
||||||
mPropertySupport.firePropertyChange(StringBean.PROP_URL_PROPERTY, previousURL,
|
mPropertySupport.firePropertyChange(StringBean.PROP_URL_PROPERTY, previousURL,
|
||||||
getURL());
|
getURL());
|
||||||
mPropertySupport.firePropertyChange(StringBean.PROP_CONNECTION_PROPERTY, conn,
|
mPropertySupport.firePropertyChange(StringBean.PROP_CONNECTION_PROPERTY, conn,
|
||||||
mParser.getConnection());
|
mParser.getConnection());
|
||||||
setStrings();
|
setStrings();
|
||||||
}
|
}
|
||||||
catch (ParserException pe)
|
catch (ParserException pe)
|
||||||
{
|
{
|
||||||
updateStrings(pe.toString());
|
updateStrings(pe.toString());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getEncoding()
|
public String getEncoding()
|
||||||
{
|
{
|
||||||
return mParser.getEncoding();
|
return mParser.getEncoding();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user