mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
96 lines
2.9 KiB
Java
96 lines
2.9 KiB
Java
/*
|
|
* #%L
|
|
* Alfresco Data model classes
|
|
* %%
|
|
* Copyright (C) 2005 - 2016 Alfresco Software Limited
|
|
* %%
|
|
* This file is part of the Alfresco software.
|
|
* If the software was purchased under a paid Alfresco license, the terms of
|
|
* the paid license agreement will prevail. Otherwise, the software is
|
|
* provided under the following open source license terms:
|
|
*
|
|
* Alfresco is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Lesser General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Alfresco is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
|
* #L%
|
|
*/
|
|
package org.alfresco.encoding;
|
|
|
|
import java.nio.charset.Charset;
|
|
import java.nio.charset.UnsupportedCharsetException;
|
|
|
|
import org.apache.commons.logging.Log;
|
|
import org.apache.commons.logging.LogFactory;
|
|
import org.apache.tika.parser.txt.CharsetDetector;
|
|
import org.apache.tika.parser.txt.CharsetMatch;
|
|
|
|
/**
|
|
* Uses Apache Tika as a fallback encoding detector
|
|
*
|
|
* @since 3.4
|
|
* @author Nick Burch
|
|
*/
|
|
public class TikaCharsetFinder extends AbstractCharactersetFinder
|
|
{
|
|
private static Log logger = LogFactory.getLog(TikaCharsetFinder.class);
|
|
|
|
private int threshold = 35;
|
|
|
|
@Override
|
|
protected Charset detectCharsetImpl(byte[] buffer) throws Exception
|
|
{
|
|
CharsetDetector detector = new CharsetDetector();
|
|
detector.setText(buffer);
|
|
CharsetMatch match = detector.detect();
|
|
|
|
if(match != null && match.getConfidence() > threshold)
|
|
{
|
|
try
|
|
{
|
|
return Charset.forName(match.getName());
|
|
}
|
|
catch(UnsupportedCharsetException e)
|
|
{
|
|
logger.info("Charset detected as " + match.getName() + " but the JVM does not support this, detection skipped");
|
|
}
|
|
}
|
|
return null;
|
|
}
|
|
|
|
/**
|
|
* Return the matching threshold before we decide that
|
|
* what we detected is a good match. In the range
|
|
* 0-100.
|
|
*/
|
|
public int getThreshold()
|
|
{
|
|
return threshold;
|
|
}
|
|
|
|
/**
|
|
* At what point do we decide our match is good enough?
|
|
* In the range 0-100. If we don't reach the threshold,
|
|
* we'll decline, and either another finder will work on
|
|
* it or the fallback encoding will be taken.
|
|
*/
|
|
public void setThreshold(int threshold)
|
|
{
|
|
if(threshold < 0)
|
|
threshold = 0;
|
|
if(threshold > 100)
|
|
threshold = 100;
|
|
|
|
this.threshold = threshold;
|
|
}
|
|
|
|
}
|