Move lucene analysis into the DataModel project

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20975 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Andrew Hind
2010-07-07 11:02:47 +00:00
parent fa91c077e6
commit 8105f39e33
46 changed files with 0 additions and 3570 deletions

View File

@@ -1,38 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene;
/**
 * The ways in which the text of a field may be analysed.
 * <p>
 * The constant order is part of the type (ordinals follow declaration order) and is kept as originally declared.
 */
public enum AnalysisMode
{
    /** Default analysis. */
    DEFAULT,
    /** Tokenising analysis. */
    TOKENISE,
    /** Identifier analysis. */
    IDENTIFIER,
    /** Fuzzy analysis. */
    FUZZY,
    /** Prefix analysis. */
    PREFIX,
    /** Wildcard analysis. */
    WILD,
    /** "Like" analysis. */
    LIKE;
}

View File

@@ -1,328 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.dictionary.IndexTokenisationMode;
import org.alfresco.repo.search.MLAnalysisMode;
import org.alfresco.repo.search.impl.lucene.analysis.AlfrescoStandardAnalyser;
import org.alfresco.repo.search.impl.lucene.analysis.LongAnalyser;
import org.alfresco.repo.search.impl.lucene.analysis.MLAnalayser;
import org.alfresco.repo.search.impl.lucene.analysis.PathAnalyser;
import org.alfresco.repo.search.impl.lucene.analysis.VerbatimAnalyser;
import org.alfresco.repo.search.impl.lucene.analysis.VerbatimMLAnalayser;
import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
import org.alfresco.service.cmr.dictionary.DictionaryService;
import org.alfresco.service.cmr.dictionary.PropertyDefinition;
import org.alfresco.service.namespace.QName;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
/**
* Analyse properties according to the property definition. The default is to use the standard tokeniser. The tokeniser
* should not have been called when indexing properties that require no tokenisation. (tokenise should be set to false
* when adding the field to the document)
*
* @author andyh
*/
public class LuceneAnalyser extends Analyzer
{
private static Log s_logger = LogFactory.getLog(LuceneAnalyser.class);
// Dictinary service to look up analyser classes by data type and locale.
private DictionaryService dictionaryService;
// If all else fails a fall back analyser
private Analyzer defaultAnalyser;
// Cached analysers for non ML data types.
private Map<String, Analyzer> analysers = new HashMap<String, Analyzer>();
private MLAnalysisMode mlAlaysisMode;
/**
* Constructs with a default standard analyser
*
* @param defaultAnalyzer
* Any fields not specifically defined to use a different analyzer will use the one provided here.
*/
public LuceneAnalyser(DictionaryService dictionaryService, MLAnalysisMode mlAlaysisMode)
{
this(new AlfrescoStandardAnalyser());
this.dictionaryService = dictionaryService;
this.mlAlaysisMode = mlAlaysisMode;
}
/**
* Constructs with default analyzer.
*
* @param defaultAnalyzer
* Any fields not specifically defined to use a different analyzer will use the one provided here.
*/
public LuceneAnalyser(Analyzer defaultAnalyser)
{
this.defaultAnalyser = defaultAnalyser;
}
public TokenStream tokenStream(String fieldName, Reader reader, AnalysisMode analysisMode)
{
Analyzer analyser = (Analyzer) analysers.get(fieldName);
if (analyser == null)
{
analyser = findAnalyser(fieldName, analysisMode);
}
return analyser.tokenStream(fieldName, reader);
}
public TokenStream tokenStream(String fieldName, Reader reader)
{
return tokenStream(fieldName, reader, AnalysisMode.DEFAULT);
}
/**
* Pick the analyser from the field name
*
* @param fieldName
* @return
*/
private Analyzer findAnalyser(String fieldName, AnalysisMode analysisMode)
{
Analyzer analyser;
if (fieldName.equals("PATH"))
{
analyser = new PathAnalyser();
}
else if (fieldName.equals("QNAME"))
{
analyser = new PathAnalyser();
}
else if (fieldName.equals("PRIMARYASSOCTYPEQNAME"))
{
analyser = new PathAnalyser();
}
else if (fieldName.equals("ASSOCTYPEQNAME"))
{
analyser = new PathAnalyser();
}
else if (fieldName.equals("TYPE"))
{
throw new UnsupportedOperationException("TYPE must not be tokenised");
}
else if (fieldName.equals("ASPECT"))
{
throw new UnsupportedOperationException("ASPECT must not be tokenised");
}
else if (fieldName.equals("ANCESTOR"))
{
analyser = new WhitespaceAnalyzer();
}
else if (fieldName.startsWith("@"))
{
if (fieldName.endsWith(".mimetype"))
{
analyser = new VerbatimAnalyser();
}
else if (fieldName.endsWith(".size"))
{
analyser = new LongAnalyser();
}
else if (fieldName.endsWith(".locale"))
{
analyser = new VerbatimAnalyser(true);
}
else
{
QName propertyQName = QName.createQName(fieldName.substring(1));
// Temporary fix for person and user uids
if (propertyQName.equals(ContentModel.PROP_USER_USERNAME)
|| propertyQName.equals(ContentModel.PROP_USERNAME) || propertyQName.equals(ContentModel.PROP_AUTHORITY_NAME))
{
analyser = new VerbatimAnalyser(true);
}
else
{
PropertyDefinition propertyDef = dictionaryService.getProperty(propertyQName);
IndexTokenisationMode tokenise = IndexTokenisationMode.TRUE;
if (propertyDef != null)
{
DataTypeDefinition dataType = propertyDef.getDataType();
tokenise = propertyDef.getIndexTokenisationMode();
if (tokenise == null)
{
tokenise = IndexTokenisationMode.TRUE;
}
switch (tokenise)
{
case TRUE:
if (dataType.getName().equals(DataTypeDefinition.CONTENT))
{
analyser = new MLAnalayser(dictionaryService, MLAnalysisMode.ALL_ONLY);
}
else if (dataType.getName().equals(DataTypeDefinition.TEXT))
{
analyser = new MLAnalayser(dictionaryService, MLAnalysisMode.ALL_ONLY);
}
else if (dataType.getName().equals(DataTypeDefinition.MLTEXT))
{
analyser = new MLAnalayser(dictionaryService, mlAlaysisMode);
}
else
{
analyser = loadAnalyzer(dataType);
}
break;
case BOTH:
switch (analysisMode)
{
case DEFAULT:
case TOKENISE:
if (dataType.getName().equals(DataTypeDefinition.CONTENT))
{
analyser = new MLAnalayser(dictionaryService, MLAnalysisMode.ALL_ONLY);
}
else if (dataType.getName().equals(DataTypeDefinition.TEXT))
{
analyser = new MLAnalayser(dictionaryService, MLAnalysisMode.ALL_ONLY);
}
else if (dataType.getName().equals(DataTypeDefinition.MLTEXT))
{
analyser = new MLAnalayser(dictionaryService, mlAlaysisMode);
}
else
{
analyser = loadAnalyzer(dataType);
}
break;
case IDENTIFIER:
if (dataType.getName().equals(DataTypeDefinition.MLTEXT))
{
analyser = new VerbatimMLAnalayser(mlAlaysisMode);
}
else
{
analyser = new VerbatimAnalyser();
}
break;
default:
throw new UnsupportedOperationException("TYPE must not be tokenised");
}
break;
case FALSE:
// TODO: MLText verbatim analyser
analyser = new VerbatimAnalyser();
break;
default:
throw new UnsupportedOperationException("TYPE must not be tokenised");
}
}
else
{
switch (analysisMode)
{
case IDENTIFIER:
analyser = new VerbatimAnalyser();
break;
case DEFAULT:
case TOKENISE:
DataTypeDefinition dataType = dictionaryService.getDataType(DataTypeDefinition.TEXT);
analyser = loadAnalyzer(dataType);
break;
default:
throw new UnsupportedOperationException();
}
}
}
}
}
else
{
analyser = defaultAnalyser;
}
analysers.put(fieldName, analyser);
return analyser;
}
/**
* Find an instantiate an analyser. The shuld all be thread sade as Analyser.tokenStream should be re-entrant.
*
* @param dataType
* @return
*/
private Analyzer loadAnalyzer(DataTypeDefinition dataType)
{
String analyserClassName = dataType.getAnalyserClassName().trim();
try
{
Class<?> clazz = Class.forName(analyserClassName);
Analyzer analyser = (Analyzer) clazz.newInstance();
if (s_logger.isDebugEnabled())
{
s_logger.debug("Loaded " + analyserClassName + " for type " + dataType.getName());
}
return analyser;
}
catch (ClassNotFoundException e)
{
throw new RuntimeException("Unable to load analyser for property of type " + dataType.getName() + " using " + analyserClassName);
}
catch (InstantiationException e)
{
throw new RuntimeException("Unable to load analyser for property of type " + dataType.getName() + " using " + analyserClassName);
}
catch (IllegalAccessException e)
{
throw new RuntimeException("Unable to load analyser for property of type " + dataType.getName() + " using " + analyserClassName);
}
}
/**
* For multilingual fields we separate the tokens for each instance to break phrase queries spanning different
* languages etc.
*/
@Override
public int getPositionIncrementGap(String fieldName)
{
if (fieldName.startsWith("@") && !fieldName.endsWith(".mimetype"))
{
QName propertyQName = QName.createQName(fieldName.substring(1));
PropertyDefinition propertyDef = dictionaryService.getProperty(propertyQName);
if (propertyDef != null)
{
if (propertyDef.getDataType().getName().equals(DataTypeDefinition.MLTEXT))
{
return 1000;
}
}
}
return super.getPositionIncrementGap(fieldName);
}
}

View File

@@ -1,67 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ISOLatin1AccentFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
 * Analyser that tokenises with a {@link StandardTokenizer} and then applies the {@link AlfrescoStandardFilter},
 * lower-casing, stop word removal and ISO Latin 1 accent filtering, in that order.
 */
public class AlfrescoStandardAnalyser extends Analyzer
{
    /**
     * An array containing some common English words that are usually not useful for searching.
     */
    public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;

    /** The stop words handed to the {@link StopFilter}. */
    private final Set<?> stopSet;

    /** Builds an analyzer using {@link #STOP_WORDS}. */
    public AlfrescoStandardAnalyser()
    {
        this(STOP_WORDS);
    }

    /**
     * Builds an analyzer with the given stop words.
     *
     * @param stopWords
     *            the words to remove from the token stream
     */
    public AlfrescoStandardAnalyser(String[] stopWords)
    {
        this.stopSet = StopFilter.makeStopSet(stopWords);
    }

    /**
     * Constructs a {@link StandardTokenizer} filtered by an {@link AlfrescoStandardFilter}, a
     * {@link LowerCaseFilter}, a {@link StopFilter} and an {@link ISOLatin1AccentFilter}.
     */
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
        // Innermost stage runs first: tokenise, Alfresco filter, lower case, stop words, accents.
        return new ISOLatin1AccentFilter(
                new StopFilter(
                        new LowerCaseFilter(
                                new AlfrescoStandardFilter(
                                        new StandardTokenizer(reader))),
                        stopSet));
    }
}

View File

@@ -1,140 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.util.LinkedList;
import java.util.Queue;
import java.util.StringTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
 * Token filter applied after the {@link StandardTokenizer}: strips a trailing <tt>'s</tt>, removes the dots from
 * acronyms and splits host names into one token per dot separated part.
 */
public class AlfrescoStandardFilter extends TokenFilter
{
    private static final String APOSTROPHE_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.APOSTROPHE];

    private static final String ACRONYM_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ACRONYM];

    private static final String HOST_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST];

    private static final String ALPHANUM_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];

    // Parts of a split host name still to be returned; null when nothing is pending.
    private Queue<org.apache.lucene.analysis.Token> hostTokens = null;

    /** Construct filtering <i>in</i>. */
    public AlfrescoStandardFilter(TokenStream in)
    {
        super(in);
    }

    /**
     * Returns the next token in the stream, or null at EOS.
     * <p>
     * Removes <tt>'s</tt> from the end of words.
     * <p>
     * Removes dots from acronyms.
     * <p>
     * Splits host names into their dot separated parts, returned one per call. A host name made only of single
     * characters (a.b.c) is treated as an acronym instead.
     */
    public final org.apache.lucene.analysis.Token next() throws java.io.IOException
    {
        if (hostTokens != null)
        {
            return nextHostToken();
        }

        org.apache.lucene.analysis.Token t = input.next();
        if (t == null)
        {
            return null;
        }

        String text = t.termText();
        String type = t.type();

        // Types are compared by value so that the filter does not depend on the strings being the same instance.
        if (APOSTROPHE_TYPE.equals(type) && (text.endsWith("'s") || text.endsWith("'S")))
        {
            // remove 's
            return new org.apache.lucene.analysis.Token(text.substring(0, text.length() - 2), t.startOffset(), t
                    .endOffset(), type);
        }
        if (ACRONYM_TYPE.equals(type))
        {
            return new org.apache.lucene.analysis.Token(removeDots(text), t.startOffset(), t.endOffset(), type);
        }
        if (HOST_TYPE.equals(type))
        {
            return splitHost(t, text);
        }
        return t;
    }

    /**
     * Splits a host token at the dots. Returns the first part and queues the rest, or returns a single dot-free
     * token when every part is one character long.
     */
    private org.apache.lucene.analysis.Token splitHost(org.apache.lucene.analysis.Token t, String text)
    {
        // <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >
        Queue<org.apache.lucene.analysis.Token> parts = new LinkedList<org.apache.lucene.analysis.Token>();
        StringTokenizer tokeniser = new StringTokenizer(text, ".");
        int start = t.startOffset();
        while (tokeniser.hasMoreTokens())
        {
            String part = tokeniser.nextToken();
            int end = start + part.length();
            parts.offer(new org.apache.lucene.analysis.Token(part, start, end, ALPHANUM_TYPE));
            // skip the separating dot
            start = end + 1;
        }

        if (parts.isEmpty())
        {
            // Nothing but dots: pass the token through rather than failing on an empty queue.
            return t;
        }

        // check if we have an acronym ..... yes a.b.c ends up here ...
        // n single character parts joined by n - 1 dots have length 2n - 1
        if (text.length() == parts.size() * 2 - 1)
        {
            return new org.apache.lucene.analysis.Token(removeDots(text), t.startOffset(), t.endOffset(),
                    ALPHANUM_TYPE);
        }

        org.apache.lucene.analysis.Token first = parts.remove();
        if (!parts.isEmpty())
        {
            // Only keep the queue when there is something left, so the next call never sees an empty one.
            hostTokens = parts;
        }
        return first;
    }

    /**
     * Returns the next pending host part and clears the queue once it is exhausted.
     */
    private org.apache.lucene.analysis.Token nextHostToken()
    {
        org.apache.lucene.analysis.Token token = hostTokens.remove();
        if (hostTokens.isEmpty())
        {
            hostTokens = null;
        }
        return token;
    }

    /**
     * Returns the text with every '.' removed.
     */
    private static String removeDots(String text)
    {
        StringBuilder trimmed = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++)
        {
            char c = text.charAt(i);
            if (c != '.')
            {
                trimmed.append(c);
            }
        }
        return trimmed.toString();
    }
}

View File

@@ -1,55 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import org.alfresco.error.AlfrescoRuntimeException;
/**
 * Runtime exception for the analysis package. Every constructor passes its arguments straight to the matching
 * {@link AlfrescoRuntimeException} constructor.
 */
public class AnalysisException extends AlfrescoRuntimeException
{
    private static final long serialVersionUID = -7722380192490118459L;

    /**
     * @param msgId
     *            the message id
     */
    public AnalysisException(String msgId)
    {
        super(msgId);
    }

    /**
     * @param msgId
     *            the message id
     * @param msgParams
     *            the message parameters
     */
    public AnalysisException(String msgId, Object[] msgParams)
    {
        super(msgId, msgParams);
    }

    /**
     * @param msgId
     *            the message id
     * @param cause
     *            the underlying cause
     */
    public AnalysisException(String msgId, Throwable cause)
    {
        super(msgId, cause);
    }

    /**
     * @param msgId
     *            the message id
     * @param msgParams
     *            the message parameters
     * @param cause
     *            the underlying cause
     */
    public AnalysisException(String msgId, Object[] msgParams, Throwable cause)
    {
        super(msgId, msgParams, cause);
    }
}

View File

@@ -1,46 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
/**
* @author andyh
*
* TODO To change the template for this generated type comment go to Window -
* Preferences - Java - Code Style - Code Templates
*/
public class CategoryAnalyser extends Analyzer
{
/*
* (non-Javadoc)
*
* @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String,
* java.io.Reader)
*/
public TokenStream tokenStream(String fieldName, Reader reader)
{
return new PathTokenFilter(reader, PathTokenFilter.PATH_SEPARATOR,
PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, false);
}
}

View File

@@ -1,30 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
/**
 * A {@link SnowballAnalyzer} created with the name "Danish", with a no-argument constructor.
 */
public class DanishSnowballAnalyser extends SnowballAnalyzer
{
    /** The name handed to the {@link SnowballAnalyzer} constructor. */
    private static final String STEMMER_NAME = "Danish";

    /** Creates the analyser for {@link #STEMMER_NAME}. */
    public DanishSnowballAnalyser()
    {
        super(STEMMER_NAME);
    }
}

View File

@@ -1,39 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
public class DateAnalyser extends Analyzer
{
public DateAnalyser()
{
super();
}
// Split at the T in the XML date form
public TokenStream tokenStream(String fieldName, Reader reader)
{
return new DateTokenFilter(reader);
}
}

View File

@@ -1,39 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
public class DateTimeAnalyser extends Analyzer
{
public DateTimeAnalyser()
{
super();
}
// Split at the T in the XML date form
public TokenStream tokenStream(String fieldName, Reader reader)
{
return new DateTimeTokenFilter(reader);
}
}

View File

@@ -1,191 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.Iterator;
import org.alfresco.util.CachingDateFormat;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
 * Tokeniser that turns the first whitespace separated term it can read as a date into seven tokens, one per date
 * part: <tt>YE</tt>year, <tt>MO</tt>month, <tt>DA</tt>day, <tt>HO</tt>hour, <tt>MI</tt>minute, <tt>SE</tt>second and
 * <tt>MS</tt>millisecond. Terms that can not be parsed are skipped; terms after the first date are ignored.
 * <p>
 * The literal terms "now" and "today" (any case) stand for the current instant and the start of the current day.
 * The date parts are taken from a {@link Calendar} created with {@link Calendar#getInstance()}.
 *
 * @author andyh
 */
public class DateTimeTokenFilter extends Tokenizer
{
    Tokenizer baseTokeniser;

    // Built on the first call to next(); null until then.
    Iterator<Token> tokenIterator = null;

    /**
     * @param in
     *            the text to tokenise
     */
    public DateTimeTokenFilter(Reader in)
    {
        super(in);
        baseTokeniser = new WhitespaceTokenizer(in);
    }

    /**
     * Returns the next date part token, or null when all parts have been returned or no date was found.
     */
    public Token next() throws IOException
    {
        if (tokenIterator == null)
        {
            buildIterator();
        }
        return tokenIterator.hasNext() ? tokenIterator.next() : null;
    }

    /**
     * Reads terms until one can be turned into a date and prepares the tokens for its parts.
     *
     * @throws IOException
     *             if the underlying tokeniser fails
     */
    public void buildIterator() throws IOException
    {
        ArrayList<Token> tokens = new ArrayList<Token>();
        Token candidate;
        while ((candidate = baseTokeniser.next()) != null)
        {
            Date date;
            try
            {
                date = toDate(candidate.termText());
            }
            catch (ParseException e)
            {
                // not a date - try the next term
                continue;
            }

            Calendar cal = Calendar.getInstance();
            cal.setTime(date);

            // four digits
            tokens.add(partToken(candidate, "YE" + cal.get(Calendar.YEAR)));
            // 2 digits each; Calendar.MONTH is zero based, so January is MO00.
            // NOTE(review): the zero based month is kept as-is because indexing and querying share this format.
            tokens.add(partToken(candidate, "MO" + zeroPad(cal.get(Calendar.MONTH), 2)));
            tokens.add(partToken(candidate, "DA" + zeroPad(cal.get(Calendar.DAY_OF_MONTH), 2)));
            tokens.add(partToken(candidate, "HO" + zeroPad(cal.get(Calendar.HOUR_OF_DAY), 2)));
            tokens.add(partToken(candidate, "MI" + zeroPad(cal.get(Calendar.MINUTE), 2)));
            tokens.add(partToken(candidate, "SE" + zeroPad(cal.get(Calendar.SECOND), 2)));
            // 3 digits
            tokens.add(partToken(candidate, "MS" + zeroPad(cal.get(Calendar.MILLISECOND), 3)));

            // only the first date is used
            break;
        }
        tokenIterator = tokens.iterator();
    }

    /**
     * Turns a term into a date: "now" is the current instant, "today" is the start of the current day and anything
     * else is handed to {@link CachingDateFormat#lenientParse(String)}.
     */
    private static Date toDate(String text) throws ParseException
    {
        if (text.equalsIgnoreCase("now"))
        {
            return new Date();
        }
        if (text.equalsIgnoreCase("today"))
        {
            Calendar cal = Calendar.getInstance();
            cal.setTime(new Date());
            cal.set(Calendar.HOUR_OF_DAY, cal.getMinimum(Calendar.HOUR_OF_DAY));
            cal.set(Calendar.MINUTE, cal.getMinimum(Calendar.MINUTE));
            cal.set(Calendar.SECOND, cal.getMinimum(Calendar.SECOND));
            cal.set(Calendar.MILLISECOND, cal.getMinimum(Calendar.MILLISECOND));
            // Use the truncated time; previously it was computed and then discarded.
            return cal.getTime();
        }
        return CachingDateFormat.lenientParse(text);
    }

    /**
     * Creates a token for one date part. Start and end offset are both the start offset of the source term.
     */
    private static Token partToken(Token candidate, String text)
    {
        return new Token(text, candidate.startOffset(), candidate.startOffset(), candidate.type());
    }

    /**
     * Formats a non-negative value with leading zeros up to the given width.
     */
    private static String zeroPad(int value, int width)
    {
        StringBuilder padded = new StringBuilder();
        String digits = String.valueOf(value);
        for (int i = digits.length(); i < width; i++)
        {
            padded.append('0');
        }
        return padded.append(digits).toString();
    }
}

View File

@@ -1,85 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import org.alfresco.util.CachingDateFormat;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
 * Tokeniser that returns, for each call, the next whitespace separated term it can read as a date, formatted with
 * {@link CachingDateFormat#getDateOnlyFormat()}. Terms that can not be parsed are skipped.
 *
 * @author andyh
 */
public class DateTokenFilter extends Tokenizer
{
    Tokenizer baseTokeniser;

    /**
     * @param in
     *            the text to tokenise
     */
    public DateTokenFilter(Reader in)
    {
        super(in);
        baseTokeniser = new WhitespaceTokenizer(in);
    }

    /**
     * Returns the next date token, or null when the input holds no further term that can be read as a date. Start
     * and end offset of the token are both the start offset of the source term.
     */
    public Token next() throws IOException
    {
        SimpleDateFormat dateOnlyFormat = CachingDateFormat.getDateOnlyFormat();
        for (Token candidate = baseTokeniser.next(); candidate != null; candidate = baseTokeniser.next())
        {
            String text = candidate.termText();
            Date date;
            if (text.equalsIgnoreCase("now") || text.equalsIgnoreCase("today"))
            {
                // Both keywords resolve to the current instant, which is then formatted with the date-only format.
                date = new Date();
            }
            else
            {
                try
                {
                    date = CachingDateFormat.lenientParse(text);
                }
                catch (ParseException e)
                {
                    // not a date - try the next term
                    continue;
                }
            }
            return new Token(dateOnlyFormat.format(date), candidate.startOffset(), candidate.startOffset(), candidate.type());
        }
        return null;
    }
}

View File

@@ -1,44 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
/**
* Simple analyser to wrap the tokenisation of doubles.
*
* @author Andy Hind
*/
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
public class DoubleAnalyser extends Analyzer
{
public DoubleAnalyser()
{
super();
}
public TokenStream tokenStream(String fieldName, Reader reader)
{
return new DoubleTokenFilter(reader);
}
}

View File

@@ -1,69 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
 * Simple tokeniser for doubles. Each whitespace separated term that parses as a double is returned as a token whose
 * text is the value encoded by {@link NumericEncoder}; terms that do not parse are skipped.
 *
 * @author Andy Hind
 */
public class DoubleTokenFilter extends Tokenizer
{
    Tokenizer baseTokeniser;

    /**
     * @param in
     *            the text to tokenise
     */
    public DoubleTokenFilter(Reader in)
    {
        super(in);
        baseTokeniser = new WhitespaceTokenizer(in);
    }

    /**
     * Returns the next encoded double, or null when the input holds no further term that parses as a double. Start
     * and end offset of the token are both the start offset of the source term.
     *
     * @see org.apache.lucene.analysis.TokenStream#next()
     */
    public Token next() throws IOException
    {
        for (Token candidate = baseTokeniser.next(); candidate != null; candidate = baseTokeniser.next())
        {
            try
            {
                double value = Double.parseDouble(candidate.termText());
                String encoded = NumericEncoder.encode(value);
                return new Token(encoded, candidate.startOffset(), candidate.startOffset(), candidate.type());
            }
            catch (NumberFormatException e)
            {
                // just ignore and try the next one
                continue;
            }
        }
        return null;
    }
}

View File

@@ -1,30 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
/**
 * {@link SnowballAnalyzer} that is always constructed with the name "Dutch".
 */
public class DutchSnowballAnalyser extends SnowballAnalyzer
{
    /** Name handed to the {@link SnowballAnalyzer} constructor. */
    private static final String STEMMER_NAME = "Dutch";

    /**
     * Creates the analyser with the fixed name, so it can be instantiated without arguments.
     */
    public DutchSnowballAnalyser()
    {
        super(STEMMER_NAME);
    }
}

View File

@@ -1,30 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
/**
 * {@link SnowballAnalyzer} that is always constructed with the name "English".
 */
public class EnglishSnowballAnalyser extends SnowballAnalyzer
{
    /** Name handed to the {@link SnowballAnalyzer} constructor. */
    private static final String STEMMER_NAME = "English";

    /**
     * Creates the analyser with the fixed name, so it can be instantiated without arguments.
     */
    public EnglishSnowballAnalyser()
    {
        super(STEMMER_NAME);
    }
}

View File

@@ -1,30 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
/**
 * {@link SnowballAnalyzer} that is always constructed with the name "Finnish".
 */
public class FinnishSnowballAnalyser extends SnowballAnalyzer
{
    /** Name handed to the {@link SnowballAnalyzer} constructor. */
    private static final String STEMMER_NAME = "Finnish";

    /**
     * Creates the analyser with the fixed name, so it can be instantiated without arguments.
     */
    public FinnishSnowballAnalyser()
    {
        super(STEMMER_NAME);
    }
}

View File

@@ -1,43 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
/**
 * Analyser whose token stream is a {@link FloatTokenFilter} over the supplied reader.
 *
 * @author Andy Hind
 */
public class FloatAnalyser extends Analyzer
{
    /**
     * Creates the analyser; there is nothing to configure.
     */
    public FloatAnalyser()
    {
        // the implicit superclass constructor call is all that is needed
    }

    /**
     * Builds the token stream for a field.
     *
     * @param fieldName the field being analysed (not used when choosing the tokeniser)
     * @param reader the source of the text to tokenise
     * @return a new {@link FloatTokenFilter} reading from <code>reader</code>
     */
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
        final TokenStream floatTokens = new FloatTokenFilter(reader);
        return floatTokens;
    }
}

View File

@@ -1,69 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
 * Simple tokeniser for floats.
 * <p>
 * The input is first split by a {@link WhitespaceTokenizer}. Every resulting chunk that parses as a float is
 * emitted as one token whose text is the {@link NumericEncoder} encoding of the value; chunks that do not parse
 * are skipped.
 *
 * @author Andy Hind
 */
public class FloatTokenFilter extends Tokenizer
{
    /** Tokeniser that supplies the raw candidate chunks. */
    Tokenizer baseTokeniser;

    /**
     * @param in the reader supplying the text to tokenise
     */
    public FloatTokenFilter(Reader in)
    {
        super(in);
        baseTokeniser = new WhitespaceTokenizer(in);
    }

    /**
     * Returns the next token that holds a valid float, or <code>null</code> when the input is exhausted.
     *
     * @return the encoded token, carrying the offsets and type of the chunk it was parsed from
     * @throws IOException if the underlying tokeniser fails to read
     * @see org.apache.lucene.analysis.TokenStream#next()
     */
    public Token next() throws IOException
    {
        Token candidate;
        while ((candidate = baseTokeniser.next()) != null)
        {
            try
            {
                float value = Float.parseFloat(candidate.termText());
                String valueString = NumericEncoder.encode(value);
                // The end offset comes from the end of the candidate; using the start offset twice would
                // describe a zero-length span in the original text.
                return new Token(valueString, candidate.startOffset(), candidate.endOffset(), candidate.type());
            }
            catch (NumberFormatException ignored)
            {
                // not a float - skip this chunk and try the next one
            }
        }
        return null;
    }
}

View File

@@ -1,30 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
/**
 * {@link SnowballAnalyzer} that is always constructed with the name "French".
 */
public class FrenchSnowballAnalyser extends SnowballAnalyzer
{
    /** Name handed to the {@link SnowballAnalyzer} constructor. */
    private static final String STEMMER_NAME = "French";

    /**
     * Creates the analyser with the fixed name, so it can be instantiated without arguments.
     */
    public FrenchSnowballAnalyser()
    {
        super(STEMMER_NAME);
    }
}

View File

@@ -1,43 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ISOLatin1AccentFilter;
import org.apache.lucene.analysis.TokenStream;
/**
 * Analyser that runs a {@link FrenchSnowballAnalyser} and then passes its tokens through an
 * {@link ISOLatin1AccentFilter}.
 */
public class FrenchSnowballAnalyserThatRemovesAccents extends Analyzer
{
    /** Analyser that produces the tokens before the accent filter is applied. */
    Analyzer analyzer;

    /**
     * Creates the analyser together with the French analyser it delegates to.
     */
    public FrenchSnowballAnalyserThatRemovesAccents()
    {
        analyzer = new FrenchSnowballAnalyser();
    }

    /**
     * Builds the token stream for a field.
     *
     * @param fieldName the field being analysed; handed on to the delegate analyser
     * @param reader the source of the text to tokenise
     * @return the delegate's token stream wrapped in an {@link ISOLatin1AccentFilter}
     */
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
        return new ISOLatin1AccentFilter(analyzer.tokenStream(fieldName, reader));
    }
}

View File

@@ -1,30 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
/**
 * {@link SnowballAnalyzer} that is always constructed with the name "German2".
 */
public class German2SnowballAnalyser extends SnowballAnalyzer
{
    /** Name handed to the {@link SnowballAnalyzer} constructor. */
    private static final String STEMMER_NAME = "German2";

    /**
     * Creates the analyser with the fixed name, so it can be instantiated without arguments.
     */
    public German2SnowballAnalyser()
    {
        super(STEMMER_NAME);
    }
}

View File

@@ -1,30 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
/**
 * {@link SnowballAnalyzer} that is always constructed with the name "German".
 */
public class GermanSnowballAnalyser extends SnowballAnalyzer
{
    /** Name handed to the {@link SnowballAnalyzer} constructor. */
    private static final String STEMMER_NAME = "German";

    /**
     * Creates the analyser with the fixed name, so it can be instantiated without arguments.
     */
    public GermanSnowballAnalyser()
    {
        super(STEMMER_NAME);
    }
}

View File

@@ -1,43 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
/**
 * Analyser whose token stream is an {@link IntegerTokenFilter} over the supplied reader.
 *
 * @author Andy Hind
 */
public class IntegerAnalyser extends Analyzer
{
    /**
     * Creates the analyser; there is nothing to configure.
     */
    public IntegerAnalyser()
    {
        // the implicit superclass constructor call is all that is needed
    }

    /**
     * Builds the token stream for a field.
     *
     * @param fieldName the field being analysed (not used when choosing the tokeniser)
     * @param reader the source of the text to tokenise
     * @return a new {@link IntegerTokenFilter} reading from <code>reader</code>
     */
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
        final TokenStream integerTokens = new IntegerTokenFilter(reader);
        return integerTokens;
    }
}

View File

@@ -1,69 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
 * Simple tokeniser for integers.
 * <p>
 * The input is first split by a {@link WhitespaceTokenizer}. Every resulting chunk that parses as an int is
 * emitted as one token whose text is the {@link NumericEncoder} encoding of the value; chunks that do not parse
 * are skipped.
 *
 * @author Andy Hind
 */
public class IntegerTokenFilter extends Tokenizer
{
    /** Tokeniser that supplies the raw candidate chunks. */
    Tokenizer baseTokeniser;

    /**
     * @param in the reader supplying the text to tokenise
     */
    public IntegerTokenFilter(Reader in)
    {
        super(in);
        baseTokeniser = new WhitespaceTokenizer(in);
    }

    /**
     * Returns the next token that holds a valid integer, or <code>null</code> when the input is exhausted.
     *
     * @return the encoded token, carrying the offsets and type of the chunk it was parsed from
     * @throws IOException if the underlying tokeniser fails to read
     * @see org.apache.lucene.analysis.TokenStream#next()
     */
    public Token next() throws IOException
    {
        Token candidate;
        while ((candidate = baseTokeniser.next()) != null)
        {
            try
            {
                int value = Integer.parseInt(candidate.termText());
                String valueString = NumericEncoder.encode(value);
                // The end offset comes from the end of the candidate; using the start offset twice would
                // describe a zero-length span in the original text.
                return new Token(valueString, candidate.startOffset(), candidate.endOffset(), candidate.type());
            }
            catch (NumberFormatException ignored)
            {
                // not an integer - skip this chunk and try the next one
            }
        }
        return null;
    }
}

View File

@@ -1,30 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
/**
 * {@link SnowballAnalyzer} that is always constructed with the name "Italian".
 */
public class ItalianSnowballAnalyser extends SnowballAnalyzer
{
    /** Name handed to the {@link SnowballAnalyzer} constructor. */
    private static final String STEMMER_NAME = "Italian";

    /**
     * Creates the analyser with the fixed name, so it can be instantiated without arguments.
     */
    public ItalianSnowballAnalyser()
    {
        super(STEMMER_NAME);
    }
}

View File

@@ -1,30 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
/**
 * {@link SnowballAnalyzer} that is always constructed with the name "Kp".
 */
public class KPSnowballAnalyser extends SnowballAnalyzer
{
    /** Name handed to the {@link SnowballAnalyzer} constructor. */
    private static final String STEMMER_NAME = "Kp";

    /**
     * Creates the analyser with the fixed name, so it can be instantiated without arguments.
     */
    public KPSnowballAnalyser()
    {
        super(STEMMER_NAME);
    }
}

View File

@@ -1,44 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
/**
 * Analyser whose token stream is a {@link LongTokenFilter} over the supplied reader.
 *
 * @author Andy Hind
 */
public class LongAnalyser extends Analyzer
{
    /**
     * Creates the analyser; there is nothing to configure.
     */
    public LongAnalyser()
    {
        // the implicit superclass constructor call is all that is needed
    }

    /**
     * Builds the token stream for a field.
     *
     * @param fieldName the field being analysed (not used when choosing the tokeniser)
     * @param reader the source of the text to tokenise
     * @return a new {@link LongTokenFilter} reading from <code>reader</code>
     */
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
        final TokenStream longTokens = new LongTokenFilter(reader);
        return longTokens;
    }
}

View File

@@ -1,69 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
 * Simple tokeniser for longs.
 * <p>
 * The input is first split by a {@link WhitespaceTokenizer}. Every resulting chunk that parses as a long is
 * emitted as one token whose text is the {@link NumericEncoder} encoding of the value; chunks that do not parse
 * are skipped.
 *
 * @author Andy Hind
 */
public class LongTokenFilter extends Tokenizer
{
    /** Tokeniser that supplies the raw candidate chunks. */
    Tokenizer baseTokeniser;

    /**
     * @param in the reader supplying the text to tokenise
     */
    public LongTokenFilter(Reader in)
    {
        super(in);
        baseTokeniser = new WhitespaceTokenizer(in);
    }

    /**
     * Returns the next token that holds a valid long, or <code>null</code> when the input is exhausted.
     *
     * @return the encoded token, carrying the offsets and type of the chunk it was parsed from
     * @throws IOException if the underlying tokeniser fails to read
     * @see org.apache.lucene.analysis.TokenStream#next()
     */
    public Token next() throws IOException
    {
        Token candidate;
        while ((candidate = baseTokeniser.next()) != null)
        {
            try
            {
                long value = Long.parseLong(candidate.termText());
                String valueString = NumericEncoder.encode(value);
                // The end offset comes from the end of the candidate; using the start offset twice would
                // describe a zero-length span in the original text.
                return new Token(valueString, candidate.startOffset(), candidate.endOffset(), candidate.type());
            }
            catch (NumberFormatException ignored)
            {
                // not a long - skip this chunk and try the next one
            }
        }
        return null;
    }
}

View File

@@ -1,30 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
/**
 * {@link SnowballAnalyzer} that is always constructed with the name "Lovins".
 */
public class LovinsSnowballAnalyser extends SnowballAnalyzer
{
    /** Name handed to the {@link SnowballAnalyzer} constructor. */
    private static final String STEMMER_NAME = "Lovins";

    /**
     * Creates the analyser with the fixed name, so it can be instantiated without arguments.
     */
    public LovinsSnowballAnalyser()
    {
        super(STEMMER_NAME);
    }
}

View File

@@ -1,27 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
/**
 * {@link VerbatimAnalyser} that always passes <code>true</code> to the superclass constructor.
 * <p>
 * NOTE(review): going by the class name, the flag presumably switches on lower-casing - confirm against
 * {@link VerbatimAnalyser}.
 */
public class LowerCaseVerbatimAnalyser extends VerbatimAnalyser
{
    /** Flag handed to the {@link VerbatimAnalyser} constructor. */
    private static final boolean SUPERCLASS_FLAG = true;

    /**
     * Creates the analyser with the fixed flag, so it can be instantiated without arguments.
     */
    public LowerCaseVerbatimAnalyser()
    {
        super(SUPERCLASS_FLAG);
    }
}

View File

@@ -1,213 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Locale;
import org.springframework.extensions.surf.util.I18NUtil;
import org.alfresco.repo.search.MLAnalysisMode;
import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
import org.alfresco.service.cmr.dictionary.DictionaryService;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
/**
 * Analyser for multilingual text.
 * <p>
 * The character stream may start with a locale prefix of the form <code>\u0000language_country_variant\u0000</code>.
 * When the prefix is present the remaining text is tokenised with the analyser configured for that locale and the
 * tokens are passed through an {@link MLTokenDuplicator}. When it is absent or malformed, the stream is rewound and
 * tokenised with the analyser for the locale returned by {@link I18NUtil#getLocale()}.
 * <p>
 * Analysers are looked up through the dictionary's text data type and cached per locale in a plain
 * {@link HashMap}.
 */
public class MLAnalayser extends Analyzer
{
    private static Log s_logger = LogFactory.getLog(MLAnalayser.class);

    /** Character that opens and closes the locale prefix. */
    private static final char LOCALE_DELIMITER = '\u0000';

    /** Maximum number of characters accepted between the two delimiters. */
    private static final int MAX_LOCALE_LENGTH = 100;

    /** A locale has at most three parts: language, country and variant. */
    private static final int MAX_LOCALE_PARTS = 3;

    private DictionaryService dictionaryService;

    private HashMap<Locale, Analyzer> analysers = new HashMap<Locale, Analyzer>();

    private MLAnalysisMode mlAnalaysisMode;

    /**
     * @param dictionaryService used to find the analyser class configured for a locale
     * @param mlAnalaysisMode handed to every {@link MLTokenDuplicator} this analyser creates
     */
    public MLAnalayser(DictionaryService dictionaryService, MLAnalysisMode mlAnalaysisMode)
    {
        this.dictionaryService = dictionaryService;
        this.mlAnalaysisMode = mlAnalaysisMode;
    }

    /**
     * Builds the token stream, selecting the analyser from the locale prefix when there is one.
     *
     * @param fieldName the field being analysed; handed on to the selected analyser
     * @param reader the text to tokenise, optionally starting with a locale prefix
     * @return the token stream of the locale specific analyser wrapped in an {@link MLTokenDuplicator}, or the
     *         plain token stream of the default analyser when no valid prefix is found
     * @throws AnalysisException if <code>reader</code> is already a {@link BufferedReader}, or if the stream
     *             cannot be rewound after a read failure
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
        if (reader instanceof BufferedReader)
        {
            // NOTE(review): the message contradicts the condition - a reader that already is buffered is the
            // one that gets rejected. Kept as found; confirm the intent before relaxing it.
            throw new AnalysisException("Multilingual tokenisation requires a buffered reader");
        }

        // We read ahead to get the language info - if it does not exist we need to restart and use the
        // default analyser, therefore we need mark and reset.
        BufferedReader breader = new BufferedReader(reader);
        try
        {
            if (!breader.markSupported())
            {
                throw new AnalysisException(
                        "Multilingual tokenisation requires a reader that supports marks and reset");
            }
            // Up to MAX_LOCALE_LENGTH + 2 characters are read before a reset: the opening delimiter, the
            // locale characters, and the closing delimiter or the character that exceeds the limit.
            breader.mark(MAX_LOCALE_LENGTH + 2);

            if (breader.read() != LOCALE_DELIMITER)
            {
                return restartWithDefaultAnalyser(fieldName, breader);
            }
            Locale locale = readLocale(breader);
            if (locale == null)
            {
                return restartWithDefaultAnalyser(fieldName, breader);
            }
            // leave the reader where it is - just after the locale prefix
            return new MLTokenDuplicator(getAnalyser(locale).tokenStream(fieldName, breader), locale, breader,
                    mlAnalaysisMode);
        }
        catch (IOException io)
        {
            try
            {
                breader.reset();
            }
            catch (IOException e)
            {
                throw new AnalysisException("Failed to reset buffered reader - token stream will be invalid", e);
            }
            return getDefaultAnalyser().tokenStream(fieldName, breader);
        }
    }

    /**
     * Rewinds the reader to the mark and tokenises everything with the default analyser.
     */
    private TokenStream restartWithDefaultAnalyser(String fieldName, BufferedReader breader) throws IOException
    {
        breader.reset();
        return getDefaultAnalyser().tokenStream(fieldName, breader);
    }

    /**
     * Reads the locale that follows the opening delimiter, up to and including the closing delimiter.
     * <p>
     * Parts are assigned by position, so an empty country (as in <code>fr__Variant</code>) stays empty
     * instead of being filled with the variant.
     *
     * @return the locale, or <code>null</code> if the stream ends before the closing delimiter, the prefix is
     *         longer than {@link #MAX_LOCALE_LENGTH} characters, or it has more than three parts
     */
    private Locale readLocale(BufferedReader breader) throws IOException
    {
        String[] parts = { "", "", "" };
        int partCount = 0;
        StringBuilder builder = new StringBuilder();
        int consumed = 0;
        int next;
        while ((next = breader.read()) != LOCALE_DELIMITER)
        {
            // read() reports end of stream as -1; test it before any cast to char hides it
            if (next == -1 || consumed++ >= MAX_LOCALE_LENGTH)
            {
                return null;
            }
            if (next == '_')
            {
                if (partCount == MAX_LOCALE_PARTS)
                {
                    return null;
                }
                parts[partCount++] = builder.toString();
                builder = new StringBuilder();
            }
            else
            {
                builder.append((char) next);
            }
        }
        if (builder.length() > 0)
        {
            if (partCount == MAX_LOCALE_PARTS)
            {
                return null;
            }
            parts[partCount] = builder.toString();
        }
        return new Locale(parts[0], parts[1], parts[2]);
    }

    /**
     * @return the analyser for the locale reported by {@link I18NUtil#getLocale()}
     */
    private Analyzer getDefaultAnalyser()
    {
        return getAnalyser(I18NUtil.getLocale());
    }

    /**
     * Returns the cached analyser for a locale, loading and caching it on first use.
     */
    private Analyzer getAnalyser(Locale locale)
    {
        Analyzer analyser = analysers.get(locale);
        if (analyser == null)
        {
            analyser = findAnalyser(locale);
        }
        return analyser;
    }

    /**
     * Loads the analyser for a locale and stores it in the cache.
     */
    private Analyzer findAnalyser(Locale locale)
    {
        Analyzer analyser = loadAnalyzer(locale);
        analysers.put(locale, analyser);
        return analyser;
    }

    /**
     * Instantiates the analyser class that the dictionary's text data type names for the locale.
     *
     * @throws RuntimeException if the class cannot be found or instantiated; the original failure is kept as
     *             the cause
     */
    private Analyzer loadAnalyzer(Locale locale)
    {
        DataTypeDefinition dataType = dictionaryService.getDataType(DataTypeDefinition.TEXT);
        String analyserClassName = dataType.getAnalyserClassName(locale);
        if (s_logger.isDebugEnabled())
        {
            s_logger.debug("Loading " + analyserClassName + " for " + locale);
        }
        try
        {
            Class<?> clazz = Class.forName(analyserClassName);
            return (Analyzer) clazz.newInstance();
        }
        catch (ClassNotFoundException e)
        {
            throw analyserLoadFailure(dataType, analyserClassName, e);
        }
        catch (InstantiationException e)
        {
            throw analyserLoadFailure(dataType, analyserClassName, e);
        }
        catch (IllegalAccessException e)
        {
            throw analyserLoadFailure(dataType, analyserClassName, e);
        }
    }

    /**
     * Builds the exception reported when an analyser class cannot be loaded, keeping the underlying cause.
     */
    private RuntimeException analyserLoadFailure(DataTypeDefinition dataType, String analyserClassName,
            Exception cause)
    {
        return new RuntimeException("Unable to load analyser for property of type "
                + dataType.getName() + " using " + analyserClassName, cause);
    }
}

View File

@@ -1,148 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import org.alfresco.repo.search.MLAnalysisMode;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/**
 * Creates duplicate tokens for multilingual variants. Each source token is emitted once per prefix, where the
 * prefixes are derived from the locales returned by {@link MLAnalysisMode#getLocales}. The possible forms are:
 * <ul>
 * <li>Token - all languages</li>
 * <li>{fr}Token - if a language is specified</li>
 * <li>{fr_CA}Token - if a language and country are specified</li>
 * <li>{fr_CA_Variant}Token - for all three</li>
 * <li>{fr__Variant}Token - for a language variant with no country</li>
 * </ul>
 *
 * @author andyh
 */
public class MLTokenDuplicator extends Tokenizer
{
    private static Log s_logger = LogFactory.getLog(MLTokenDuplicator.class);

    /** Stream supplying the tokens to duplicate. */
    TokenStream source;

    /** Locale the prefixes were derived from. */
    Locale locale;

    /** Duplicates of the current source token that have not been returned yet. */
    Iterator<Token> it;

    /** Prefixes to put in front of each token's text; the empty string yields the plain token. */
    HashSet<String> prefixes;

    /**
     * @param source the stream whose tokens are duplicated
     * @param locale the locale used to work out the prefixes
     * @param reader the reader handed to the {@link Tokenizer} superclass
     * @param mlAnalaysisMode the mode that decides which locales, and therefore prefixes, apply
     */
    public MLTokenDuplicator(TokenStream source, Locale locale, Reader reader, MLAnalysisMode mlAnalaysisMode)
    {
        super(reader);
        this.source = source;
        this.locale = locale;

        Collection<Locale> locales = MLAnalysisMode.getLocales(mlAnalaysisMode, locale, false);
        prefixes = new HashSet<String>(locales.size());
        for (Locale toAdd : locales)
        {
            String localeString = toAdd.toString();
            // a locale with an empty string form stands for the plain, unprefixed token
            prefixes.add(localeString.length() == 0 ? "" : "{" + localeString + "}");
        }
        if (s_logger.isDebugEnabled())
        {
            s_logger.debug("Locale "+ locale +" using "+mlAnalaysisMode+" is "+prefixes);
        }
    }

    /**
     * Creates a duplicator with no source stream and no reader; only {@link #buildIterator(Token)} is usable
     * on such an instance.
     *
     * @param locale the locale used to work out the prefixes
     * @param mlAnalaysisMode the mode that decides which locales, and therefore prefixes, apply
     */
    public MLTokenDuplicator(Locale locale, MLAnalysisMode mlAnalaysisMode)
    {
        this(null, locale, null, mlAnalaysisMode);
    }

    /**
     * Returns the next duplicate, moving on to the next source token once all duplicates of the current one
     * have been handed out.
     *
     * @return the next token, or <code>null</code> when the source stream is exhausted
     * @throws IOException if the source stream fails
     */
    @Override
    public Token next() throws IOException
    {
        while (true)
        {
            if (it == null)
            {
                it = buildIterator();
                if (it == null)
                {
                    // the source has no more tokens
                    return null;
                }
            }
            if (it.hasNext())
            {
                return it.next();
            }
            // every duplicate of the current token has been returned - advance to the next source token
            it = null;
        }
    }

    /**
     * Pulls the next token from the source and builds its duplicates.
     */
    private Iterator<Token> buildIterator() throws IOException
    {
        return buildIterator(source.next());
    }

    /**
     * Builds one copy of the token per prefix. The first copy keeps the position increment of the original;
     * all further copies get an increment of zero so they sit at the same position.
     *
     * @param token the token to duplicate, may be <code>null</code>
     * @return an iterator over the duplicates, or <code>null</code> if <code>token</code> is <code>null</code>
     */
    public Iterator<Token> buildIterator(Token token)
    {
        if (token == null)
        {
            return null;
        }

        ArrayList<Token> duplicates = new ArrayList<Token>(prefixes.size());
        boolean first = true;
        for (String prefix : prefixes)
        {
            Token duplicate = new Token(prefix + token.termText(), token.startOffset(), token.endOffset(),
                    token.type());
            duplicate.setPositionIncrement(first ? token.getPositionIncrement() : 0);
            first = false;
            duplicates.add(duplicate);
        }
        return duplicates.iterator();
    }
}

View File

@@ -1,30 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
/**
 * {@link SnowballAnalyzer} that is always constructed with the name "Norwegian".
 */
public class NorwegianSnowballAnalyser extends SnowballAnalyzer
{
    /** Name handed to the {@link SnowballAnalyzer} constructor. */
    private static final String STEMMER_NAME = "Norwegian";

    /**
     * Creates the analyser with the fixed name, so it can be instantiated without arguments.
     */
    public NorwegianSnowballAnalyser()
    {
        super(STEMMER_NAME);
    }
}

View File

@@ -1,228 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
/**
 * Support to encode numeric types in the lucene index.
 *
 * To support range queries in the lucene index numeric types need to be indexed
 * specially: every value is turned into a fixed-width, lower-case hex string
 * whose lexicographic order matches the numeric order of the values.
 *
 * For int and long the sign bit is flipped, so negative numbers (sign bit 0
 * after encoding) sort before positive numbers (sign bit 1).
 *
 * For float and double the IEEE 754 bit pattern is split into sign, exponent
 * and mantissa. If the number is negative the exponent and mantissa are XORed
 * against their masks, which reverses their order so that small negative
 * numbers come after big negative numbers. The sign bit is then flipped as for
 * the integer types.
 *
 * @author Andy Hind
 */
public class NumericEncoder
{
    /*
     * Constants for integer encoding
     */
    static final int INTEGER_SIGN_MASK = 0x80000000;

    /*
     * Constants for long encoding
     */
    static final long LONG_SIGN_MASK = 0x8000000000000000L;

    /*
     * Constants for float encoding
     */
    static final int FLOAT_SIGN_MASK = 0x80000000;

    static final int FLOAT_EXPONENT_MASK = 0x7F800000;

    static final int FLOAT_MANTISSA_MASK = 0x007FFFFF;

    /*
     * Constants for double encoding
     */
    static final long DOUBLE_SIGN_MASK = 0x8000000000000000L;

    static final long DOUBLE_EXPONENT_MASK = 0x7FF0000000000000L;

    static final long DOUBLE_MANTISSA_MASK = 0x000FFFFFFFFFFFFFL;

    /** Lower-case hex digits, indexed by nibble value. */
    private static final char[] DIGITS = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e',
            'f' };

    /** Mask selecting the lowest nibble. */
    private static final int MASK = (1 << 4) - 1;

    /** Number of hex digits in an encoded int or float. */
    private static final int INT_HEX_DIGITS = 8;

    /** Number of hex digits in an encoded long or double. */
    private static final int LONG_HEX_DIGITS = 16;

    /** Static utility class - not instantiable. */
    private NumericEncoder()
    {
    }

    /**
     * Encode an integer into a string that orders correctly using string
     * comparison. Integer.MIN_VALUE encodes as 00000000 and MAX_VALUE as
     * ffffffff.
     *
     * @param intToEncode the value to encode
     * @return the encoded string (always 8 hex digits)
     */
    public static String encode(int intToEncode)
    {
        return toHex(intToEncode ^ INTEGER_SIGN_MASK, INT_HEX_DIGITS);
    }

    /**
     * Encode a long into a string that orders correctly using string comparison.
     * Long.MIN_VALUE encodes as 0000000000000000 and MAX_VALUE as
     * ffffffffffffffff.
     *
     * @param longToEncode the value to encode
     * @return the encoded string (always 16 hex digits)
     */
    public static String encode(long longToEncode)
    {
        return toHex(longToEncode ^ LONG_SIGN_MASK, LONG_HEX_DIGITS);
    }

    /**
     * Decode a long previously produced by {@link #encode(long)}.
     *
     * Only the first 16 characters are read; upper-case hex digits are accepted.
     *
     * @param hex the encoded string
     * @return the decoded long
     * @throws NumberFormatException if one of the first 16 characters is not a hex digit
     * @throws StringIndexOutOfBoundsException if the string has fewer than 16 characters
     */
    public static long decodeLong(String hex)
    {
        return parseHex(hex, LONG_HEX_DIGITS) ^ LONG_SIGN_MASK;
    }

    /**
     * Decode an int previously produced by {@link #encode(int)}.
     *
     * Only the first 8 characters are read; upper-case hex digits are accepted.
     *
     * @param hex the encoded string
     * @return the decoded int
     * @throws NumberFormatException if one of the first 8 characters is not a hex digit
     * @throws StringIndexOutOfBoundsException if the string has fewer than 8 characters
     */
    public static int decodeInt(String hex)
    {
        // The narrowing cast keeps exactly the 32 bits that were parsed.
        return ((int) parseHex(hex, INT_HEX_DIGITS)) ^ INTEGER_SIGN_MASK;
    }

    /**
     * Encode a float into a string that orders correctly according to string
     * comparison. Note that there is no negative NaN but there are codings that
     * imply this. So NaN and -Infinity may not compare as expected.
     *
     * @param floatToEncode the value to encode
     * @return the encoded string (always 8 hex digits)
     */
    public static String encode(float floatToEncode)
    {
        int bits = Float.floatToIntBits(floatToEncode);
        int sign = bits & FLOAT_SIGN_MASK;
        int exponent = bits & FLOAT_EXPONENT_MASK;
        int mantissa = bits & FLOAT_MANTISSA_MASK;
        if (sign != 0)
        {
            // Negative: invert magnitude bits so larger magnitudes sort first.
            exponent ^= FLOAT_EXPONENT_MASK;
            mantissa ^= FLOAT_MANTISSA_MASK;
        }
        // Flip the sign so negatives (0) sort before positives (1).
        sign ^= FLOAT_SIGN_MASK;
        return toHex(sign | exponent | mantissa, INT_HEX_DIGITS);
    }

    /**
     * Encode a double into a string that orders correctly according to string
     * comparison. Note that there is no negative NaN but there are codings that
     * imply this. So NaN and -Infinity may not compare as expected.
     *
     * @param doubleToEncode the value to encode
     * @return the encoded string (always 16 hex digits)
     */
    public static String encode(double doubleToEncode)
    {
        long bits = Double.doubleToLongBits(doubleToEncode);
        long sign = bits & DOUBLE_SIGN_MASK;
        long exponent = bits & DOUBLE_EXPONENT_MASK;
        long mantissa = bits & DOUBLE_MANTISSA_MASK;
        if (sign != 0)
        {
            // Negative: invert magnitude bits so larger magnitudes sort first.
            exponent ^= DOUBLE_EXPONENT_MASK;
            mantissa ^= DOUBLE_MANTISSA_MASK;
        }
        // Flip the sign so negatives (0) sort before positives (1).
        sign ^= DOUBLE_SIGN_MASK;
        return toHex(sign | exponent | mantissa, LONG_HEX_DIGITS);
    }

    /**
     * Render the lowest <code>width</code> nibbles of <code>bits</code> as a
     * zero-padded, lower-case hex string. Higher bits (for example the sign
     * extension of a widened int) are ignored.
     */
    private static String toHex(long bits, int width)
    {
        char[] buf = new char[width];
        for (int pos = width - 1; pos >= 0; pos--)
        {
            buf[pos] = DIGITS[(int) (bits & MASK)];
            bits >>>= 4;
        }
        return new String(buf);
    }

    /**
     * Parse the first <code>digitCount</code> characters of <code>hex</code> as
     * an unsigned hex number, most significant digit first.
     */
    private static long parseHex(String hex, int digitCount)
    {
        if (hex.length() < digitCount)
        {
            // Same exception type a bare charAt() would raise, but with a usable message.
            throw new StringIndexOutOfBoundsException("Expected at least " + digitCount
                    + " hex digits but found " + hex.length() + ": " + hex);
        }
        long value = 0;
        for (int i = 0; i < digitCount; i++)
        {
            int digit = Character.digit(hex.charAt(i), 16);
            if (digit < 0)
            {
                // Character.digit signals "not a digit" with -1; never fold that into the value.
                throw new NumberFormatException("Invalid hex digit '" + hex.charAt(i) + "' at index " + i
                        + " in: " + hex);
            }
            value = (value << 4) | digit;
        }
        return value;
    }
}

View File

@@ -1,215 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import junit.framework.TestCase;
/**
 * Tests for the order-preserving string encoding provided by {@link NumericEncoder}.
 *
 * @author andyh
 */
public class NumericEncodingTest extends TestCase
{
    /**
     * Default constructor.
     */
    public NumericEncodingTest()
    {
        super();
    }

    /**
     * @param arg0 the test name, passed to {@link TestCase}
     */
    public NumericEncodingTest(String arg0)
    {
        super(arg0);
    }

    /**
     * Do an exhaustive test for integers.
     *
     * The method name does not start with "test", so JUnit 3 does not pick it up
     * automatically; rename it to run the full 2^32 sweep.
     */
    public void xtestAllIntegerEncodings()
    {
        String lastString = null;
        String nextString = null;
        // A long counter is required: an int could never exceed Integer.MAX_VALUE to end the loop.
        for (long i = Integer.MIN_VALUE; i <= Integer.MAX_VALUE; i++)
        {
            nextString = NumericEncoder.encode((int) i);
            if (lastString != null)
            {
                assertFalse(lastString.compareTo(nextString) > 0);
            }
            lastString = nextString;
        }
    }

    /**
     * Do an exhaustive test for float.
     *
     * Walks every bit pattern from the negative side (exponent and mantissa
     * descending) to the positive side (ascending). Ordering violations are only
     * reported on System.err; the method never fails. Like the exhaustive integer
     * sweep it is not picked up by JUnit 3 because of its name.
     */
    public void xtestAllFloatEncodings()
    {
        Float last = null;
        Float next = null;
        String lastString = null;
        String nextString = null;

        for (int sign = 1; sign >= 0; sign--)
        {
            if (sign == 0)
            {
                for (int exponent = 0; exponent <= 0xFF; exponent++)
                {
                    for (int mantissa = 0; mantissa <= 0x007FFFFF; mantissa++)
                    {
                        int bitPattern = sign << 31 | exponent << 23 | mantissa;
                        next = Float.intBitsToFloat(bitPattern);
                        if (!next.equals(Float.NaN) && (last != null) && (last.compareTo(next) > 0))
                        {
                            System.err.println(last + " > " + next);
                        }
                        if (!next.equals(Float.NaN))
                        {
                            nextString = NumericEncoder.encode(next);
                            if ((lastString != null) && (lastString.compareTo(nextString) > 0))
                            {
                                System.err.println(lastString + " > " + nextString);
                            }
                            lastString = nextString;
                        }
                        last = next;
                    }
                }
            }
            else
            {
                for (int exponent = 0xFF; exponent >= 0; exponent--)
                {
                    for (int mantissa = 0x007FFFFF; mantissa >= 0; mantissa--)
                    {
                        int bitPattern = sign << 31 | exponent << 23 | mantissa;
                        next = Float.intBitsToFloat(bitPattern);
                        if (!next.equals(Float.NaN) && (last != null) && (last.compareTo(next) > 0))
                        {
                            System.err.println(last + " > " + next);
                        }
                        if (!next.equals(Float.NaN))
                        {
                            nextString = NumericEncoder.encode(next);
                            if ((lastString != null) && (lastString.compareTo(nextString) > 0))
                            {
                                System.err.println(lastString + " > " + nextString);
                            }
                            lastString = nextString;
                        }
                        last = next;
                    }
                }
            }
        }
    }

    /**
     * Sample test for int: fixed encodings plus encode/decode round trips.
     */
    public void testIntegerEncoding()
    {
        assertEquals("00000000", NumericEncoder.encode(Integer.MIN_VALUE));
        assertEquals("00000001", NumericEncoder.encode(Integer.MIN_VALUE + 1));
        assertEquals("7fffffff", NumericEncoder.encode(-1));
        assertEquals("80000000", NumericEncoder.encode(0));
        assertEquals("80000001", NumericEncoder.encode(1));
        assertEquals("fffffffe", NumericEncoder.encode(Integer.MAX_VALUE - 1));
        assertEquals("ffffffff", NumericEncoder.encode(Integer.MAX_VALUE));

        assertEquals(Integer.MIN_VALUE, NumericEncoder.decodeInt(NumericEncoder.encode(Integer.MIN_VALUE)));
        assertEquals(Integer.MIN_VALUE + 1, NumericEncoder.decodeInt(NumericEncoder.encode(Integer.MIN_VALUE + 1)));
        assertEquals(-1, NumericEncoder.decodeInt(NumericEncoder.encode(-1)));
        assertEquals(0, NumericEncoder.decodeInt(NumericEncoder.encode(0)));
        assertEquals(1, NumericEncoder.decodeInt(NumericEncoder.encode(1)));
        assertEquals(Integer.MAX_VALUE - 1, NumericEncoder.decodeInt(NumericEncoder.encode(Integer.MAX_VALUE - 1)));
        assertEquals(Integer.MAX_VALUE, NumericEncoder.decodeInt(NumericEncoder.encode(Integer.MAX_VALUE)));
    }

    /**
     * Sample test for long: fixed encodings plus encode/decode round trips.
     */
    public void testLongEncoding()
    {
        assertEquals("0000000000000000", NumericEncoder.encode(Long.MIN_VALUE));
        assertEquals("0000000000000001", NumericEncoder.encode(Long.MIN_VALUE + 1));
        assertEquals("7fffffffffffffff", NumericEncoder.encode(-1L));
        assertEquals("8000000000000000", NumericEncoder.encode(0L));
        assertEquals("8000000000000001", NumericEncoder.encode(1L));
        assertEquals("fffffffffffffffe", NumericEncoder.encode(Long.MAX_VALUE - 1));
        assertEquals("ffffffffffffffff", NumericEncoder.encode(Long.MAX_VALUE));

        // Expected value first, so a failure message reports the two sides correctly.
        assertEquals(Long.MIN_VALUE, NumericEncoder.decodeLong(NumericEncoder.encode(Long.MIN_VALUE)));
        assertEquals(Long.MIN_VALUE + 1, NumericEncoder.decodeLong(NumericEncoder.encode(Long.MIN_VALUE + 1)));
        assertEquals(-1L, NumericEncoder.decodeLong(NumericEncoder.encode(-1L)));
        assertEquals(0L, NumericEncoder.decodeLong(NumericEncoder.encode(0L)));
        assertEquals(1L, NumericEncoder.decodeLong(NumericEncoder.encode(1L)));
        assertEquals(Long.MAX_VALUE - 1, NumericEncoder.decodeLong(NumericEncoder.encode(Long.MAX_VALUE - 1)));
        assertEquals(Long.MAX_VALUE, NumericEncoder.decodeLong(NumericEncoder.encode(Long.MAX_VALUE)));
    }

    /**
     * Sample test for float
     */
    public void testFloatEncoding()
    {
        assertEquals("007fffff", NumericEncoder.encode(Float.NEGATIVE_INFINITY));
        assertEquals("00800000", NumericEncoder.encode(-Float.MAX_VALUE));
        assertEquals("7ffffffe", NumericEncoder.encode(-Float.MIN_VALUE));
        assertEquals("7fffffff", NumericEncoder.encode(-0f));
        assertEquals("80000000", NumericEncoder.encode(0f));
        assertEquals("80000001", NumericEncoder.encode(Float.MIN_VALUE));
        assertEquals("ff7fffff", NumericEncoder.encode(Float.MAX_VALUE));
        assertEquals("ff800000", NumericEncoder.encode(Float.POSITIVE_INFINITY));
        assertEquals("ffc00000", NumericEncoder.encode(Float.NaN));
    }

    /**
     * Sample test for double, including a few relative-order checks.
     */
    public void testDoubleEncoding()
    {
        assertEquals("000fffffffffffff", NumericEncoder.encode(Double.NEGATIVE_INFINITY));
        assertEquals("0010000000000000", NumericEncoder.encode(-Double.MAX_VALUE));
        assertEquals("7ffffffffffffffe", NumericEncoder.encode(-Double.MIN_VALUE));
        assertEquals("7fffffffffffffff", NumericEncoder.encode(-0d));
        assertEquals("8000000000000000", NumericEncoder.encode(0d));
        assertEquals("8000000000000001", NumericEncoder.encode(Double.MIN_VALUE));
        assertEquals("ffefffffffffffff", NumericEncoder.encode(Double.MAX_VALUE));
        assertEquals("fff0000000000000", NumericEncoder.encode(Double.POSITIVE_INFINITY));
        assertEquals("fff8000000000000", NumericEncoder.encode(Double.NaN));

        assertTrue(NumericEncoder.encode(-0.9).compareTo(NumericEncoder.encode(0.88)) < 0);
        assertTrue(NumericEncoder.encode(-0.9).compareTo(NumericEncoder.encode(0.91)) < 0);
        assertTrue(NumericEncoder.encode(0.88).compareTo(NumericEncoder.encode(0.91)) < 0);
    }
}

View File

@@ -1,39 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
/**
 * Analyser for repository paths.
 *
 * Every request is answered with a {@link PathTokenFilter} configured from
 * that class's own constants, with namespace tokens switched on.
 *
 * @author andyh
 */
public class PathAnalyser extends Analyzer
{
    /**
     * Build the token stream for a path.
     *
     * @param fieldName the field being analysed (not used)
     * @param reader supplies the path text
     * @return a new {@link PathTokenFilter} reading from <code>reader</code>
     */
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
        final boolean includeNamespace = true;
        TokenStream pathTokens = new PathTokenFilter(
                reader,
                PathTokenFilter.PATH_SEPARATOR,
                PathTokenFilter.SEPARATOR_TOKEN_TEXT,
                PathTokenFilter.NO_NS_TOKEN_TEXT,
                PathTokenFilter.NAMESPACE_START_DELIMITER,
                PathTokenFilter.NAMESPACE_END_DELIMITER,
                includeNamespace);
        return pathTokens;
    }
}

View File

@@ -1,291 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.Iterator;
import java.util.LinkedList;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
/**
 * Tokenizer for repository paths.
 *
 * The input is read as one or more paths. Within a path, elements are separated by '/'; a ';' ends a
 * path. An element is either "{namespace}name", "prefix:name" or a bare "name". Separators that occur
 * between the namespace start and end delimiters are treated as part of the namespace.
 *
 * The whole input is tokenised on the first call to {@link #next()}. For each path the resulting list
 * holds: a zero-padded element-count token, then for every element an optional namespace (or namespace
 * prefix) token followed by a name token, then a separator token.
 *
 * @author andyh
 */
public class PathTokenFilter extends Tokenizer
{
// DecimalFormat pattern used to render the per-path element count (zero padded to 10 digits).
public final static String INTEGER_FORMAT = "0000000000";
// Default character that ends a path.
public final static char PATH_SEPARATOR = ';';
// Default delimiters around a full namespace, as in "{namespace}name".
public final static char NAMESPACE_START_DELIMITER = '{';
public final static char NAMESPACE_END_DELIMITER = '}';
// Default term text of the token emitted at the end of each path.
public final static String SEPARATOR_TOKEN_TEXT = ";";
// Default term text of the namespace token for elements without a namespace.
public final static String NO_NS_TOKEN_TEXT = "<No Namespace>";
// Token type used for separator tokens and also for the element-count tokens.
public final static String TOKEN_TYPE_PATH_SEP = "PATH_SEPARATOR";
// NOTE(review): not referenced anywhere in this class; count tokens use TOKEN_TYPE_PATH_SEP.
public final static String TOKEN_TYPE_PATH_LENGTH = "PATH_LENGTH";
// Token types for the local name, a full namespace and a namespace prefix respectively.
public final static String TOKEN_TYPE_PATH_ELEMENT_NAME = "PATH_ELEMENT_NAME";
public final static String TOKEN_TYPE_PATH_ELEMENT_NAMESPACE = "PATH_ELEMENT_NAMESPACE";
public final static String TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX = "PATH_ELEMENT_NAMESPACE_PREFIX";
// Configuration supplied through the constructor.
char pathSeparator;
String separatorTokenText;
String noNsTokenText;
char nsStartDelimiter;
// Delimiter lengths are always set to 1 by the constructor.
int nsStartDelimiterLength;
char nsEndDelimiter;
int nsEndDelimiterLength;
// Separates a namespace prefix from the local name ("prefix:name"); not configurable.
char nsPrefixDelimiter = ':';
// All tokens for the input, built lazily on the first call to next().
LinkedList<Token> tokens = new LinkedList<Token>();
// Iterator over tokens; null until the list has been built.
Iterator<Token> it = null;
// Whether namespace tokens are added to the output.
private boolean includeNamespace;
/**
 * @param in source of the path text
 * @param pathSeparator character that, as the last character of an element, marks the end of a path
 * @param separatorTokenText term text of the separator token emitted at the end of each path
 * @param noNsTokenText term text of the namespace token for elements that have no namespace
 * @param nsStartDelimiter character opening a full namespace
 * @param nsEndDelimiter character closing a full namespace
 * @param includeNameSpace true to emit namespace tokens, false to emit only name tokens
 */
public PathTokenFilter(Reader in, char pathSeparator, String separatorTokenText, String noNsTokenText,
char nsStartDelimiter, char nsEndDelimiter, boolean includeNameSpace)
{
super(in);
this.pathSeparator = pathSeparator;
this.separatorTokenText = separatorTokenText;
this.noNsTokenText = noNsTokenText;
this.nsStartDelimiter = nsStartDelimiter;
this.nsEndDelimiter = nsEndDelimiter;
this.includeNamespace = includeNameSpace;
this.nsStartDelimiterLength = 1;
this.nsEndDelimiterLength = 1;
}
/*
 * Returns the next token, or null once all tokens have been returned. The first call reads the
 * whole input and builds the complete token list.
 *
 * @see org.apache.lucene.analysis.TokenStream#next()
 */
public Token next() throws IOException
{
Token nextToken;
if (it == null)
{
// First call: tokenise everything up front.
buildTokenListAndIterator();
}
if (it.hasNext())
{
nextToken = it.next();
}
else
{
nextToken = null;
}
return nextToken;
}
/*
 * Reads every raw element via nextToken(), splits it into namespace and name tokens, and inserts an
 * element-count token in front of each path and a separator token after it. Finally points the
 * iterator at the finished list.
 */
private void buildTokenListAndIterator() throws IOException
{
NumberFormat nf = new DecimalFormat(INTEGER_FORMAT);
// Could optimise to read each path at a time - not just all paths
// Index in tokens at which the count token of the current path will be inserted.
int insertCountAt = 0;
// Number of elements seen so far in the current path.
int lengthCounter = 0;
Token t;
Token pathSplitToken = null;
Token nameToken = null;
Token countToken = null;
Token namespaceToken = null;
while ((t = nextToken()) != null)
{
String text = t.termText();
if (text.length() == 0)
{
continue; // Skip empty elements, e.g. from a leading '/' or from "//"
}
if (text.charAt(text.length() - 1) == pathSeparator)
{
// Last element of a path: drop the trailing separator and remember to close the path below.
// Note that an element consisting only of the separator is left as empty text here and still
// produces a (empty) name token.
text = text.substring(0, text.length() - 1);
pathSplitToken = new Token(separatorTokenText, t.startOffset(), t.endOffset(), TOKEN_TYPE_PATH_SEP);
pathSplitToken.setPositionIncrement(1);
}
// Index of the character that ends the namespace part, or -1 if there is none.
int split = -1;
boolean isPrefix = false;
if ((text.length() > 0) && (text.charAt(0) == nsStartDelimiter))
{
// "{namespace}name" form.
split = text.indexOf(nsEndDelimiter);
}
if (split == -1)
{
// No full namespace found: try the "prefix:name" form.
split = text.indexOf(nsPrefixDelimiter);
isPrefix = true;
}
if (split == -1)
{
// Bare name: use the "no namespace" marker, with a zero-length offset range.
namespaceToken = new Token(noNsTokenText, t.startOffset(), t.startOffset(),
TOKEN_TYPE_PATH_ELEMENT_NAMESPACE);
nameToken = new Token(text, t.startOffset(), t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
}
else
{
if (isPrefix)
{
// Text before the ':' is the prefix, text after it is the name.
namespaceToken = new Token(text.substring(0, split), t.startOffset(), t.startOffset() + split,
TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX);
nameToken = new Token(text.substring(split + 1), t.startOffset() + split + 1, t.endOffset(),
TOKEN_TYPE_PATH_ELEMENT_NAME);
}
else
{
// Text between the delimiters is the namespace, text after the end delimiter is the name.
namespaceToken = new Token(text.substring(nsStartDelimiterLength,
(split + nsEndDelimiterLength - 1)), t.startOffset(), t.startOffset() + split,
TOKEN_TYPE_PATH_ELEMENT_NAMESPACE);
nameToken = new Token(text.substring(split + nsEndDelimiterLength), t.startOffset()
+ split + nsEndDelimiterLength, t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
}
}
namespaceToken.setPositionIncrement(1);
nameToken.setPositionIncrement(1);
if (includeNamespace)
{
if (namespaceToken.termText().equals(""))
{
// An empty namespace or prefix (e.g. "{}name" or ":name") is replaced by the "no namespace" marker.
namespaceToken = new Token(noNsTokenText, t.startOffset(), t.startOffset(),
TOKEN_TYPE_PATH_ELEMENT_NAMESPACE);
namespaceToken.setPositionIncrement(1);
}
tokens.add(namespaceToken);
}
tokens.add(nameToken);
lengthCounter++;
if (pathSplitToken != null)
{
// End of a path: put its element count in front of its tokens and the separator behind them,
// then start counting the next path from the current end of the list.
String countString = nf.format(lengthCounter);
countToken = new Token(countString, t.startOffset(), t.endOffset(), TOKEN_TYPE_PATH_SEP);
countToken.setPositionIncrement(1);
tokens.add(insertCountAt, countToken);
tokens.add(pathSplitToken);
lengthCounter = 0;
insertCountAt = tokens.size();
pathSplitToken = null;
}
}
// Count token for the trailing path that was not closed by a separator. This is added even when
// that path has no elements (the count is then zero).
String countString = nf.format(lengthCounter);
countToken = new Token(countString, 0, 0, TOKEN_TYPE_PATH_SEP);
countToken.setPositionIncrement(1);
tokens.add(insertCountAt, countToken);
// NOTE(review): the list cannot be empty here (a count token was just added), and the term text of
// the last token is compared with the token *type* constant TOKEN_TYPE_PATH_SEP rather than with
// separatorTokenText or the token's type. As written a closing separator token is appended unless
// the last term text happens to be "PATH_SEPARATOR". Confirm the intent before changing: the
// emitted token sequence affects what is indexed.
if ((tokens.size() == 0) || !(tokens.get(tokens.size() - 1).termText().equals(TOKEN_TYPE_PATH_SEP)))
{
pathSplitToken = new Token(separatorTokenText, 0, 0, TOKEN_TYPE_PATH_SEP);
pathSplitToken.setPositionIncrement(1);
tokens.add(pathSplitToken);
}
it = tokens.iterator();
}
// Number of characters read so far; set to -1 once the end of the input has been reached.
int readerPosition = 0;
/*
 * Reads one raw path element from the input.
 *
 * Returns a "QNAME" token when the element is ended by '/' (the '/' is not part of the text) or by
 * the end of the input, and a "LASTQNAME" token when it is ended by ';' (the ';' is kept as the last
 * character of the text). Returns null once the end of the input has been consumed.
 *
 * Throws IllegalStateException if the input ends inside a namespace.
 */
private Token nextToken() throws IOException
{
if (readerPosition == -1)
{
return null;
}
StringBuilder buffer = new StringBuilder(64);
// True while between a namespace start and end delimiter; '/' and ';' are not separators there.
boolean inNameSpace = false;
int start = readerPosition;
int current;
char c;
while ((current = input.read()) != -1)
{
c = (char) current;
readerPosition++;
if (c == nsStartDelimiter)
{
inNameSpace = true;
}
else if (c == nsEndDelimiter)
{
inNameSpace = false;
}
// NOTE(review): the element separator '/' is hard-coded.
else if (!inNameSpace && (c == '/'))
{
// readerPosition has already moved past the '/', so the end offset is the index of the '/'.
return new Token(buffer.toString(), start, readerPosition - 1, "QNAME");
}
// NOTE(review): compares with the literal ';' rather than the pathSeparator field; the two only
// agree when the filter is constructed with PATH_SEPARATOR - confirm this is intended.
else if (!inNameSpace && (c == ';'))
{
buffer.append(c);
return new Token(buffer.toString(), start, readerPosition, "LASTQNAME");
}
// Everything else, including the namespace delimiters themselves, is part of the element text.
buffer.append(c);
}
// End of input: whatever has been buffered (possibly nothing) is the final element.
int end = readerPosition - 1;
readerPosition = -1;
if (!inNameSpace)
{
return new Token(buffer.toString(), start, end, "QNAME");
}
else
{
throw new IllegalStateException("QName terminated incorrectly: " + buffer.toString());
}
}
}

View File

@@ -1,133 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.IOException;
import java.io.StringReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
 * Tests that {@link PathTokenFilter} splits paths into alternating namespace (or namespace prefix)
 * and name tokens.
 */
public class PathTokenFilterTest extends TestCase
{
    public PathTokenFilterTest()
    {
        super();
    }

    /**
     * @param arg0 the test name, passed to {@link TestCase}
     */
    public PathTokenFilterTest(String arg0)
    {
        super(arg0);
    }

    /**
     * Paths whose elements all use the "{namespace}name" form.
     */
    public void testFullPath() throws IOException
    {
        tokenise("{uri1}one", new String[]{"uri1", "one"});
        tokenise("/{uri1}one", new String[]{"uri1", "one"});
        tokenise("{uri1}one/{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
        tokenise("/{uri1}one/{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
        tokenise("{uri1}one/{uri2}two/{uri3}three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
        tokenise("/{uri1}one/{uri2}two/{uri3}three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
        tokeniseToleratingIllegalState("{uri1}one;{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
    }

    /**
     * Paths whose elements all use the "prefix:name" form.
     */
    public void testPrefixPath() throws IOException
    {
        tokenise("uri1:one", new String[]{"uri1", "one"});
        tokenise("/uri1:one", new String[]{"uri1", "one"});
        tokenise("uri1:one/uri2:two/", new String[]{"uri1", "one", "uri2", "two"});
        tokenise("/uri1:one/uri2:two/", new String[]{"uri1", "one", "uri2", "two"});
        tokenise("uri1:one/uri2:two/uri3:three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
        tokenise("/uri1:one/uri2:two/uri3:three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
        tokeniseToleratingIllegalState("{uri1}one;{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
    }

    /**
     * Paths mixing the "{namespace}name" and "prefix:name" forms.
     */
    public void testMixedPath() throws IOException
    {
        tokenise("{uri1}one/uri2:two/", new String[]{"uri1", "one", "uri2", "two"});
        tokenise("/{uri1}one/uri2:two/", new String[]{"uri1", "one", "uri2", "two"});
        tokenise("uri1:one/{uri2}two/uri3:three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
        tokenise("/uri1:one/{uri2}two/uri3:three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
        tokeniseToleratingIllegalState("{uri1}one;{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
    }

    /**
     * Runs {@link #tokenise(String, String[])} but accepts an IllegalStateException as a valid outcome.
     *
     * NOTE(review): the original tests swallowed this exception silently for the ';'-separated input;
     * whether the exception or the token list is the expected result is not stated - confirm.
     */
    private void tokeniseToleratingIllegalState(String path, String[] tokens) throws IOException
    {
        try
        {
            tokenise(path, tokens);
        }
        catch (IllegalStateException ignored)
        {
            // Deliberately tolerated, see the method comment.
        }
    }

    /**
     * Tokenises <code>path</code> with namespaces included and checks that the namespace/prefix and
     * name tokens match <code>tokens</code> in order. Tokens of any other type (counts, separators)
     * are skipped.
     */
    private void tokenise(String path, String[] tokens) throws IOException
    {
        StringReader reader = new StringReader(path);
        TokenStream ts = new PathTokenFilter(reader, PathTokenFilter.PATH_SEPARATOR,
                PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
                PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);
        Token t;
        int i = 0;
        while ((t = ts.next()) != null)
        {
            String type = t.type();
            boolean isNamespace = type.equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE)
                    || type.equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX);
            if (isNamespace)
            {
                // JUnit assertion rather than the assert keyword, which is a no-op without -ea.
                assertTrue("Namespace token at odd position " + i, i % 2 == 0);
                assertEquals(tokens[i++], t.termText());
            }
            else if (type.equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAME))
            {
                assertTrue("Name token at even position " + i, i % 2 == 1);
                assertEquals(tokens[i++], t.termText());
            }
        }
        if (i != tokens.length)
        {
            fail("Invalid number of tokens, found " + i + " and expected " + tokens.length);
        }
    }
}

View File

@@ -1,30 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
/**
 * {@link SnowballAnalyzer} that is always constructed with the name "Porter".
 */
public class PorterSnowballAnalyser extends SnowballAnalyzer
{
    /** Name handed to the {@link SnowballAnalyzer} constructor. */
    private static final String STEMMER_NAME = "Porter";

    /**
     * Creates the analyser; the name is fixed, so no arguments are required.
     */
    public PorterSnowballAnalyser()
    {
        super(STEMMER_NAME);
    }
}

View File

@@ -1,30 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
/**
 * {@link SnowballAnalyzer} that is always constructed with the name "Portuguese".
 */
public class PortugueseSnowballAnalyser extends SnowballAnalyzer
{
    /** Name handed to the {@link SnowballAnalyzer} constructor. */
    private static final String STEMMER_NAME = "Portuguese";

    /**
     * Creates the analyser; the name is fixed, so no arguments are required.
     */
    public PortugueseSnowballAnalyser()
    {
        super(STEMMER_NAME);
    }
}

View File

@@ -1,30 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
/**
 * {@link SnowballAnalyzer} that is always constructed with the name "Russian".
 */
public class RussianSnowballAnalyser extends SnowballAnalyzer
{
    /** Name handed to the {@link SnowballAnalyzer} constructor. */
    private static final String STEMMER_NAME = "Russian";

    /**
     * Creates the analyser; the name is fixed, so no arguments are required.
     */
    public RussianSnowballAnalyser()
    {
        super(STEMMER_NAME);
    }
}

View File

@@ -1,30 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
/**
 * {@link SnowballAnalyzer} that is always constructed with the name "Spanish".
 */
public class SpanishSnowballAnalyser extends SnowballAnalyzer
{
    /** Name handed to the {@link SnowballAnalyzer} constructor. */
    private static final String STEMMER_NAME = "Spanish";

    /**
     * Creates the analyser; the name is fixed, so no arguments are required.
     */
    public SpanishSnowballAnalyser()
    {
        super(STEMMER_NAME);
    }
}

View File

@@ -1,30 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
/**
 * {@link SnowballAnalyzer} that is always constructed with the name "Swedish".
 */
public class SwedishSnowballAnalyser extends SnowballAnalyzer
{
    /** Name handed to the {@link SnowballAnalyzer} constructor. */
    private static final String STEMMER_NAME = "Swedish";

    /**
     * Creates the analyser; the name is fixed, so no arguments are required.
     */
    public SwedishSnowballAnalyser()
    {
        super(STEMMER_NAME);
    }
}

View File

@@ -1,45 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
/**
 * Analyser whose token streams are {@link VerbatimTokenFilter}s, optionally asked to lower-case.
 */
public class VerbatimAnalyser extends Analyzer
{
    /** Passed to every {@link VerbatimTokenFilter} this analyser creates. */
    boolean lowerCase;

    /**
     * Creates an analyser that does not request lower-casing.
     */
    public VerbatimAnalyser()
    {
        this(false);
    }

    /**
     * @param lowerCase the lower-case flag handed to each {@link VerbatimTokenFilter}
     */
    public VerbatimAnalyser(boolean lowerCase)
    {
        super();
        this.lowerCase = lowerCase;
    }

    /**
     * @param fieldName the field being analysed (not used)
     * @param reader supplies the text
     * @return a new {@link VerbatimTokenFilter} over <code>reader</code>
     */
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
        TokenStream verbatim = new VerbatimTokenFilter(reader, lowerCase);
        return verbatim;
    }
}

View File

@@ -1,163 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Locale;
import org.alfresco.repo.search.MLAnalysisMode;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
/**
 * Verbatim analyser for multilingual text.
 *
 * The text may start with a locale prefix of the form
 * <code>'\u0000' language [ '_' country [ '_' variant ] ] '\u0000'</code>. When a well-formed prefix is
 * found, the remaining text is tokenised verbatim and wrapped in an {@link MLTokenDuplicator} for that
 * locale. Otherwise the reader is rewound and the whole text is tokenised verbatim.
 */
public class VerbatimMLAnalayser extends Analyzer
{
    private static Log s_logger = LogFactory.getLog(VerbatimMLAnalayser.class);

    /** Maximum number of characters accepted between the two '\u0000' markers. */
    private static final int MAX_LOCALE_LENGTH = 100;

    /**
     * Read-ahead limit for the mark: the opening marker, up to MAX_LOCALE_LENGTH locale characters and
     * the one extra character whose read reveals that the limit has been exceeded.
     */
    private static final int READ_AHEAD_LIMIT = MAX_LOCALE_LENGTH + 2;

    private MLAnalysisMode mlAnalaysisMode;

    /**
     * @param mlAnalaysisMode the mode handed to every {@link MLTokenDuplicator} created
     */
    public VerbatimMLAnalayser(MLAnalysisMode mlAnalaysisMode)
    {
        this.mlAnalaysisMode = mlAnalaysisMode;
    }

    /**
     * @param fieldName the field being analysed, passed through to the verbatim analyser
     * @param reader supplies the text, optionally starting with a locale prefix
     * @return the token stream, locale-aware if a prefix was found
     * @throws AnalysisException if the reader cannot be marked, or cannot be reset after an I/O failure
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
        // We use read ahead to get the language info - if this does not exist we need to restart
        // and use the default - therefore we need mark and reset. A caller-supplied BufferedReader
        // already provides both, so it is used directly instead of being rejected.
        BufferedReader breader = (reader instanceof BufferedReader) ? (BufferedReader) reader
                : new BufferedReader(reader);
        try
        {
            if (!breader.markSupported())
            {
                throw new AnalysisException(
                        "Multilingual tokenisation requires a reader that supports marks and reset");
            }
            breader.mark(READ_AHEAD_LIMIT);
            Locale locale = readLocalePrefix(breader);
            if (locale == null)
            {
                // No usable prefix: rewind so the default analyser sees the complete text.
                breader.reset();
                return getAnalyser().tokenStream(fieldName, breader);
            }
            // leave the reader where it is ....
            return new MLTokenDuplicator(getAnalyser().tokenStream(fieldName, breader), locale, breader,
                    mlAnalaysisMode);
        }
        catch (IOException io)
        {
            try
            {
                breader.reset();
            }
            catch (IOException e)
            {
                throw new AnalysisException("Failed to reset buffered reader - token stream will be invalid", e);
            }
            return getAnalyser().tokenStream(fieldName, breader);
        }
    }

    /**
     * Tries to consume a locale prefix from the reader.
     *
     * @return the locale, or null if there is no well-formed prefix: the text does not start with
     *         '\u0000', the closing '\u0000' is missing or too far away, or there are more than three
     *         non-empty segments. On null the reader position is undefined and must be reset.
     */
    private static Locale readLocalePrefix(BufferedReader breader) throws IOException
    {
        if (breader.read() != '\u0000')
        {
            return null;
        }
        // language, country, variant - filled in that order.
        String[] parts = { "", "", "" };
        StringBuilder segment = new StringBuilder();
        int count = 0;
        int current;
        while ((current = breader.read()) != '\u0000')
        {
            if (current == -1)
            {
                // End of stream before the closing marker.
                return null;
            }
            if (count++ >= MAX_LOCALE_LENGTH)
            {
                return null;
            }
            char c = (char) current;
            if (c == '_')
            {
                if (!assignNextPart(parts, segment.toString()))
                {
                    return null;
                }
                segment = new StringBuilder();
            }
            else
            {
                segment.append(c);
            }
        }
        if ((segment.length() > 0) && !assignNextPart(parts, segment.toString()))
        {
            return null;
        }
        return new Locale(parts[0], parts[1], parts[2]);
    }

    /**
     * Stores <code>value</code> in the first part that is still empty.
     *
     * @return false if all parts are already filled
     */
    private static boolean assignNextPart(String[] parts, String value)
    {
        for (int i = 0; i < parts.length; i++)
        {
            if (parts[i].length() == 0)
            {
                parts[i] = value;
                return true;
            }
        }
        return false;
    }

    /**
     * @return a new case-preserving {@link VerbatimAnalyser}
     */
    private Analyzer getAnalyser()
    {
        return new VerbatimAnalyser(false);
    }
}

View File

@@ -1,67 +0,0 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
/**
 * Tokenizer that returns the complete input as one single token of type "VERBATIM", optionally
 * lower-cased.
 */
public class VerbatimTokenFilter extends Tokenizer
{
    /** True until the single token has been produced. */
    boolean readInput = true;

    /** Whether the token text is lower-cased. */
    boolean lowerCase;

    /**
     * @param in supplies the text
     * @param lowerCase true to lower-case the token text
     */
    VerbatimTokenFilter(Reader in, boolean lowerCase)
    {
        super(in);
        this.lowerCase = lowerCase;
    }

    /**
     * Returns the whole remaining input as one token on the first call and null on every later call.
     *
     * The token covers offsets [0, length): the end offset is exclusive, as Lucene defines it, so an
     * empty input yields the range [0, 0) rather than a negative end offset.
     */
    @Override
    public Token next() throws IOException
    {
        if (!readInput)
        {
            return null;
        }
        readInput = false;

        // Read in chunks rather than one call per character.
        StringBuilder buffer = new StringBuilder();
        char[] chunk = new char[1024];
        int read;
        while ((read = input.read(chunk, 0, chunk.length)) != -1)
        {
            buffer.append(chunk, 0, read);
        }

        String token = buffer.toString();
        if (lowerCase)
        {
            // NOTE(review): uses the JVM default locale; results differ on e.g. Turkish locales.
            // Left unchanged because switching locale would change the terms already indexed.
            token = token.toLowerCase();
        }
        return new Token(token, 0, token.length(), "VERBATIM");
    }
}