Updates for locale-based searching and indexing

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@4737 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Andrew Hind
2007-01-05 13:07:30 +00:00
parent f2c6f03164
commit 736a7f1ed6
9 changed files with 1412 additions and 139 deletions

View File

@@ -131,9 +131,9 @@ public class LuceneIndexerAndSearcherFactory2 implements LuceneIndexerAndSearche
private String lockDirectory;
private MLAnalysisMode defaultMLIndexAnalysisMode = MLAnalysisMode.LOCALE_AND_ALL;
private MLAnalysisMode defaultMLIndexAnalysisMode = MLAnalysisMode.EXACT_LANGUAGE_AND_ALL;
private MLAnalysisMode defaultMLSearchAnalysisMode = MLAnalysisMode.LOCALE_AND_ALL_CONTAINING_LOCALES_AND_ALL;
private MLAnalysisMode defaultMLSearchAnalysisMode = MLAnalysisMode.EXACT_LANGUAGE_AND_ALL;
/**
* Private constructor for the singleton. TODO: Fit in with IOC
@@ -1170,9 +1170,10 @@ public class LuceneIndexerAndSearcherFactory2 implements LuceneIndexerAndSearche
return defaultMLIndexAnalysisMode;
}
public void setDefaultMLIndexAnalysisMode(String mode)
public void setDefaultMLIndexAnalysisMode(MLAnalysisMode mode)
{
defaultMLIndexAnalysisMode = MLAnalysisMode.getMLAnalysisMode(mode);
//defaultMLIndexAnalysisMode = MLAnalysisMode.getMLAnalysisMode(mode);
defaultMLIndexAnalysisMode = mode;
}
public MLAnalysisMode getDefaultMLSearchAnalysisMode()
@@ -1180,9 +1181,10 @@ public class LuceneIndexerAndSearcherFactory2 implements LuceneIndexerAndSearche
return defaultMLSearchAnalysisMode;
}
public void setDefaultMLSearchAnalysisMode(String mode)
public void setDefaultMLSearchAnalysisMode(MLAnalysisMode mode)
{
defaultMLSearchAnalysisMode = MLAnalysisMode.getMLAnalysisMode(mode);
//defaultMLSearchAnalysisMode = MLAnalysisMode.getMLAnalysisMode(mode);
defaultMLSearchAnalysisMode = mode;
}

View File

@@ -18,6 +18,7 @@ package org.alfresco.repo.search.impl.lucene;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
@@ -26,6 +27,7 @@ import java.util.Locale;
import java.util.Set;
import org.alfresco.i18n.I18NUtil;
import org.alfresco.repo.search.MLAnalysisMode;
import org.alfresco.repo.search.SearcherException;
import org.alfresco.repo.search.impl.lucene.query.PathQuery;
import org.alfresco.service.cmr.dictionary.AspectDefinition;
@@ -59,6 +61,8 @@ public class LuceneQueryParser extends QueryParser
private SearchParameters searchParameters;
private LuceneConfig config;
/**
* Parses a query string, returning a {@link org.apache.lucene.search.Query}.
*
@@ -68,12 +72,13 @@ public class LuceneQueryParser extends QueryParser
* the default field for query terms.
* @param analyzer
* used to find terms in the query text.
* @param config
* @throws ParseException
* if the parsing fails
*/
static public Query parse(String query, String field, Analyzer analyzer,
NamespacePrefixResolver namespacePrefixResolver, DictionaryService dictionaryService,
Operator defaultOperator, SearchParameters searchParameters) throws ParseException
Operator defaultOperator, SearchParameters searchParameters, LuceneConfig config) throws ParseException
{
if (s_logger.isDebugEnabled())
{
@@ -84,6 +89,7 @@ public class LuceneQueryParser extends QueryParser
parser.setNamespacePrefixResolver(namespacePrefixResolver);
parser.setDictionaryService(dictionaryService);
parser.setSearchParameters(searchParameters);
parser.setLuceneConfig(config);
// TODO: Apply locale constraints at the top level if required for the non ML doc types.
Query result = parser.parse(query);
if (s_logger.isDebugEnabled())
@@ -93,6 +99,11 @@ public class LuceneQueryParser extends QueryParser
return result;
}
/**
* Sets the Lucene configuration for this parser.
* <p>
* The configuration supplies the default ML search analysis mode, which is used when the search
* parameters do not specify one.
*
* @param config
* the Lucene configuration to store on this parser
*/
private void setLuceneConfig(LuceneConfig config)
{
this.config = config;
}
private void setSearchParameters(SearchParameters searchParameters)
{
this.searchParameters = searchParameters;
@@ -158,7 +169,10 @@ public class LuceneQueryParser extends QueryParser
{
// The super implementation will create phrase queries etc if required
Query part = getFieldQuery("@" + qname.toString(), queryText);
query.add(part, Occur.SHOULD);
if (part != null)
{
query.add(part, Occur.SHOULD);
}
}
return query;
}
@@ -168,7 +182,10 @@ public class LuceneQueryParser extends QueryParser
for (String fieldName : text)
{
Query part = getFieldQuery(fieldName, queryText);
query.add(part, Occur.SHOULD);
if (part != null)
{
query.add(part, Occur.SHOULD);
}
}
return query;
}
@@ -251,7 +268,10 @@ public class LuceneQueryParser extends QueryParser
for (QName qname : subclasses)
{
TermQuery termQuery = new TermQuery(new Term(field, qname.toString()));
booleanQuery.add(termQuery, Occur.SHOULD);
if (termQuery != null)
{
booleanQuery.add(termQuery, Occur.SHOULD);
}
}
return booleanQuery;
}
@@ -333,7 +353,10 @@ public class LuceneQueryParser extends QueryParser
for (QName qname : subclasses)
{
TermQuery termQuery = new TermQuery(new Term(field, qname.toString()));
booleanQuery.add(termQuery, Occur.SHOULD);
if (termQuery != null)
{
booleanQuery.add(termQuery, Occur.SHOULD);
}
}
return booleanQuery;
}
@@ -369,7 +392,8 @@ public class LuceneQueryParser extends QueryParser
}
else if (field.startsWith("@"))
{
return attributeQueryBuilder(field, queryText, new FieldQuery());
Query query = attributeQueryBuilder(field, queryText, new FieldQuery());
return query;
}
else if (field.equals("ALL"))
{
@@ -414,9 +438,12 @@ public class LuceneQueryParser extends QueryParser
QName container = pd.getContainerClass().getName();
BooleanQuery query = new BooleanQuery();
Query typeQuery = getFieldQuery("TYPE", container.toString());
query.add(typeQuery, Occur.MUST);
Query presenceQuery = getWildcardQuery("@" + qname.toString(), "*");
query.add(presenceQuery, Occur.MUST_NOT);
if ((typeQuery != null) && (presenceQuery != null))
{
query.add(typeQuery, Occur.MUST);
query.add(presenceQuery, Occur.MUST_NOT);
}
return query;
}
else
@@ -435,9 +462,12 @@ public class LuceneQueryParser extends QueryParser
QName container = pd.getContainerClass().getName();
BooleanQuery query = new BooleanQuery();
Query typeQuery = getFieldQuery("TYPE", container.toString());
query.add(typeQuery, Occur.MUST);
Query presenceQuery = getWildcardQuery("@" + qname.toString(), "*");
query.add(presenceQuery, Occur.MUST);
if ((typeQuery != null) && (presenceQuery != null))
{
query.add(typeQuery, Occur.MUST);
query.add(presenceQuery, Occur.MUST);
}
return query;
}
else
@@ -455,7 +485,10 @@ public class LuceneQueryParser extends QueryParser
{
// The super implementation will create phrase queries etc if required
Query part = getFieldQuery("@" + qname.toString(), queryText);
query.add(part, Occur.SHOULD);
if (part != null)
{
query.add(part, Occur.SHOULD);
}
}
return query;
}
@@ -585,7 +618,10 @@ public class LuceneQueryParser extends QueryParser
{
// The super implementation will create phrase queries etc if required
Query part = getPrefixQuery("@" + qname.toString(), termStr);
query.add(part, Occur.SHOULD);
if (part != null)
{
query.add(part, Occur.SHOULD);
}
}
return query;
}
@@ -611,7 +647,10 @@ public class LuceneQueryParser extends QueryParser
{
// The super implementation will create phrase queries etc if required
Query part = getWildcardQuery("@" + qname.toString(), termStr);
query.add(part, Occur.SHOULD);
if (part != null)
{
query.add(part, Occur.SHOULD);
}
}
return query;
}
@@ -637,7 +676,10 @@ public class LuceneQueryParser extends QueryParser
{
// The super implementation will create phrase queries etc if required
Query part = getFuzzyQuery("@" + qname.toString(), termStr, minSimilarity);
query.add(part, Occur.SHOULD);
if (part != null)
{
query.add(part, Occur.SHOULD);
}
}
return query;
}
@@ -772,7 +814,10 @@ public class LuceneQueryParser extends QueryParser
StringBuilder builder = new StringBuilder(queryText.length() + 10);
builder.append("\u0000").append(locale.toString()).append("\u0000").append(queryText);
Query subQuery = subQueryBuilder.getQuery(expandedFieldName, builder.toString());
booleanQuery.add(subQuery, Occur.SHOULD);
if (subQuery != null)
{
booleanQuery.add(subQuery, Occur.SHOULD);
}
}
return booleanQuery;
}
@@ -781,21 +826,62 @@ public class LuceneQueryParser extends QueryParser
{
// Build a sub query for each locale and or the results together -
// - add an explicit condition for the locale
BooleanQuery booleanQuery = new BooleanQuery();
MLAnalysisMode analysisMode = searchParameters.getMlAnalaysisMode() == null ? config
.getDefaultMLSearchAnalysisMode() : searchParameters.getMlAnalaysisMode();
if (analysisMode.includesAll())
{
return subQueryBuilder.getQuery(expandedFieldName, queryText);
}
List<Locale> locales = searchParameters.getLocales();
List<Locale> expandedLocales = new ArrayList<Locale>();
for (Locale locale : (((locales == null) || (locales.size() == 0)) ? Collections.singletonList(I18NUtil
.getLocale()) : locales))
{
BooleanQuery subQuery = new BooleanQuery();
Query contentQuery = subQueryBuilder.getQuery(expandedFieldName, queryText);
subQuery.add(contentQuery, Occur.MUST);
StringBuilder builder = new StringBuilder();
builder.append(expandedFieldName).append(".locale");
Query localeQuery = getFieldQuery(builder.toString(), locale.toString());
subQuery.add(localeQuery, Occur.MUST);
booleanQuery.add(subQuery, Occur.SHOULD);
expandedLocales.addAll(MLAnalysisMode.getLocales(analysisMode, locale, true));
}
return booleanQuery;
if (expandedLocales.size() > 0)
{
BooleanQuery booleanQuery = new BooleanQuery();
Query contentQuery = subQueryBuilder.getQuery(expandedFieldName, queryText);
if (contentQuery != null)
{
booleanQuery.add(contentQuery, Occur.MUST);
BooleanQuery subQuery = new BooleanQuery();
for (Locale locale : (expandedLocales))
{
StringBuilder builder = new StringBuilder();
builder.append(expandedFieldName).append(".locale");
String localeString = locale.toString();
if (localeString.indexOf("*") == -1)
{
Query localeQuery = getFieldQuery(builder.toString(), localeString);
if (localeQuery != null)
{
subQuery.add(localeQuery, Occur.SHOULD);
}
}
else
{
Query localeQuery = getWildcardQuery(builder.toString(), localeString);
if (localeQuery != null)
{
subQuery.add(localeQuery, Occur.SHOULD);
}
}
}
booleanQuery.add(subQuery, Occur.MUST);
}
return booleanQuery;
}
else
{
return subQueryBuilder.getQuery(expandedFieldName, queryText);
}
}
else
{

View File

@@ -215,7 +215,7 @@ public class LuceneSearcherImpl2 extends LuceneBase2 implements LuceneSearcher2
}
Query query = LuceneQueryParser.parse(parameterisedQueryString, DEFAULT_FIELD, new LuceneAnalyser(
dictionaryService, searchParameters.getMlAnalaysisMode() == null ? getLuceneConfig().getDefaultMLSearchAnalysisMode() : searchParameters.getMlAnalaysisMode()), namespacePrefixResolver, dictionaryService, defaultOperator, searchParameters);
dictionaryService, searchParameters.getMlAnalaysisMode() == null ? getLuceneConfig().getDefaultMLSearchAnalysisMode() : searchParameters.getMlAnalaysisMode()), namespacePrefixResolver, dictionaryService, defaultOperator, searchParameters, getLuceneConfig());
ClosingIndexSearcher searcher = getSearcher(indexer);
if (searcher == null)
{

View File

@@ -2543,6 +2543,14 @@ public class LuceneTest2 extends TestCase
// Test stop words are equivalent
results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:\"the\"", null, null);
assertEquals(0, results.length());
results.close();
results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:\"and\"", null, null);
assertEquals(0, results.length());
results.close();
results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:\"over the lazy\"", null, null);
assertEquals(1, results.length());
results.close();
@@ -2685,11 +2693,41 @@ public class LuceneTest2 extends TestCase
results = searcher.query(sp);
assertEquals(1, results.length());
results.close();
// locale search in en_US for en_UK
sp = new SearchParameters();
sp.addStore(rootNodeRef.getStoreRef());
sp.setLanguage("lucene");
sp.setQuery("d\\:content:\"fox\"");
sp.addLocale(Locale.US);
results = searcher.query(sp);
assertEquals(1, results.length());
results.close();
// Direct ML tests
QName mlQName = QName.createQName(TEST_NAMESPACE, "ml");
sp = new SearchParameters();
sp.addStore(rootNodeRef.getStoreRef());
sp.setLanguage("lucene");
sp.setMlAnalaysisMode(MLAnalysisMode.ALL_ONLY);
sp.setQuery("@" + LuceneQueryParser.escape(mlQName.toString()) + ":and");
results = searcher.query(sp);
assertEquals(0, results.length());
results.close();
sp = new SearchParameters();
sp.addStore(rootNodeRef.getStoreRef());
sp.setLanguage("lucene");
sp.setMlAnalaysisMode(MLAnalysisMode.ALL_ONLY);
sp.setQuery("@" + LuceneQueryParser.escape(mlQName.toString()) + ":\"and\"");
results = searcher.query(sp);
assertEquals(0, results.length());
results.close();
sp = new SearchParameters();
sp.addStore(rootNodeRef.getStoreRef());
sp.setLanguage("lucene");

View File

@@ -3,6 +3,7 @@ package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
@@ -38,91 +39,20 @@ public class MLTokenDuplicator extends Tokenizer
this.source = source;
this.locale = locale;
boolean l = locale.getLanguage().length() != 0;
boolean c = locale.getCountry().length() != 0;
boolean v = locale.getVariant().length() != 0;
prefixes = new HashSet<String>(4);
if (mlAnalaysisMode.includesAll())
Collection<Locale> locales = MLAnalysisMode.getLocales(mlAnalaysisMode, locale, false);
prefixes = new HashSet<String>(locales.size());
for(Locale toAdd : locales)
{
prefixes.add("");
}
if (mlAnalaysisMode.includesExact())
{
StringBuffer result = new StringBuffer();
result.append("{").append(locale.toString()).append("}");
prefixes.add(result.toString());
}
if (mlAnalaysisMode.includesContaining())
{
if (v)
String localeString = toAdd.toString();
if(localeString.length() == 0)
{
Locale noVarient = new Locale(locale.getLanguage(), locale.getCountry(), "");
StringBuffer result = new StringBuffer();
result.append("{").append(noVarient.toString()).append("}");
prefixes.add(result.toString());
Locale noCountry = new Locale(locale.getLanguage(), "", "");
result = new StringBuffer();
result.append("{").append(noCountry.toString()).append("}");
prefixes.add(result.toString());
prefixes.add("");
}
if (c)
else
{
Locale noCountry = new Locale(locale.getLanguage(), "", "");
StringBuffer result = new StringBuffer();
result.append("{").append(noCountry.toString()).append("}");
prefixes.add(result.toString());
}
}
if (mlAnalaysisMode.includesContained())
{
// varients have not contained
if (!v)
{
if (!c)
{
if (!l)
{
// All
for (Locale toAdd : Locale.getAvailableLocales())
{
StringBuffer result = new StringBuffer();
result.append("{").append(toAdd.toString()).append("}");
prefixes.add(result.toString());
}
}
else
{
// All that match language
for (Locale toAdd : Locale.getAvailableLocales())
{
if (locale.getLanguage().equals(toAdd.getLanguage()))
{
StringBuffer result = new StringBuffer();
result.append("{").append(toAdd.toString()).append("}");
prefixes.add(result.toString());
}
}
}
}
else
{
// All that match language and country
for (Locale toAdd : Locale.getAvailableLocales())
{
if ((locale.getLanguage().equals(toAdd.getLanguage()))
&& (locale.getCountry().equals(toAdd.getCountry())))
{
StringBuffer result = new StringBuffer();
result.append("{").append(toAdd.toString()).append("}");
prefixes.add(result.toString());
}
}
}
StringBuilder builder = new StringBuilder(16);
builder.append("{").append(localeString).append("}");
prefixes.add(builder.toString());
}
}
if(s_logger.isDebugEnabled())