Fix fuzzy, prefix and wildcard queries for MLText typed attributes

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@5723 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Andrew Hind
2007-05-18 15:38:16 +00:00
parent e3223d97fb
commit 518da86649
2 changed files with 128 additions and 15 deletions

View File

@@ -219,8 +219,6 @@ public class ADMLuceneTest extends TestCase
testTX = transactionService.getUserTransaction(); testTX = transactionService.getUserTransaction();
testTX.begin(); testTX.begin();
this.authenticationComponent.setSystemUserAsCurrentUser(); this.authenticationComponent.setSystemUserAsCurrentUser();
// load in the test model // load in the test model
@@ -316,7 +314,7 @@ public class ADMLuceneTest extends TestCase
// - and it has to go in type d:any as d:content is not allowed to be multivalued // - and it has to go in type d:any as d:content is not allowed to be multivalued
ArrayList<Serializable> contentValues = new ArrayList<Serializable>(); ArrayList<Serializable> contentValues = new ArrayList<Serializable>();
contentValues.add(new ContentData(null, "text/plain", 0L, "UTF-16", Locale.CHINESE )); contentValues.add(new ContentData(null, "text/plain", 0L, "UTF-16", Locale.UK ));
testProperties.put(QName.createQName(TEST_NAMESPACE, "content-many-ista"), contentValues); testProperties.put(QName.createQName(TEST_NAMESPACE, "content-many-ista"), contentValues);
@@ -381,7 +379,7 @@ public class ADMLuceneTest extends TestCase
getOrderProperties()).getChildRef(); getOrderProperties()).getChildRef();
Map<QName, Serializable> properties = new HashMap<QName, Serializable>(); Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
properties.put(ContentModel.PROP_CONTENT, new ContentData(null, "text/plain", 0L, "UTF-8", Locale.CHINESE )); properties.put(ContentModel.PROP_CONTENT, new ContentData(null, "text/plain", 0L, "UTF-8", Locale.UK ));
n14 = nodeService.createNode(n13, ASSOC_TYPE_QNAME, QName.createQName("{namespace}fourteen"), n14 = nodeService.createNode(n13, ASSOC_TYPE_QNAME, QName.createQName("{namespace}fourteen"),
ContentModel.TYPE_CONTENT, properties).getChildRef(); ContentModel.TYPE_CONTENT, properties).getChildRef();
// nodeService.addAspect(n14, DictionaryBootstrap.ASPECT_QNAME_CONTENT, // nodeService.addAspect(n14, DictionaryBootstrap.ASPECT_QNAME_CONTENT,
@@ -2612,6 +2610,61 @@ public class ADMLuceneTest extends TestCase
assertEquals(1, results.length()); assertEquals(1, results.length());
results.close(); results.close();
sp = new SearchParameters();
sp.addStore(rootNodeRef.getStoreRef());
sp.setLanguage("lucene");
sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":cabba*");
sp.addLocale(new Locale("en"));
results = searcher.query(sp);
assertEquals(1, results.length());
results.close();
sp = new SearchParameters();
sp.addStore(rootNodeRef.getStoreRef());
sp.setLanguage("lucene");
sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":ca*ge");
sp.addLocale(new Locale("en"));
results = searcher.query(sp);
assertEquals(1, results.length());
results.close();
sp = new SearchParameters();
sp.addStore(rootNodeRef.getStoreRef());
sp.setLanguage("lucene");
sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":*bage");
sp.addLocale(new Locale("en"));
results = searcher.query(sp);
assertEquals(1, results.length());
results.close();
sp = new SearchParameters();
sp.addStore(rootNodeRef.getStoreRef());
sp.setLanguage("lucene");
sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":cabage~");
sp.addLocale(new Locale("en"));
results = searcher.query(sp);
assertEquals(1, results.length());
results.close();
sp = new SearchParameters();
sp.addStore(rootNodeRef.getStoreRef());
sp.setLanguage("lucene");
sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":*b?ag?");
sp.addLocale(new Locale("en"));
results = searcher.query(sp);
assertEquals(1, results.length());
results.close();
sp = new SearchParameters();
sp.addStore(rootNodeRef.getStoreRef());
sp.setLanguage("lucene");
sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":cho*");
sp.setMlAnalaysisMode(MLAnalysisMode.LOCALE_AND_ALL_CONTAINED_LOCALES);
sp.addLocale(new Locale("fr"));
results = searcher.query(sp);
assertEquals(1, results.length());
results.close();
// multivalued content in type d:any // multivalued content in type d:any
// This should not be indexed as we cannot know what to do with content here. // This should not be indexed as we cannot know what to do with content here.
@@ -2756,6 +2809,31 @@ public class ADMLuceneTest extends TestCase
assertEquals(1, results.length()); assertEquals(1, results.length());
results.close(); results.close();
// Test wildcards in text
results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:laz*", null, null);
assertEquals(1, results.length());
results.close();
results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:laz~", null, null);
assertEquals(1, results.length());
results.close();
results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:la?y", null, null);
assertEquals(1, results.length());
results.close();
results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:?a?y", null, null);
assertEquals(1, results.length());
results.close();
results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:*azy", null, null);
assertEquals(1, results.length());
results.close();
results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:*az*", null, null);
assertEquals(1, results.length());
results.close();
// Accents // Accents

View File

@@ -37,6 +37,8 @@ import java.util.Set;
import org.alfresco.i18n.I18NUtil; import org.alfresco.i18n.I18NUtil;
import org.alfresco.repo.search.MLAnalysisMode; import org.alfresco.repo.search.MLAnalysisMode;
import org.alfresco.repo.search.SearcherException; import org.alfresco.repo.search.SearcherException;
import org.alfresco.repo.search.impl.lucene.analysis.MLTokenDuplicator;
import org.alfresco.repo.search.impl.lucene.analysis.VerbatimAnalyser;
import org.alfresco.repo.search.impl.lucene.query.PathQuery; import org.alfresco.repo.search.impl.lucene.query.PathQuery;
import org.alfresco.service.cmr.dictionary.AspectDefinition; import org.alfresco.service.cmr.dictionary.AspectDefinition;
import org.alfresco.service.cmr.dictionary.DataTypeDefinition; import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
@@ -48,6 +50,7 @@ import org.alfresco.service.namespace.NamespacePrefixResolver;
import org.alfresco.service.namespace.QName; import org.alfresco.service.namespace.QName;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery;
@@ -408,7 +411,7 @@ public class LuceneQueryParser extends QueryParser
} }
else if (field.startsWith("@")) else if (field.startsWith("@"))
{ {
Query query = attributeQueryBuilder(field, queryText, new FieldQuery()); Query query = attributeQueryBuilder(field, queryText, new FieldQuery(), true);
return query; return query;
} }
else if (field.equals("ALL")) else if (field.equals("ALL"))
@@ -636,7 +639,7 @@ public class LuceneQueryParser extends QueryParser
{ {
if (field.startsWith("@")) if (field.startsWith("@"))
{ {
return attributeQueryBuilder(field, termStr, new PrefixQuery()); return attributeQueryBuilder(field, termStr, new PrefixQuery(), false);
} }
else if (field.equals("TEXT")) else if (field.equals("TEXT"))
{ {
@@ -668,7 +671,7 @@ public class LuceneQueryParser extends QueryParser
{ {
if (field.startsWith("@")) if (field.startsWith("@"))
{ {
return attributeQueryBuilder(field, termStr, new WildcardQuery()); return attributeQueryBuilder(field, termStr, new WildcardQuery(), false);
} }
else if (field.equals("TEXT")) else if (field.equals("TEXT"))
@@ -701,7 +704,7 @@ public class LuceneQueryParser extends QueryParser
{ {
if (field.startsWith("@")) if (field.startsWith("@"))
{ {
return attributeQueryBuilder(field, termStr, new FuzzyQuery(minSimilarity)); return attributeQueryBuilder(field, termStr, new FuzzyQuery(minSimilarity), false);
} }
else if (field.equals("TEXT")) else if (field.equals("TEXT"))
@@ -798,7 +801,7 @@ public class LuceneQueryParser extends QueryParser
} }
} }
private Query attributeQueryBuilder(String field, String queryText, SubQuery subQueryBuilder) throws ParseException private Query attributeQueryBuilder(String field, String queryText, SubQuery subQueryBuilder, boolean isAnalysed) throws ParseException
{ {
// Expand prefixes // Expand prefixes
@@ -850,6 +853,9 @@ public class LuceneQueryParser extends QueryParser
List<Locale> locales = searchParameters.getLocales(); List<Locale> locales = searchParameters.getLocales();
for (Locale locale : (((locales == null) || (locales.size() == 0)) ? Collections.singletonList(I18NUtil for (Locale locale : (((locales == null) || (locales.size() == 0)) ? Collections.singletonList(I18NUtil
.getLocale()) : locales)) .getLocale()) : locales))
{
if(isAnalysed)
{ {
StringBuilder builder = new StringBuilder(queryText.length() + 10); StringBuilder builder = new StringBuilder(queryText.length() + 10);
builder.append("\u0000").append(locale.toString()).append("\u0000").append(queryText); builder.append("\u0000").append(locale.toString()).append("\u0000").append(queryText);
@@ -863,6 +869,35 @@ public class LuceneQueryParser extends QueryParser
booleanQuery.add(new TermQuery(new Term("NO_TOKENS", "__")), Occur.SHOULD); booleanQuery.add(new TermQuery(new Term("NO_TOKENS", "__")), Occur.SHOULD);
} }
} }
else
{
// analyse ml text
MLAnalysisMode analysisMode = searchParameters.getMlAnalaysisMode() == null ? config
.getDefaultMLSearchAnalysisMode() : searchParameters.getMlAnalaysisMode();
// Do the analysis here
VerbatimAnalyser vba = new VerbatimAnalyser(false);
MLTokenDuplicator duplicator = new MLTokenDuplicator(vba.tokenStream(field, new StringReader(queryText)), locale, null, analysisMode);
Token t;
try
{
while( (t = duplicator.next()) != null)
{
Query subQuery = subQueryBuilder.getQuery(expandedFieldName, t.termText());
booleanQuery.add(subQuery, Occur.SHOULD);
}
}
catch (IOException e)
{
}
if(booleanQuery.getClauses().length == 0)
{
booleanQuery.add(new TermQuery(new Term("NO_TOKENS", "__")), Occur.SHOULD);
}
}
}
return booleanQuery; return booleanQuery;
} }
// Content // Content