Fix fuzzy, prefix and wildcard queries for MLText typed attributes

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@5723 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
Andrew Hind
2007-05-18 15:38:16 +00:00
parent e3223d97fb
commit 518da86649
2 changed files with 128 additions and 15 deletions
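In outline, the LuceneQueryParser change below threads a new isAnalysed flag through attributeQueryBuilder: ordinary field queries keep the existing behaviour (the query text is prefixed with a locale marker and passed through analysis), while prefix, wildcard and fuzzy queries now expand the raw term into per-locale tokens themselves, so they can match the locale-marked terms in the index. A condensed sketch of that new branch, using only names that appear in the diff, follows; it is illustrative rather than a drop-in snippet:

// Condensed from the non-analysed branch added to attributeQueryBuilder(...) in the diff below.
// For prefix/wildcard/fuzzy terms the query text is not analysed at query time, so the verbatim
// term is duplicated once per locale demanded by the ML analysis mode and the clauses are OR'ed.
MLAnalysisMode analysisMode = searchParameters.getMlAnalaysisMode() == null
        ? config.getDefaultMLSearchAnalysisMode()
        : searchParameters.getMlAnalaysisMode();
VerbatimAnalyser vba = new VerbatimAnalyser(false);
MLTokenDuplicator duplicator = new MLTokenDuplicator(
        vba.tokenStream(field, new StringReader(queryText)), locale, null, analysisMode);
try
{
    Token t;
    while ((t = duplicator.next()) != null)
    {
        booleanQuery.add(subQueryBuilder.getQuery(expandedFieldName, t.termText()), Occur.SHOULD);
    }
}
catch (IOException e)
{
    // Tokenising a verbatim term should not fail; fall through to the NO_TOKENS guard below.
}
if (booleanQuery.getClauses().length == 0)
{
    booleanQuery.add(new TermQuery(new Term("NO_TOKENS", "__")), Occur.SHOULD);
}

The new tests in ADMLuceneTest exercise this for prefix ("cabba*"), wildcard ("*b?ag?"), and fuzzy ("cabage~") queries against an ML text attribute, in both the English and (via LOCALE_AND_ALL_CONTAINED_LOCALES) French locales.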

ADMLuceneTest.java

@@ -219,8 +219,6 @@ public class ADMLuceneTest extends TestCase
testTX = transactionService.getUserTransaction();
testTX.begin();
this.authenticationComponent.setSystemUserAsCurrentUser();
// load in the test model
@@ -316,7 +314,7 @@ public class ADMLuceneTest extends TestCase
// - and it has to go in type d:any as d:content is not allowed to be multivalued
ArrayList<Serializable> contentValues = new ArrayList<Serializable>();
contentValues.add(new ContentData(null, "text/plain", 0L, "UTF-16", Locale.CHINESE ));
contentValues.add(new ContentData(null, "text/plain", 0L, "UTF-16", Locale.UK ));
testProperties.put(QName.createQName(TEST_NAMESPACE, "content-many-ista"), contentValues);
@@ -381,7 +379,7 @@ public class ADMLuceneTest extends TestCase
getOrderProperties()).getChildRef();
Map<QName, Serializable> properties = new HashMap<QName, Serializable>();
properties.put(ContentModel.PROP_CONTENT, new ContentData(null, "text/plain", 0L, "UTF-8", Locale.CHINESE ));
properties.put(ContentModel.PROP_CONTENT, new ContentData(null, "text/plain", 0L, "UTF-8", Locale.UK ));
n14 = nodeService.createNode(n13, ASSOC_TYPE_QNAME, QName.createQName("{namespace}fourteen"),
ContentModel.TYPE_CONTENT, properties).getChildRef();
// nodeService.addAspect(n14, DictionaryBootstrap.ASPECT_QNAME_CONTENT,
@@ -2612,6 +2610,61 @@ public class ADMLuceneTest extends TestCase
assertEquals(1, results.length());
results.close();
sp = new SearchParameters();
sp.addStore(rootNodeRef.getStoreRef());
sp.setLanguage("lucene");
sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":cabba*");
sp.addLocale(new Locale("en"));
results = searcher.query(sp);
assertEquals(1, results.length());
results.close();
sp = new SearchParameters();
sp.addStore(rootNodeRef.getStoreRef());
sp.setLanguage("lucene");
sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":ca*ge");
sp.addLocale(new Locale("en"));
results = searcher.query(sp);
assertEquals(1, results.length());
results.close();
sp = new SearchParameters();
sp.addStore(rootNodeRef.getStoreRef());
sp.setLanguage("lucene");
sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":*bage");
sp.addLocale(new Locale("en"));
results = searcher.query(sp);
assertEquals(1, results.length());
results.close();
sp = new SearchParameters();
sp.addStore(rootNodeRef.getStoreRef());
sp.setLanguage("lucene");
sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":cabage~");
sp.addLocale(new Locale("en"));
results = searcher.query(sp);
assertEquals(1, results.length());
results.close();
sp = new SearchParameters();
sp.addStore(rootNodeRef.getStoreRef());
sp.setLanguage("lucene");
sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":*b?ag?");
sp.addLocale(new Locale("en"));
results = searcher.query(sp);
assertEquals(1, results.length());
results.close();
sp = new SearchParameters();
sp.addStore(rootNodeRef.getStoreRef());
sp.setLanguage("lucene");
sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":cho*");
sp.setMlAnalaysisMode(MLAnalysisMode.LOCALE_AND_ALL_CONTAINED_LOCALES);
sp.addLocale(new Locale("fr"));
results = searcher.query(sp);
assertEquals(1, results.length());
results.close();
// multivalued content in type d:any
// This should not be indexed as we can not know what to do with content here.
@@ -2756,6 +2809,31 @@ public class ADMLuceneTest extends TestCase
assertEquals(1, results.length());
results.close();
// Test wildcards in text
results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:laz*", null, null);
assertEquals(1, results.length());
results.close();
results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:laz~", null, null);
assertEquals(1, results.length());
results.close();
results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:la?y", null, null);
assertEquals(1, results.length());
results.close();
results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:?a?y", null, null);
assertEquals(1, results.length());
results.close();
results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:*azy", null, null);
assertEquals(1, results.length());
results.close();
results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:*az*", null, null);
assertEquals(1, results.length());
results.close();
// Accents

LuceneQueryParser.java

@@ -37,6 +37,8 @@ import java.util.Set;
import org.alfresco.i18n.I18NUtil;
import org.alfresco.repo.search.MLAnalysisMode;
import org.alfresco.repo.search.SearcherException;
import org.alfresco.repo.search.impl.lucene.analysis.MLTokenDuplicator;
import org.alfresco.repo.search.impl.lucene.analysis.VerbatimAnalyser;
import org.alfresco.repo.search.impl.lucene.query.PathQuery;
import org.alfresco.service.cmr.dictionary.AspectDefinition;
import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
@@ -48,6 +50,7 @@ import org.alfresco.service.namespace.NamespacePrefixResolver;
import org.alfresco.service.namespace.QName;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
@@ -408,7 +411,7 @@ public class LuceneQueryParser extends QueryParser
}
else if (field.startsWith("@"))
{
Query query = attributeQueryBuilder(field, queryText, new FieldQuery());
Query query = attributeQueryBuilder(field, queryText, new FieldQuery(), true);
return query;
}
else if (field.equals("ALL"))
@@ -636,7 +639,7 @@ public class LuceneQueryParser extends QueryParser
{
if (field.startsWith("@"))
{
return attributeQueryBuilder(field, termStr, new PrefixQuery());
return attributeQueryBuilder(field, termStr, new PrefixQuery(), false);
}
else if (field.equals("TEXT"))
{
@@ -668,7 +671,7 @@ public class LuceneQueryParser extends QueryParser
{
if (field.startsWith("@"))
{
return attributeQueryBuilder(field, termStr, new WildcardQuery());
return attributeQueryBuilder(field, termStr, new WildcardQuery(), false);
}
else if (field.equals("TEXT"))
@@ -701,7 +704,7 @@ public class LuceneQueryParser extends QueryParser
{
if (field.startsWith("@"))
{
return attributeQueryBuilder(field, termStr, new FuzzyQuery(minSimilarity));
return attributeQueryBuilder(field, termStr, new FuzzyQuery(minSimilarity), false);
}
else if (field.equals("TEXT"))
@@ -798,7 +801,7 @@ public class LuceneQueryParser extends QueryParser
}
}
private Query attributeQueryBuilder(String field, String queryText, SubQuery subQueryBuilder) throws ParseException
private Query attributeQueryBuilder(String field, String queryText, SubQuery subQueryBuilder, boolean isAnalysed) throws ParseException
{
// Expand prefixes
@@ -850,6 +853,9 @@ public class LuceneQueryParser extends QueryParser
List<Locale> locales = searchParameters.getLocales();
for (Locale locale : (((locales == null) || (locales.size() == 0)) ? Collections.singletonList(I18NUtil
.getLocale()) : locales))
{
if(isAnalysed)
{
StringBuilder builder = new StringBuilder(queryText.length() + 10);
builder.append("\u0000").append(locale.toString()).append("\u0000").append(queryText);
@@ -863,6 +869,35 @@ public class LuceneQueryParser extends QueryParser
booleanQuery.add(new TermQuery(new Term("NO_TOKENS", "__")), Occur.SHOULD);
}
}
else
{
// analyse ml text
MLAnalysisMode analysisMode = searchParameters.getMlAnalaysisMode() == null ? config
.getDefaultMLSearchAnalysisMode() : searchParameters.getMlAnalaysisMode();
// Do the analysis here
VerbatimAnalyser vba = new VerbatimAnalyser(false);
MLTokenDuplicator duplicator = new MLTokenDuplicator(vba.tokenStream(field, new StringReader(queryText)), locale, null, analysisMode);
Token t;
try
{
while( (t = duplicator.next()) != null)
{
Query subQuery = subQueryBuilder.getQuery(expandedFieldName, t.termText());
booleanQuery.add(subQuery, Occur.SHOULD);
}
}
catch (IOException e)
{
}
if(booleanQuery.getClauses().length == 0)
{
booleanQuery.add(new TermQuery(new Term("NO_TOKENS", "__")), Occur.SHOULD);
}
}
}
return booleanQuery;
}
// Content