From 518da8664921a08fd56584174f9dac5938b041e6 Mon Sep 17 00:00:00 2001 From: Andrew Hind Date: Fri, 18 May 2007 15:38:16 +0000 Subject: [PATCH] Fix fuzzy, prefix and wildcard queries for MLText typed attributes git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@5723 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261 --- .../search/impl/lucene/ADMLuceneTest.java | 86 ++++++++++++++++++- .../search/impl/lucene/LuceneQueryParser.java | 57 +++++++++--- 2 files changed, 128 insertions(+), 15 deletions(-) diff --git a/source/java/org/alfresco/repo/search/impl/lucene/ADMLuceneTest.java b/source/java/org/alfresco/repo/search/impl/lucene/ADMLuceneTest.java index 8e4180df27..8f9ba86c63 100644 --- a/source/java/org/alfresco/repo/search/impl/lucene/ADMLuceneTest.java +++ b/source/java/org/alfresco/repo/search/impl/lucene/ADMLuceneTest.java @@ -219,8 +219,6 @@ public class ADMLuceneTest extends TestCase testTX = transactionService.getUserTransaction(); testTX.begin(); - - this.authenticationComponent.setSystemUserAsCurrentUser(); // load in the test model @@ -316,7 +314,7 @@ public class ADMLuceneTest extends TestCase // - andit has to go in type d:any as d:content is not allowed to be multivalued ArrayList contentValues = new ArrayList(); - contentValues.add(new ContentData(null, "text/plain", 0L, "UTF-16", Locale.CHINESE )); + contentValues.add(new ContentData(null, "text/plain", 0L, "UTF-16", Locale.UK )); testProperties.put(QName.createQName(TEST_NAMESPACE, "content-many-ista"), contentValues); @@ -381,7 +379,7 @@ public class ADMLuceneTest extends TestCase getOrderProperties()).getChildRef(); Map properties = new HashMap(); - properties.put(ContentModel.PROP_CONTENT, new ContentData(null, "text/plain", 0L, "UTF-8", Locale.CHINESE )); + properties.put(ContentModel.PROP_CONTENT, new ContentData(null, "text/plain", 0L, "UTF-8", Locale.UK )); n14 = nodeService.createNode(n13, ASSOC_TYPE_QNAME, QName.createQName("{namespace}fourteen"), ContentModel.TYPE_CONTENT, properties).getChildRef(); // nodeService.addAspect(n14, DictionaryBootstrap.ASPECT_QNAME_CONTENT, @@ -2612,6 +2610,61 @@ public class ADMLuceneTest extends TestCase assertEquals(1, results.length()); results.close(); + sp = new SearchParameters(); + sp.addStore(rootNodeRef.getStoreRef()); + sp.setLanguage("lucene"); + sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":cabba*"); + sp.addLocale(new Locale("en")); + results = searcher.query(sp); + assertEquals(1, results.length()); + results.close(); + + sp = new SearchParameters(); + sp.addStore(rootNodeRef.getStoreRef()); + sp.setLanguage("lucene"); + sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":ca*ge"); + sp.addLocale(new Locale("en")); + results = searcher.query(sp); + assertEquals(1, results.length()); + results.close(); + + sp = new SearchParameters(); + sp.addStore(rootNodeRef.getStoreRef()); + sp.setLanguage("lucene"); + sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":*bage"); + sp.addLocale(new Locale("en")); + results = searcher.query(sp); + assertEquals(1, results.length()); + results.close(); + + sp = new SearchParameters(); + sp.addStore(rootNodeRef.getStoreRef()); + sp.setLanguage("lucene"); + sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":cabage~"); + sp.addLocale(new Locale("en")); + results = searcher.query(sp); + assertEquals(1, results.length()); + results.close(); + + sp = new SearchParameters(); + sp.addStore(rootNodeRef.getStoreRef()); + sp.setLanguage("lucene"); + sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":*b?ag?"); + sp.addLocale(new Locale("en")); + results = searcher.query(sp); + assertEquals(1, results.length()); + results.close(); + + sp = new SearchParameters(); + sp.addStore(rootNodeRef.getStoreRef()); + sp.setLanguage("lucene"); + sp.setQuery("@" + LuceneQueryParser.escape(multimlQName.toString()) + ":cho*"); + sp.setMlAnalaysisMode(MLAnalysisMode.LOCALE_AND_ALL_CONTAINED_LOCALES); + sp.addLocale(new Locale("fr")); + results = searcher.query(sp); + assertEquals(1, results.length()); + results.close(); + // multivalued content in type d:any // This should not be indexed as we can not know what to do with content here. @@ -2755,7 +2808,32 @@ public class ADMLuceneTest extends TestCase results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:\"over a lazy\"", null, null); assertEquals(1, results.length()); results.close(); + + // Test wildcards in text + + results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:laz*", null, null); + assertEquals(1, results.length()); + results.close(); + + results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:laz~", null, null); + assertEquals(1, results.length()); + results.close(); + + results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:la?y", null, null); + assertEquals(1, results.length()); + results.close(); + results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:?a?y", null, null); + assertEquals(1, results.length()); + results.close(); + + results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:*azy", null, null); + assertEquals(1, results.length()); + results.close(); + + results = searcher.query(rootNodeRef.getStoreRef(), "lucene", "TEXT:*az*", null, null); + assertEquals(1, results.length()); + results.close(); // Accents diff --git a/source/java/org/alfresco/repo/search/impl/lucene/LuceneQueryParser.java b/source/java/org/alfresco/repo/search/impl/lucene/LuceneQueryParser.java index 9d291159d0..3eae3d67de 100644 --- a/source/java/org/alfresco/repo/search/impl/lucene/LuceneQueryParser.java +++ b/source/java/org/alfresco/repo/search/impl/lucene/LuceneQueryParser.java @@ -37,6 +37,8 @@ import java.util.Set; import org.alfresco.i18n.I18NUtil; import org.alfresco.repo.search.MLAnalysisMode; import org.alfresco.repo.search.SearcherException; +import org.alfresco.repo.search.impl.lucene.analysis.MLTokenDuplicator; +import org.alfresco.repo.search.impl.lucene.analysis.VerbatimAnalyser; import org.alfresco.repo.search.impl.lucene.query.PathQuery; import org.alfresco.service.cmr.dictionary.AspectDefinition; import org.alfresco.service.cmr.dictionary.DataTypeDefinition; @@ -48,6 +50,7 @@ import org.alfresco.service.namespace.NamespacePrefixResolver; import org.alfresco.service.namespace.QName; import org.apache.log4j.Logger; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanQuery; @@ -408,7 +411,7 @@ public class LuceneQueryParser extends QueryParser } else if (field.startsWith("@")) { - Query query = attributeQueryBuilder(field, queryText, new FieldQuery()); + Query query = attributeQueryBuilder(field, queryText, new FieldQuery(), true); return query; } else if (field.equals("ALL")) @@ -636,7 +639,7 @@ public class LuceneQueryParser extends QueryParser { if (field.startsWith("@")) { - return attributeQueryBuilder(field, termStr, new PrefixQuery()); + return attributeQueryBuilder(field, termStr, new PrefixQuery(), false); } else if (field.equals("TEXT")) { @@ -668,7 +671,7 @@ public class LuceneQueryParser extends QueryParser { if (field.startsWith("@")) { - return attributeQueryBuilder(field, termStr, new WildcardQuery()); + return attributeQueryBuilder(field, termStr, new WildcardQuery(), false); } else if (field.equals("TEXT")) @@ -701,7 +704,7 @@ public class LuceneQueryParser extends QueryParser { if (field.startsWith("@")) { - return attributeQueryBuilder(field, termStr, new FuzzyQuery(minSimilarity)); + return attributeQueryBuilder(field, termStr, new FuzzyQuery(minSimilarity), false); } else if (field.equals("TEXT")) @@ -798,7 +801,7 @@ public class LuceneQueryParser extends QueryParser } } - private Query attributeQueryBuilder(String field, String queryText, SubQuery subQueryBuilder) throws ParseException + private Query attributeQueryBuilder(String field, String queryText, SubQuery subQueryBuilder, boolean isAnalysed) throws ParseException { // Expand prefixes @@ -851,17 +854,49 @@ public class LuceneQueryParser extends QueryParser for (Locale locale : (((locales == null) || (locales.size() == 0)) ? Collections.singletonList(I18NUtil .getLocale()) : locales)) { - StringBuilder builder = new StringBuilder(queryText.length() + 10); - builder.append("\u0000").append(locale.toString()).append("\u0000").append(queryText); - Query subQuery = subQueryBuilder.getQuery(expandedFieldName, builder.toString()); - if (subQuery != null) + + if(isAnalysed) { - booleanQuery.add(subQuery, Occur.SHOULD); + StringBuilder builder = new StringBuilder(queryText.length() + 10); + builder.append("\u0000").append(locale.toString()).append("\u0000").append(queryText); + Query subQuery = subQueryBuilder.getQuery(expandedFieldName, builder.toString()); + if (subQuery != null) + { + booleanQuery.add(subQuery, Occur.SHOULD); + } + else + { + booleanQuery.add(new TermQuery(new Term("NO_TOKENS", "__")), Occur.SHOULD); + } } else { - booleanQuery.add(new TermQuery(new Term("NO_TOKENS", "__")), Occur.SHOULD); + // analyse ml text + MLAnalysisMode analysisMode = searchParameters.getMlAnalaysisMode() == null ? config + .getDefaultMLSearchAnalysisMode() : searchParameters.getMlAnalaysisMode(); + // Do the analysis here + VerbatimAnalyser vba = new VerbatimAnalyser(false); + MLTokenDuplicator duplicator = new MLTokenDuplicator(vba.tokenStream(field, new StringReader(queryText)), locale, null, analysisMode); + Token t; + try + { + while( (t = duplicator.next()) != null) + { + Query subQuery = subQueryBuilder.getQuery(expandedFieldName, t.termText()); + booleanQuery.add(subQuery, Occur.SHOULD); + } + } + catch (IOException e) + { + + } + if(booleanQuery.getClauses().length == 0) + { + booleanQuery.add(new TermQuery(new Term("NO_TOKENS", "__")), Occur.SHOULD); + } + } + } return booleanQuery; }