SEARCH-2: Switch to multi-value identifier field for fingerprint

This commit is contained in:
Joel
2016-09-27 15:49:43 -04:00
parent d000ed8bce
commit b99aad037a
5 changed files with 57 additions and 34 deletions

View File

@@ -3087,26 +3087,21 @@ public class SolrInformationServer implements InformationServer
Analyzer analyzer = core.getLatestSchema().getFieldType("min_hash").getIndexAnalyzer(); Analyzer analyzer = core.getLatestSchema().getFieldType("min_hash").getIndexAnalyzer();
TokenStream ts = analyzer.tokenStream("min_hash", textContent); TokenStream ts = analyzer.tokenStream("min_hash", textContent);
StringBuilder hashBuff = new StringBuilder();
CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class); CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
ts.reset(); ts.reset();
while (ts.incrementToken()) while (ts.incrementToken())
{ {
StringBuilder tokenBuff = new StringBuilder(); StringBuilder tokenBuff = new StringBuilder();
if(hashBuff.length() > 0) {
hashBuff.append(" ");
}
char[] buff = termAttribute.buffer(); char[] buff = termAttribute.buffer();
for(int i=0; i<termAttribute.length();i++) { for(int i=0; i<termAttribute.length();i++) {
tokenBuff.append(Integer.toHexString(buff[i])); tokenBuff.append(Integer.toHexString(buff[i]));
} }
hashBuff.append(tokenBuff.toString()); doc.addField(FINGERPRINT, tokenBuff.toString());
} }
ts.end(); ts.end();
ts.close(); ts.close();
doc.addField(FINGERPRINT, hashBuff.toString());
} }
long end = System.nanoTime(); long end = System.nanoTime();

View File

@@ -703,8 +703,7 @@ public class Solr4QueryParser extends QueryParser implements QueryConstants
SolrInputField mh = solrDoc.getField("FINGERPRINT"); SolrInputField mh = solrDoc.getField("FINGERPRINT");
if (mh != null) if (mh != null)
{ {
String fingerprint = mh.getValue().toString(); Collection values = mh.getValues();
String[] tokens = fingerprint.split(" ");
int bandSize = 1; int bandSize = 1;
float fraction = -1; float fraction = -1;
float truePositive = 1; float truePositive = 1;
@@ -723,14 +722,14 @@ public class Solr4QueryParser extends QueryParser implements QueryConstants
{ {
truePositive /= 100; truePositive /= 100;
} }
bandSize = computeBandSize(tokens.length, fraction, truePositive); bandSize = computeBandSize(values.size(), fraction, truePositive);
} }
BooleanQuery.Builder builder = new BooleanQuery.Builder(); BooleanQuery.Builder builder = new BooleanQuery.Builder();
BooleanQuery.Builder childBuilder = new BooleanQuery.Builder(); BooleanQuery.Builder childBuilder = new BooleanQuery.Builder();
int rowInBand = 0; int rowInBand = 0;
for (String token : tokens) for (Object token : values)
{ {
TermQuery tq = new TermQuery(new Term("FINGERPRINT", token)); TermQuery tq = new TermQuery(new Term("FINGERPRINT", token.toString()));
if (bandSize == 1) if (bandSize == 1)
{ {
builder.add(new ConstantScoreQuery(tq), Occur.SHOULD); builder.add(new ConstantScoreQuery(tq), Occur.SHOULD);
@@ -751,9 +750,9 @@ public class Solr4QueryParser extends QueryParser implements QueryConstants
// start // start
if (childBuilder.build().clauses().size() > 0) if (childBuilder.build().clauses().size() > 0)
{ {
for (String token : tokens) for (Object token : values)
{ {
TermQuery tq = new TermQuery(new Term("FINGERPRINT", token)); TermQuery tq = new TermQuery(new Term("FINGERPRINT", token.toString()));
childBuilder.add(new ConstantScoreQuery(tq), Occur.MUST); childBuilder.add(new ConstantScoreQuery(tq), Occur.MUST);
rowInBand++; rowInBand++;
if (rowInBand == bandSize) if (rowInBand == bandSize)
@@ -767,7 +766,7 @@ public class Solr4QueryParser extends QueryParser implements QueryConstants
builder.setDisableCoord(true); builder.setDisableCoord(true);
if (parts.length == 2) if (parts.length == 2)
{ {
builder.setMinimumNumberShouldMatch((int) (Math.ceil(tokens.length * fraction))); builder.setMinimumNumberShouldMatch((int) (Math.ceil(values.size() * fraction)));
} }
Query q = builder.build(); Query q = builder.build();
return q; return q;

View File

@@ -564,7 +564,7 @@
<fields> <fields>
<!-- For SOLR cloud - should be the node version --> <!-- For SOLR cloud - should be the node version -->
<field name="_version_" type="version" indexed="false" stored="true" docValues="true" required="true" /> <field name="_version_" type="version" indexed="false" stored="true" docValues="true" required="true" />
<field name="FINGERPRINT" type="text_plain" indexed="true" omitNorms="true" stored="false" multiValued="false" required="false" docValues="false"/> <field name="FINGERPRINT" type="identifier" indexed="true" omitNorms="true" stored="false" multiValued="true" required="false" docValues="false"/>
<!-- For block join - currently not used --> <!-- For block join - currently not used -->
<field name="_root_" type="identifier" indexed="true" stored="false"/> <field name="_root_" type="identifier" indexed="true" stored="false"/>

View File

@@ -19,10 +19,7 @@
package org.alfresco.solr.query; package org.alfresco.solr.query;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.search.adaptor.lucene.QueryConstants; import org.alfresco.repo.search.adaptor.lucene.QueryConstants;
import org.alfresco.service.cmr.repository.NodeRef;
import org.alfresco.service.cmr.repository.StoreRef;
import org.alfresco.solr.AbstractAlfrescoSolrTests; import org.alfresco.solr.AbstractAlfrescoSolrTests;
import org.alfresco.solr.client.*; import org.alfresco.solr.client.*;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
@@ -39,12 +36,9 @@ import org.junit.Before;
import org.junit.BeforeClass; import org.junit.BeforeClass;
import org.junit.Test; import org.junit.Test;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import static org.alfresco.solr.AlfrescoSolrUtils.*; import static org.alfresco.solr.AlfrescoSolrUtils.*;
import static org.alfresco.solr.AlfrescoSolrUtils.ancestors;
import static org.alfresco.solr.AlfrescoSolrUtils.createGUID;
@LuceneTestCase.SuppressCodecs({"Appending","Lucene3x","Lucene40","Lucene41","Lucene42","Lucene43", "Lucene44", "Lucene45","Lucene46","Lucene47","Lucene48","Lucene49"}) @LuceneTestCase.SuppressCodecs({"Appending","Lucene3x","Lucene40","Lucene41","Lucene42","Lucene43", "Lucene44", "Lucene45","Lucene46","Lucene47","Lucene48","Lucene49"})
public class AlfrescoSolrFingerprintTest extends AbstractAlfrescoSolrTests public class AlfrescoSolrFingerprintTest extends AbstractAlfrescoSolrTests
@@ -129,10 +123,10 @@ public class AlfrescoSolrFingerprintTest extends AbstractAlfrescoSolrTests
NodeMetaData nodeMetaData3 = getNodeMetaData(node3, txn, acl, "mike", null, false); NodeMetaData nodeMetaData3 = getNodeMetaData(node3, txn, acl, "mike", null, false);
NodeMetaData nodeMetaData4 = getNodeMetaData(node4, txn, acl, "mike", null, false); NodeMetaData nodeMetaData4 = getNodeMetaData(node4, txn, acl, "mike", null, false);
List content = list("aaaa bbbb cccc dddd eeee ffff hhhh iiii jjjj kkkk", List content = list("aaaa 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25",
"aaaa bbbb cccc dddd eeee ffff hhhh iiii", "aaaa 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20",
"aaaa bbbb cccc dddd eeee ffff hhhh iiii jjjj", "aaaa 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24",
"aaaa bbbb cccc dddd eeee ffff hhhh"); "aaaa 1 2 3 4 5 6 7 8 9 10 11 12 13 14");
//Index the transaction, nodes, and nodeMetaDatas. //Index the transaction, nodes, and nodeMetaDatas.
//Note that the content is automatically created by the test framework. //Note that the content is automatically created by the test framework.
@@ -160,16 +154,51 @@ public class AlfrescoSolrFingerprintTest extends AbstractAlfrescoSolrTests
logger.info("#################### Passed Third Test ##############################"); logger.info("#################### Passed Third Test ##############################");
ModifiableSolrParams params = new ModifiableSolrParams(); ModifiableSolrParams params = new ModifiableSolrParams();
params.add("q", "FINGERPRINT:" + node1.getId()); //Query for an id in the content field. The node id is automatically populated into the content field by test framework params.add("q", "FINGERPRINT:" + node1.getId()); //Query for an id in the content field. The node id is automatically populated into the content field by test framework
params.add("qt", "/afts"); params.add("qt", "/afts");
params.add("fl", "DBID,score");
params.add("start", "0");
params.add("rows", "6");
SolrServletRequest req = areq(params, null);
assertQ(req, "*[count(//doc)=4]",
"//result/doc[1]/long[@name='DBID'][.='" + node1.getId() + "']",
"//result/doc[2]/long[@name='DBID'][.='" + node3.getId() + "']",
"//result/doc[3]/long[@name='DBID'][.='" + node2.getId() + "']",
"//result/doc[4]/long[@name='DBID'][.='" + node4.getId() + "']");
params = new ModifiableSolrParams();
params.add("q", "FINGERPRINT:" + node1.getId() + "_90"); //Query for an id in the content field. The node id is automatically populated into the content field by test framework
params.add("qt", "/afts");
params.add("fl","DBID,score"); params.add("fl","DBID,score");
params.add("start", "0"); params.add("start", "0");
params.add("rows", "6"); params.add("rows", "6");
SolrServletRequest req = areq(params, null); req = areq(params, null);
assertQ(req, "//result/doc[1]/long[@name='DBID'][.='"+node1.getId()+"']", assertQ(req, "*[count(//doc)= 2]",
"//result/doc[1]/long[@name='DBID'][.='"+node1.getId()+"']",
"//result/doc[2]/long[@name='DBID'][.='"+node3.getId()+"']");
params = new ModifiableSolrParams();
params.add("q", "FINGERPRINT:" + node1.getId()+"_70"); //Query for an id in the content field. The node id is automatically populated into the content field by test framework
params.add("qt", "/afts");
params.add("fl","DBID,score");
params.add("start", "0");
params.add("rows", "6");
req = areq(params, null);
assertQ(req, "*[count(//doc)= 3]",
"//result/doc[1]/long[@name='DBID'][.='"+node1.getId()+"']",
"//result/doc[2]/long[@name='DBID'][.='"+node3.getId()+"']",
"//result/doc[3]/long[@name='DBID'][.='"+node2.getId()+"']");
params = new ModifiableSolrParams();
params.add("q", "FINGERPRINT:" + node1.getId()+"_40");
params.add("qt", "/afts");
params.add("fl","DBID,score");
params.add("start", "0");
params.add("rows", "6");
req = areq(params, null);
assertQ(req, "*[count(//doc)= 4]",
"//result/doc[1]/long[@name='DBID'][.='"+node1.getId()+"']",
"//result/doc[2]/long[@name='DBID'][.='"+node3.getId()+"']", "//result/doc[2]/long[@name='DBID'][.='"+node3.getId()+"']",
"//result/doc[3]/long[@name='DBID'][.='"+node2.getId()+"']", "//result/doc[3]/long[@name='DBID'][.='"+node2.getId()+"']",
"//result/doc[4]/long[@name='DBID'][.='"+node4.getId()+"']"); "//result/doc[4]/long[@name='DBID'][.='"+node4.getId()+"']");

View File

@@ -571,7 +571,7 @@
<!-- Unique identifier - based on DBID --> <!-- Unique identifier - based on DBID -->
<field name="id" type="identifier" indexed="true" omitNorms="true" stored="true" multiValued="false" required="true" docValues="true"/> <field name="id" type="identifier" indexed="true" omitNorms="true" stored="true" multiValued="false" required="true" docValues="true"/>
<field name="FINGERPRINT" type="text_plain" indexed="true" omitNorms="true" stored="false" multiValued="false" required="false" docValues="false"/> <field name="FINGERPRINT" type="identifier" indexed="true" omitNorms="true" stored="false" multiValued="true" required="false" docValues="false"/>
<!-- Special fields --> <!-- Special fields -->
<field name="LID" type="identifier" indexed="true" omitNorms="true" stored="false" multiValued="false" sortMissingLast="true" /> <field name="LID" type="identifier" indexed="true" omitNorms="true" stored="false" multiValued="false" sortMissingLast="true" />