SEARCH-2: Switch to multi-value identifier field for fingerprint

This commit is contained in:
Joel
2016-09-27 15:49:43 -04:00
parent d000ed8bce
commit b99aad037a
5 changed files with 57 additions and 34 deletions

View File

@@ -3087,26 +3087,21 @@ public class SolrInformationServer implements InformationServer
Analyzer analyzer = core.getLatestSchema().getFieldType("min_hash").getIndexAnalyzer();
TokenStream ts = analyzer.tokenStream("min_hash", textContent);
StringBuilder hashBuff = new StringBuilder();
CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken())
{
StringBuilder tokenBuff = new StringBuilder();
if(hashBuff.length() > 0) {
hashBuff.append(" ");
}
char[] buff = termAttribute.buffer();
for(int i=0; i<termAttribute.length();i++) {
tokenBuff.append(Integer.toHexString(buff[i]));
}
hashBuff.append(tokenBuff.toString());
doc.addField(FINGERPRINT, tokenBuff.toString());
}
ts.end();
ts.close();
doc.addField(FINGERPRINT, hashBuff.toString());
}
long end = System.nanoTime();

View File

@@ -703,8 +703,7 @@ public class Solr4QueryParser extends QueryParser implements QueryConstants
SolrInputField mh = solrDoc.getField("FINGERPRINT");
if (mh != null)
{
String fingerprint = mh.getValue().toString();
String[] tokens = fingerprint.split(" ");
Collection values = mh.getValues();
int bandSize = 1;
float fraction = -1;
float truePositive = 1;
@@ -723,14 +722,14 @@ public class Solr4QueryParser extends QueryParser implements QueryConstants
{
truePositive /= 100;
}
bandSize = computeBandSize(tokens.length, fraction, truePositive);
bandSize = computeBandSize(values.size(), fraction, truePositive);
}
BooleanQuery.Builder builder = new BooleanQuery.Builder();
BooleanQuery.Builder childBuilder = new BooleanQuery.Builder();
int rowInBand = 0;
for (String token : tokens)
for (Object token : values)
{
TermQuery tq = new TermQuery(new Term("FINGERPRINT", token));
TermQuery tq = new TermQuery(new Term("FINGERPRINT", token.toString()));
if (bandSize == 1)
{
builder.add(new ConstantScoreQuery(tq), Occur.SHOULD);
@@ -751,9 +750,9 @@ public class Solr4QueryParser extends QueryParser implements QueryConstants
// start
if (childBuilder.build().clauses().size() > 0)
{
for (String token : tokens)
for (Object token : values)
{
TermQuery tq = new TermQuery(new Term("FINGERPRINT", token));
TermQuery tq = new TermQuery(new Term("FINGERPRINT", token.toString()));
childBuilder.add(new ConstantScoreQuery(tq), Occur.MUST);
rowInBand++;
if (rowInBand == bandSize)
@@ -767,7 +766,7 @@ public class Solr4QueryParser extends QueryParser implements QueryConstants
builder.setDisableCoord(true);
if (parts.length == 2)
{
builder.setMinimumNumberShouldMatch((int) (Math.ceil(tokens.length * fraction)));
builder.setMinimumNumberShouldMatch((int) (Math.ceil(values.size() * fraction)));
}
Query q = builder.build();
return q;

View File

@@ -564,7 +564,7 @@
<fields>
<!-- For SOLR cloud - should be the node version -->
<field name="_version_" type="version" indexed="false" stored="true" docValues="true" required="true" />
<field name="FINGERPRINT" type="text_plain" indexed="true" omitNorms="true" stored="false" multiValued="false" required="false" docValues="false"/>
<!-- FINGERPRINT: indexed, multi-valued, not stored; uses the "identifier" field type (same type as _root_ below) -->
<field name="FINGERPRINT" type="identifier" indexed="true" omitNorms="true" stored="false" multiValued="true" required="false" docValues="false"/>
<!-- For block join - currently not used -->
<field name="_root_" type="identifier" indexed="true" stored="false"/>

View File

@@ -19,10 +19,7 @@
package org.alfresco.solr.query;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.search.adaptor.lucene.QueryConstants;
import org.alfresco.service.cmr.repository.NodeRef;
import org.alfresco.service.cmr.repository.StoreRef;
import org.alfresco.solr.AbstractAlfrescoSolrTests;
import org.alfresco.solr.client.*;
import org.apache.commons.logging.Log;
@@ -39,12 +36,9 @@ import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import java.util.ArrayList;
import java.util.List;
import static org.alfresco.solr.AlfrescoSolrUtils.*;
import static org.alfresco.solr.AlfrescoSolrUtils.ancestors;
import static org.alfresco.solr.AlfrescoSolrUtils.createGUID;
@LuceneTestCase.SuppressCodecs({"Appending","Lucene3x","Lucene40","Lucene41","Lucene42","Lucene43", "Lucene44", "Lucene45","Lucene46","Lucene47","Lucene48","Lucene49"})
public class AlfrescoSolrFingerprintTest extends AbstractAlfrescoSolrTests
@@ -129,10 +123,10 @@ public class AlfrescoSolrFingerprintTest extends AbstractAlfrescoSolrTests
NodeMetaData nodeMetaData3 = getNodeMetaData(node3, txn, acl, "mike", null, false);
NodeMetaData nodeMetaData4 = getNodeMetaData(node4, txn, acl, "mike", null, false);
List content = list("aaaa bbbb cccc dddd eeee ffff hhhh iiii jjjj kkkk",
"aaaa bbbb cccc dddd eeee ffff hhhh iiii",
"aaaa bbbb cccc dddd eeee ffff hhhh iiii jjjj",
"aaaa bbbb cccc dddd eeee ffff hhhh");
List content = list("aaaa 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25",
"aaaa 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20",
"aaaa 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24",
"aaaa 1 2 3 4 5 6 7 8 9 10 11 12 13 14");
//Index the transaction, nodes, and nodeMetaDatas.
//Note that the content is automatically created by the test framework.
@@ -160,19 +154,54 @@ public class AlfrescoSolrFingerprintTest extends AbstractAlfrescoSolrTests
logger.info("#################### Passed Third Test ##############################");
ModifiableSolrParams params = new ModifiableSolrParams();
params.add("q", "FINGERPRINT:" + node1.getId()); //Query for an id in the content field. The node id is automatically populated into the content field by test framework
params.add("qt", "/afts");
params.add("fl", "DBID,score");
params.add("start", "0");
params.add("rows", "6");
SolrServletRequest req = areq(params, null);
assertQ(req, "*[count(//doc)=4]",
"//result/doc[1]/long[@name='DBID'][.='" + node1.getId() + "']",
"//result/doc[2]/long[@name='DBID'][.='" + node3.getId() + "']",
"//result/doc[3]/long[@name='DBID'][.='" + node2.getId() + "']",
"//result/doc[4]/long[@name='DBID'][.='" + node4.getId() + "']");
params = new ModifiableSolrParams();
params.add("q", "FINGERPRINT:" + node1.getId() + "_90"); //Query for an id in the content field. The node id is automatically populated into the content field by test framework
params.add("qt", "/afts");
params.add("fl","DBID,score");
params.add("start", "0");
params.add("rows", "6");
SolrServletRequest req = areq(params, null);
assertQ(req, "//result/doc[1]/long[@name='DBID'][.='"+node1.getId()+"']",
"//result/doc[2]/long[@name='DBID'][.='"+node3.getId()+"']",
"//result/doc[3]/long[@name='DBID'][.='"+node2.getId()+"']",
"//result/doc[4]/long[@name='DBID'][.='"+node4.getId()+"']");
req = areq(params, null);
assertQ(req, "*[count(//doc)= 2]",
"//result/doc[1]/long[@name='DBID'][.='"+node1.getId()+"']",
"//result/doc[2]/long[@name='DBID'][.='"+node3.getId()+"']");
params = new ModifiableSolrParams();
params.add("q", "FINGERPRINT:" + node1.getId()+"_70"); //Query for an id in the content field. The node id is automatically populated into the content field by test framework
params.add("qt", "/afts");
params.add("fl","DBID,score");
params.add("start", "0");
params.add("rows", "6");
req = areq(params, null);
assertQ(req, "*[count(//doc)= 3]",
"//result/doc[1]/long[@name='DBID'][.='"+node1.getId()+"']",
"//result/doc[2]/long[@name='DBID'][.='"+node3.getId()+"']",
"//result/doc[3]/long[@name='DBID'][.='"+node2.getId()+"']");
params = new ModifiableSolrParams();
params.add("q", "FINGERPRINT:" + node1.getId()+"_40");
params.add("qt", "/afts");
params.add("fl","DBID,score");
params.add("start", "0");
params.add("rows", "6");
req = areq(params, null);
assertQ(req, "*[count(//doc)= 4]",
"//result/doc[1]/long[@name='DBID'][.='"+node1.getId()+"']",
"//result/doc[2]/long[@name='DBID'][.='"+node3.getId()+"']",
"//result/doc[3]/long[@name='DBID'][.='"+node2.getId()+"']",
"//result/doc[4]/long[@name='DBID'][.='"+node4.getId()+"']");
}
}

View File

@@ -571,7 +571,7 @@
<!-- Unique identifier - based on DBID -->
<field name="id" type="identifier" indexed="true" omitNorms="true" stored="true" multiValued="false" required="true" docValues="true"/>
<field name="FINGERPRINT" type="text_plain" indexed="true" omitNorms="true" stored="false" multiValued="false" required="false" docValues="false"/>
<field name="FINGERPRINT" type="identifier" indexed="true" omitNorms="true" stored="false" multiValued="true" required="false" docValues="false"/>
<!-- Special fields -->
<field name="LID" type="identifier" indexed="true" omitNorms="true" stored="false" multiValued="false" sortMissingLast="true" />