From dc94c779b0e2fef43745e3b76d30d42bb100ab5c Mon Sep 17 00:00:00 2001 From: Joel Date: Mon, 30 Jul 2018 12:53:06 -0400 Subject: [PATCH] SEARCH-1001: Add javadoc --- .../alfresco/solr/SolrInformationServer.java | 18 ++++++- .../solr/query/AbstractAuthoritySetQuery.java | 22 +++++++- .../solr/query/AlfrescoFTSQParserPlugin.java | 11 ++-- .../alfresco/solr/query/Solr4QueryParser.java | 8 +++ .../solr/query/SolrAuthoritySetQuery.java | 38 ++++++++++++++ .../solr/tracker/AbstractTracker.java | 5 ++ .../alfresco/solr/tracker/CommitTracker.java | 12 +++++ .../solr/tracker/MetadataTracker.java | 50 +++++++++++++++++++ 8 files changed, 159 insertions(+), 5 deletions(-) diff --git a/alfresco-search/src/main/java/org/alfresco/solr/SolrInformationServer.java b/alfresco-search/src/main/java/org/alfresco/solr/SolrInformationServer.java index 04ee580be..651477408 100644 --- a/alfresco-search/src/main/java/org/alfresco/solr/SolrInformationServer.java +++ b/alfresco-search/src/main/java/org/alfresco/solr/SolrInformationServer.java @@ -1439,7 +1439,23 @@ public class SolrInformationServer implements InformationServer /* * Choose the max between the last commit time in the index and the last time the tracker started. - * Hole retention is applied to both. * + * Hole retention is applied to both. + * + * This logic is very tricky and very important to understand. + * + * state.getLastGoodTxCommitTimeInIndex() is used to determine where to start pulling transactions from the repo on the + * current tracker run. + * + * If we simply take the current value of state.getLastIndexedTxCommitTime() we have the following problem: + * + * If no data is added to the repo for a long period of time state.getLastIndexedTxCommitTime() never moves forward. This causes the + * loop inside MetadataTracker.getSomeTransactions() to hammer the repo as the time between state.getLastIndexedTxCommitTime() + * and state.getTimeToStopIndexing() increases.
+ * + * To resolve this we choose the max between the last commit time in the index and the last time the tracker started. In theory + * if we start looking for transactions after the last tracker was started (and apply hole retention), we should never miss a + * transaction. Or at least ensure that the principle behind hole retention is respected. This theory should be closely looked at if + * the trackers ever lose data. */ timeBeforeWhichThereCanBeNoTxHolesInIndex = Math.max(timeBeforeWhichThereCanBeNoTxHolesInIndex, lastStartTimeWhichThereCanBeNoTxHolesInIndex); diff --git a/alfresco-search/src/main/java/org/alfresco/solr/query/AbstractAuthoritySetQuery.java b/alfresco-search/src/main/java/org/alfresco/solr/query/AbstractAuthoritySetQuery.java index 8893a81bc..8cff300f2 100644 --- a/alfresco-search/src/main/java/org/alfresco/solr/query/AbstractAuthoritySetQuery.java +++ b/alfresco-search/src/main/java/org/alfresco/solr/query/AbstractAuthoritySetQuery.java @@ -82,15 +82,29 @@ public abstract class AbstractAuthoritySetQuery extends Query return authorities.hashCode(); } + + /* + * This method collects the bitset of documents that match the authorities. + */ + protected HybridBitSet getACLSet(String[] auths, String field, SolrIndexSearcher searcher) throws IOException { + /* + * Build a query that matches the authorities with a field in the ACL records in the index. + */ + BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); for(String current : auths) { queryBuilder.add(new TermQuery(new Term(field, current)), BooleanClause.Occur.SHOULD); } - //NOTE: this query will be in the filter cache. Ideally it would remain cached throughout the users session. + + /* + * Collect a docset containing the ACL records that match the query. + * This query will be in the filter cache. Ideally it would remain cached throughout the user's session.
+ */ + DocSet docSet = searcher.getDocSet(queryBuilder.build()); DocIterator iterator = docSet.iterator(); @@ -102,6 +116,12 @@ public abstract class AbstractAuthoritySetQuery extends Query //TODO : makes this configurable. For some systems this is huge and for others not big enough. HybridBitSet hybridBitSet = new HybridBitSet(60000000); + /* + * Collect the ACLID's from the matching acl records. + * This is done in a separate step so the initial ACL query can be cached in the FilterCache. + * The initial ACL query may be expensive if the number of authorities is very large. + */ + List leaves = searcher.getTopReaderContext().leaves(); LeafReaderContext context = leaves.get(0); NumericDocValues aclValues = DocValuesCache.getNumericDocValues(QueryConstants.FIELD_ACLID, context.reader()); diff --git a/alfresco-search/src/main/java/org/alfresco/solr/query/AlfrescoFTSQParserPlugin.java b/alfresco-search/src/main/java/org/alfresco/solr/query/AlfrescoFTSQParserPlugin.java index bde1f1f95..45724f381 100644 --- a/alfresco-search/src/main/java/org/alfresco/solr/query/AlfrescoFTSQParserPlugin.java +++ b/alfresco-search/src/main/java/org/alfresco/solr/query/AlfrescoFTSQParserPlugin.java @@ -81,8 +81,9 @@ public class AlfrescoFTSQParserPlugin extends QParserPlugin rerankPhase = RerankPhase.valueOf(arg.toString()); } - //First check the System property. - //Then check solrcore.properties, defaulting to the postFilter. + /* + * First check the System property, then solrcore.properties, to determine whether the postFilter is enabled. + */ postfilter = Boolean.parseBoolean(System.getProperty("alfresco.postfilter", req.getCore().getCoreDescriptor().getCoreProperty("alfresco.postfilter", @@ -109,7 +110,11 @@ public class AlfrescoFTSQParserPlugin extends QParserPlugin if(authset && postfilter) { - //Return the PostFilter + /* + * The cost of 200 turns on the postfilter inside Solr + * The postfilter query pulls out all the post filters in the + * query and applies them.
+ */ return new PostFilterQuery(200, query); } diff --git a/alfresco-search/src/main/java/org/alfresco/solr/query/Solr4QueryParser.java b/alfresco-search/src/main/java/org/alfresco/solr/query/Solr4QueryParser.java index fbe72dafb..a5b3f8a55 100644 --- a/alfresco-search/src/main/java/org/alfresco/solr/query/Solr4QueryParser.java +++ b/alfresco-search/src/main/java/org/alfresco/solr/query/Solr4QueryParser.java @@ -563,9 +563,17 @@ public class Solr4QueryParser extends QueryParser implements QueryConstants return createReaderSetQuery(queryText); } else if (field.equals(FIELD_AUTHORITY)) { + /* + * ACL DOCUMENTATION STARTS HERE + * This creates the query that applies the ACL filter + */ return createAuthorityQuery(queryText); } else if (field.equals(FIELD_AUTHORITYSET)) { + /* + * ACL DOCUMENTATION STARTS HERE + * This creates the query that applies the ACL filter + */ return createAuthoritySetQuery(queryText); } else if (field.equals(FIELD_DENIED)) { diff --git a/alfresco-search/src/main/java/org/alfresco/solr/query/SolrAuthoritySetQuery.java b/alfresco-search/src/main/java/org/alfresco/solr/query/SolrAuthoritySetQuery.java index 31d7bced1..e4d5c9cca 100644 --- a/alfresco-search/src/main/java/org/alfresco/solr/query/SolrAuthoritySetQuery.java +++ b/alfresco-search/src/main/java/org/alfresco/solr/query/SolrAuthoritySetQuery.java @@ -108,6 +108,21 @@ public class SolrAuthoritySetQuery extends AbstractAuthoritySetQuery implements } } + /* + * ACL PostFilter + * + * The getFilterCollector function returns a DelegatingCollector + * which is used to filter the documents that match the query. + * + * A delegating collector wraps the TopDocs Collector which gathers the top documents that + * match a query. A delegating collector can filter the documents before "delegating" to the TopDocs + * collector. This filtering process is where the ACL logic is applied. + * + * The getFilterCollector method sets up the data structures needed to apply the ACL rules.
+ * These data structures are then passed to the access control collectors. + * + */ + public DelegatingCollector getFilterCollector(IndexSearcher searcher) { @@ -136,7 +151,19 @@ public class SolrAuthoritySetQuery extends AbstractAuthoritySetQuery implements try { + + /* + * Collect the ACLID's that match the authorities. + * This is done by querying the ACL records in the index. See the method for more + * documentation on this query. + */ + HybridBitSet aclSet = getACLSet(auths, QueryConstants.FIELD_READER, solrIndexSearcher); + + /* + * Collect the documents that the user owns. + */ + BitsFilter ownerFilter = getOwnerFilter(auths, solrIndexSearcher); if (globalReaders.contains(PermissionService.OWNER_AUTHORITY)) @@ -251,6 +278,11 @@ public class SolrAuthoritySetQuery extends AbstractAuthoritySetQuery implements } } + + /* + * The AccessControlCollector applies the ACL logic given aclIds and ownerFilter + */ + class AccessControlCollector extends DelegatingCollector { private HybridBitSet aclIds; @@ -276,6 +308,12 @@ public class SolrAuthoritySetQuery extends AbstractAuthoritySetQuery implements this.ownerDocs = ownerFilter.getBitSets().get(context.ord); } + /* + * The collect method is applied to each document that matches the + * query. The document's aclId must be in the set of aclId's passed into the collector, + * or the document's id must be in the ownerDocs.
+ */ + public void collect(int doc) throws IOException { long aclId = this.fieldValues.get(doc); diff --git a/alfresco-search/src/main/java/org/alfresco/solr/tracker/AbstractTracker.java b/alfresco-search/src/main/java/org/alfresco/solr/tracker/AbstractTracker.java index d55fc274c..7f38e7f38 100644 --- a/alfresco-search/src/main/java/org/alfresco/solr/tracker/AbstractTracker.java +++ b/alfresco-search/src/main/java/org/alfresco/solr/tracker/AbstractTracker.java @@ -180,6 +180,11 @@ public abstract class AbstractTracker implements Tracker try { + /* + * The runLock ensures that for each tracker type (metadata, content, commit, cascade) only one tracker will + * be running at a time. + */ + runLock.acquire(); if(state==null && Boolean.parseBoolean(System.getProperty("alfresco.test", "false"))) diff --git a/alfresco-search/src/main/java/org/alfresco/solr/tracker/CommitTracker.java b/alfresco-search/src/main/java/org/alfresco/solr/tracker/CommitTracker.java index ba0c2b6ff..3acbd4a51 100644 --- a/alfresco-search/src/main/java/org/alfresco/solr/tracker/CommitTracker.java +++ b/alfresco-search/src/main/java/org/alfresco/solr/tracker/CommitTracker.java @@ -131,6 +131,18 @@ public class CommitTracker extends AbstractTracker //See if we need a rollback if(metadataTracker.getRollback() || aclTracker.getRollback()) { + + /* + * The metadataTracker and aclTracker will return true if an unhandled exception has occurred during indexing. + * + * The doRollback method rolls the index back to the state that it was in at the last commit. This will undo + * all the work that has been done by other trackers after the last commit. + * + * The state of the other trackers is then set to null so the trackers will initialize their state from + * the index, rather than the in-memory state. This keeps the trackers in sync with the index if their work is + * rolled back.
+ */ + doRollback(); return; } diff --git a/alfresco-search/src/main/java/org/alfresco/solr/tracker/MetadataTracker.java b/alfresco-search/src/main/java/org/alfresco/solr/tracker/MetadataTracker.java index 824d47c00..d52c7bf24 100644 --- a/alfresco-search/src/main/java/org/alfresco/solr/tracker/MetadataTracker.java +++ b/alfresco-search/src/main/java/org/alfresco/solr/tracker/MetadataTracker.java @@ -143,6 +143,15 @@ public class MetadataTracker extends AbstractTracker implements Tracker if(!isMaster && isSlave) { // Dynamic registration + /* + * This section allows Solr's master/slave setup to be used with dynamic shard registration. + * In this scenario the slave is polling a "tracking" Solr node. The code below calls + * the repo to register the state of the node without pulling any real transactions from the repo. + * + * This allows the repo to register the replica so that it will be included in queries. But the slave Solr node + * will pull its data from a "tracking" Solr node using Solr's master/slave replication, rather than tracking the repository. + * + */ ShardState shardstate = getShardState(); client.getTransactions(0L, null, 0L, null, 0, shardstate); @@ -610,6 +619,11 @@ public class MetadataTracker extends AbstractTracker implements Tracker { try { + /* + * This write lock is used to lock out the Commit Tracker. This ensures that the MetadataTracker will + * not be indexing content while commits or rollbacks are occurring. + */ + getWriteLock().acquire(); /* @@ -621,6 +635,26 @@ public class MetadataTracker extends AbstractTracker implements Tracker this.state = getTrackerState(); + /* + * The fromCommitTime tells getSomeTransactions() where to start; this is actually fairly straightforward. + * + * What makes this code so tricky to understand is the state.getTimeToStopIndexing(). + * + * There are two scenarios to keep in mind: + * + * 1) Full re-index: In this scenario the state.getTimeToStopIndexing() will never stop the indexing.
+ * + * 2) Up-to-date indexing: This is where state.getTimeToStopIndexing() gets interesting. In this scenario + * the Solr index is already up to date with the repo and it is tracking new transactions. The state.getTimeToStopIndexing() + * in this scenario causes the getSomeTransactions() call to stop returning results if it finds a transaction + * beyond a specific point in time. This will break out of this loop and end the tracker run. + * + * The next time the metadata tracker runs the "continueState()" method applies the "hole retention" + * to state.getLastGoodTxCommitTimeInIndex(). This causes the state.getLastGoodTxCommitTimeInIndex() to scan + * for prior transactions that might have been missed. + * + */ + Long fromCommitTime = getTxFromCommitTime(txnsFound, state.getLastGoodTxCommitTimeInIndex()); transactions = getSomeTransactions(txnsFound, fromCommitTime, TIME_STEP_1_HR_IN_MS, 2000, state.getTimeToStopIndexing()); @@ -640,6 +674,22 @@ public class MetadataTracker extends AbstractTracker implements Tracker ArrayList txBatch = new ArrayList<>(); for (Transaction info : transactions.getTransactions()) { + + /* + * isInIndex is used to ensure transactions that are being re-pulled due to "hole retention" are not re-indexed if + * they have already been indexed. + * + * The logic in infoSrv.txnInIndex() first checks an in-memory LRUcache for the txnId. If it doesn't find it in the cache + * it checks the index. The LRUCache is only needed for txnId's that have been indexed but are not yet visible in the index for + * one of two reasons: + * + * 1) The commit tracker has not yet committed the transaction. + * 2) The txnId has been committed to the index but the new searcher has not yet been warmed. + * + * This means that to ensure txnId's are not needlessly reprocessed during hole retention, the LRUCache must be large + * enough to cover the time between when a txnId is indexed and when it becomes visible. 
+ */ + boolean isInIndex = (infoSrv.txnInIndex(info.getId(), true) && info.getCommitTimeMs() <= state.getLastIndexedTxCommitTime()); if (isInIndex) { txnsFound.add(info);