SEARCH-1001: Add javadoc

Author: Joel
Date: 2018-07-30 12:53:06 -04:00
parent 18689904a0
commit dc94c779b0
8 changed files with 159 additions and 5 deletions


@@ -1439,7 +1439,23 @@ public class SolrInformationServer implements InformationServer
/*
* Choose the max between the last commit time in the index and the last time the tracker started.
* Hole retention is applied to both.
*
* This logic is very tricky and very important to understand.
*
* state.getLastGoodTxCommitTimeInIndex() is used to determine where to start pulling transactions from the repo on the
* current tracker run.
*
* If we simply take the current value of state.getLastIndexedTxCommitTime() we have the following problem:
*
* If no data is added to the repo for a long period of time, state.getLastIndexedTxCommitTime() never moves forward. This causes the
* loop inside MetadataTracker.getSomeTransactions() to hammer the repo as the gap between state.getLastIndexedTxCommitTime()
* and state.getTimeToStopIndexing() increases.
*
* To resolve this we choose the max between the last commit time in the index and the last time the tracker started. In theory
* if we start looking for transactions after the last tracker was started (and apply hole retention), we should never miss a
* transaction. Or at least ensure that the principle behind hole retention is respected. This theory should be closely looked at if
* the trackers ever lose data.
*/
timeBeforeWhichThereCanBeNoTxHolesInIndex = Math.max(timeBeforeWhichThereCanBeNoTxHolesInIndex, lastStartTimeWhichThereCanBeNoTxHolesInIndex);
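
A minimal, self-contained sketch of the calculation described above (the constant and timestamps are hypothetical; the real values come from the tracker state and configuration):

public class HoleRetentionSketch
{
    // Hypothetical value; hole retention is configurable in the real trackers.
    static final long HOLE_RETENTION_MS = 3_600_000L; // one hour

    public static void main(String[] args)
    {
        long lastCommitTimeInIndex = 1_532_900_000_000L; // last good tx commit time in the index
        long lastTrackerStartTime  = 1_532_990_000_000L; // last time the tracker started

        // Apply hole retention to both candidates, then take the max so the starting
        // point keeps moving forward even when no data is added to the repo.
        long timeBeforeWhichThereCanBeNoTxHolesInIndex =
                Math.max(lastCommitTimeInIndex - HOLE_RETENTION_MS,
                         lastTrackerStartTime - HOLE_RETENTION_MS);

        System.out.println("Start pulling transactions from: " + timeBeforeWhichThereCanBeNoTxHolesInIndex);
    }
}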


@@ -82,15 +82,29 @@ public abstract class AbstractAuthoritySetQuery extends Query
return authorities.hashCode();
}
/*
* This method collects the set of ACLIDs from the ACL records that match the authorities.
*/
protected HybridBitSet getACLSet(String[] auths, String field, SolrIndexSearcher searcher) throws IOException
{
/*
* Build a query that matches the authorities with a field in the ACL records in the index.
*/
BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
for(String current : auths)
{
queryBuilder.add(new TermQuery(new Term(field, current)), BooleanClause.Occur.SHOULD);
}
/*
* Collect a docset containing the ACL records that match the query.
* This query will be in the filter cache. Ideally it would remain cached throughout the user's session.
*/
DocSet docSet = searcher.getDocSet(queryBuilder.build());
DocIterator iterator = docSet.iterator();
@@ -102,6 +116,12 @@ public abstract class AbstractAuthoritySetQuery extends Query
//TODO : make this configurable. For some systems this is huge and for others not big enough.
HybridBitSet hybridBitSet = new HybridBitSet(60000000);
/*
* Collect the ACLIDs from the matching ACL records.
* This is done in a separate step so the initial ACL query can be cached in the FilterCache.
* The initial ACL query may be expensive if the number of authorities is very large.
*/
List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
LeafReaderContext context = leaves.get(0);
NumericDocValues aclValues = DocValuesCache.getNumericDocValues(QueryConstants.FIELD_ACLID, context.reader());
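
A rough sketch of the two-step pattern described above, with plain Java collections standing in for Solr's DocSet and the ACLID doc values (all names and values are hypothetical): the cached ACL query yields matching doc ids, and a second pass maps each doc to its ACLID.

import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class AclSetSketch
{
    public static void main(String[] args)
    {
        // Stand-ins: docs matched by the cached ACL query, and a doc -> ACLID lookup
        // playing the role of the FIELD_ACLID doc values.
        int[] matchingAclDocs = {3, 7, 42};
        Map<Integer, Long> docToAclId = Map.of(3, 101L, 7, 102L, 42, 101L);

        // Second step: collect the ACLIDs for the matching ACL records.
        Set<Long> aclIds = new HashSet<>();
        for (int doc : matchingAclDocs)
        {
            aclIds.add(docToAclId.get(doc));
        }
        System.out.println("ACLIDs readable by the given authorities: " + aclIds);
    }
}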


@@ -81,8 +81,9 @@ public class AlfrescoFTSQParserPlugin extends QParserPlugin
rerankPhase = RerankPhase.valueOf(arg.toString());
}
/*
* First check the "alfresco.postfilter" System property, then solrcore.properties;
* this turns on the postFilter.
*/
postfilter = Boolean.parseBoolean(System.getProperty("alfresco.postfilter",
req.getCore().getCoreDescriptor().getCoreProperty("alfresco.postfilter",
@@ -109,7 +110,11 @@ public class AlfrescoFTSQParserPlugin extends QParserPlugin
if(authset && postfilter)
{
/*
* Return the PostFilter.
* The cost of 200 turns on the postfilter inside Solr.
* The PostFilterQuery pulls out all the post filters in the
* query and applies them.
*/
return new PostFilterQuery(200, query);
}
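
For context, Solr only runs a query as a post filter when it implements the PostFilter interface, is marked non-cacheable, and has a cost of at least 100; that is what the cost of 200 relies on. A minimal sketch of that contract (a toy filter, not Alfresco's PostFilterQuery):

import java.io.IOException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.solr.search.DelegatingCollector;
import org.apache.solr.search.ExtendedQueryBase;
import org.apache.solr.search.PostFilter;

public class ToyPostFilter extends ExtendedQueryBase implements PostFilter
{
    public ToyPostFilter()
    {
        setCache(false); // post filters must not be cached
        setCost(200);    // cost >= 100 tells Solr to run this as a post filter
    }

    @Override
    public DelegatingCollector getFilterCollector(IndexSearcher searcher)
    {
        return new DelegatingCollector()
        {
            @Override
            public void collect(int doc) throws IOException
            {
                if (doc % 2 == 0) // toy rule; the ACL checks go here in the real collectors
                {
                    super.collect(doc); // only delegate documents that pass the filter
                }
            }
        };
    }
}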


@@ -563,9 +563,17 @@ public class Solr4QueryParser extends QueryParser implements QueryConstants
return createReaderSetQuery(queryText);
} else if (field.equals(FIELD_AUTHORITY))
{
/*
* ACL DOCUMENTATION STARTS HERE
* This creates the query that applies the ACL filter
*/
return createAuthorityQuery(queryText);
} else if (field.equals(FIELD_AUTHORITYSET))
{
/*
* ACL DOCUMENTATION STARTS HERE
* This creates the query that applies the ACL filter for an authority set
*/
return createAuthoritySetQuery(queryText);
} else if (field.equals(FIELD_DENIED))
{


@@ -108,6 +108,21 @@ public class SolrAuthoritySetQuery extends AbstractAuthoritySetQuery implements
}
}
/*
* ACL PostFilter
*
* The getFilterCollector function returns a DelegatingCollector
* which is used to filter the documents that match the query.
*
* A delegating collector wraps the TopDocs Collector which gathers the top documents that
* match a query. A delegating collector can filter the documents before "delegating" to the TopDocs
* collector. This filtering process is where the ACL logic is applied.
*
* The getFilterCollector method sets up the data structures needed to apply the ACL rules.
* These data structures are then passed to the access control collectors.
*
*/
public DelegatingCollector getFilterCollector(IndexSearcher searcher)
{
@@ -136,7 +151,19 @@ public class SolrAuthoritySetQuery extends AbstractAuthoritySetQuery implements
try
{
/*
* Collect the ACLIDs that match the authorities.
* This is done by querying the ACL records in the index. See the method for more
* documentation on this query.
*/
HybridBitSet aclSet = getACLSet(auths, QueryConstants.FIELD_READER, solrIndexSearcher);
/*
* Collect the documents that the user owns.
*/
BitsFilter ownerFilter = getOwnerFilter(auths, solrIndexSearcher);
if (globalReaders.contains(PermissionService.OWNER_AUTHORITY))
@@ -251,6 +278,11 @@ public class SolrAuthoritySetQuery extends AbstractAuthoritySetQuery implements
}
}
/*
* The AccessControlCollector applies the ACL logic given the aclIds and ownerFilter
*/
class AccessControlCollector extends DelegatingCollector
{
private HybridBitSet aclIds;
@@ -276,6 +308,12 @@ public class SolrAuthoritySetQuery extends AbstractAuthoritySetQuery implements
this.ownerDocs = ownerFilter.getBitSets().get(context.ord);
}
/*
* The collect method is applied to each document that matches the
* query. The document's aclId must be in the set of aclIds passed into the collector,
* or the document's id must be in the ownerDocs.
*/
public void collect(int doc) throws IOException
{
long aclId = this.fieldValues.get(doc);
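
A compact, plain-Java sketch of the collect() rule just described (java.util sets standing in for the HybridBitSet and owner bitsets; all values hypothetical):

import java.util.Set;

public class CollectRuleSketch
{
    public static void main(String[] args)
    {
        Set<Long> aclIds = Set.of(101L, 102L);       // ACLIDs the user's authorities can read
        Set<Integer> ownerDocs = Set.of(3);          // documents the user owns
        long[] docAclId = {100L, 101L, 102L, 100L};  // per-document ACLID values

        for (int doc = 0; doc < docAclId.length; doc++)
        {
            // Keep the document if its aclId is readable OR the user owns it.
            if (aclIds.contains(docAclId[doc]) || ownerDocs.contains(doc))
            {
                System.out.println("collect doc " + doc);
            }
        }
    }
}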


@@ -180,6 +180,11 @@ public abstract class AbstractTracker implements Tracker
try
{
/*
* The runLock ensures that for each tracker type (metadata, content, commit, cascade) only one tracker will
* be running at a time.
*/
runLock.acquire();
if(state==null && Boolean.parseBoolean(System.getProperty("alfresco.test", "false")))
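
A minimal sketch of the one-run-at-a-time pattern (assuming runLock is a binary java.util.concurrent.Semaphore, which matches the acquire() call above):

import java.util.concurrent.Semaphore;

public class RunLockSketch
{
    // One permit per tracker type: only one run of this tracker can be active.
    private final Semaphore runLock = new Semaphore(1);

    public void track() throws InterruptedException
    {
        runLock.acquire(); // blocks while another run of this tracker is in progress
        try
        {
            // ... do one tracker run ...
        }
        finally
        {
            runLock.release(); // always release so the next run can proceed
        }
    }
}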


@@ -131,6 +131,18 @@ public class CommitTracker extends AbstractTracker
//See if we need a rollback
if(metadataTracker.getRollback() || aclTracker.getRollback()) {
/*
* The metadataTracker and aclTracker will return true if an unhandled exception has occurred during indexing.
*
* The doRollback method rolls the index back to the state that it was in at the last commit. This will undo
* all the work that has been done by other trackers after the last commit.
*
* The state of the other trackers is then set to null so the trackers will initialize their state from
* the index, rather than the in-memory state. This keeps the trackers in sync with the index if their work is
* rolled back.
*/
doRollback();
return;
}
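
A condensed sketch of that flow (hypothetical fields and method bodies; the real doRollback also rolls the Solr index itself back to the last commit):

public class RollbackSketch
{
    private boolean metadataRollback; // set when an unhandled indexing exception occurs
    private boolean aclRollback;
    private Object metadataTrackerState = new Object();
    private Object aclTrackerState = new Object();

    void maybeRollback()
    {
        if (metadataRollback || aclRollback)
        {
            rollbackIndexToLastCommit();  // undo all tracker work since the last commit
            metadataTrackerState = null;  // trackers will re-initialize their state from
            aclTrackerState = null;       // the index, not from stale in-memory state
        }
    }

    private void rollbackIndexToLastCommit()
    {
        // stand-in for the real index rollback (e.g. Lucene's IndexWriter.rollback())
    }
}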


@@ -143,6 +143,15 @@ public class MetadataTracker extends AbstractTracker implements Tracker
if(!isMaster && isSlave)
{
// Dynamic registration
/*
* This section allows Solr's master/slave setup to be used with dynamic shard registration.
* In this scenario the slave is polling a "tracking" Solr node. The code below calls
* the repo to register the state of the node without pulling any real transactions from the repo.
*
* This allows the repo to register the replica so that it will be included in queries. But the slave Solr node
* will pull its data from a "tracking" Solr node using Solr's master/slave replication, rather than tracking the repository.
*
*/
ShardState shardstate = getShardState();
client.getTransactions(0L, null, 0L, null, 0, shardstate);
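
A hedged sketch of this "registration-only" poll (the interface below is a hypothetical mirror of the client call above): zero-valued arguments request no transactions, while the trailing shard state still registers the node with the repo.

public class RegistrationSketch
{
    interface RepoClient
    {
        // Hypothetical mirror of client.getTransactions(...): the trailing argument
        // ships the shard's state to the repo on every call.
        void getTransactions(Long fromTxnId, Long fromCommitTime, Long toTxnId,
                             Long toCommitTime, int maxResults, Object shardState);
    }

    static void register(RepoClient client, Object shardState)
    {
        // Ask for zero transactions; the call exists purely to register shardState.
        client.getTransactions(0L, null, 0L, null, 0, shardState);
    }
}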
@@ -610,6 +619,11 @@ public class MetadataTracker extends AbstractTracker implements Tracker
{
try
{
/*
* This write lock is used to lock out the Commit Tracker. This ensures that the MetadataTracker will
* not be indexing content while commits or rollbacks are occurring.
*/
getWriteLock().acquire();
/*
@@ -621,6 +635,26 @@ public class MetadataTracker extends AbstractTracker implements Tracker
this.state = getTrackerState();
/*
* The fromCommitTime tells getSomeTransactions() where to start; this is actually fairly straightforward.
*
* What makes this code so tricky to understand is the state.getTimeToStopIndexing().
*
* There are two scenarios to keep in mind:
*
* 1) Full re-index: In this scenario the state.getTimeToStopIndexing() will never stop the indexing.
*
* 2) Up-to-date indexing: This is where state.getTimeToStopIndexing() gets interesting. In this scenario
* the Solr index is already up to date with the repo and it is tracking new transactions. The state.getTimeToStopIndexing()
* in this scenario causes the getSomeTransactions() call to stop returning results if it finds a transaction
* beyond a specific point in time. This will break out of this loop and end the tracker run.
*
* The next time the metadata tracker runs, the "continueState()" method applies the "hole retention"
* to state.getLastGoodTxCommitTimeInIndex(). This moves state.getLastGoodTxCommitTimeInIndex() back so the tracker
* re-scans for prior transactions that might have been missed.
*
*/
Long fromCommitTime = getTxFromCommitTime(txnsFound, state.getLastGoodTxCommitTimeInIndex());
transactions = getSomeTransactions(txnsFound, fromCommitTime, TIME_STEP_1_HR_IN_MS, 2000,
state.getTimeToStopIndexing());
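
A rough sketch of the loop shape described above (hypothetical repo interface; not the real getSomeTransactions):

public class PullLoopSketch
{
    interface Repo
    {
        long[] pullTransactions(long fromCommitTime, long toCommitTime);
    }

    static void trackerRun(Repo repo, long fromCommitTime, long timeToStopIndexing)
    {
        final long step = 3_600_000L; // one-hour windows, like TIME_STEP_1_HR_IN_MS
        long from = fromCommitTime;

        // Scenario 1 (full re-index): timeToStopIndexing is effectively never reached.
        // Scenario 2 (up-to-date index): the loop ends once transactions pass the stop
        // time, which ends the tracker run.
        while (from < timeToStopIndexing)
        {
            long to = Math.min(from + step, timeToStopIndexing);
            long[] txns = repo.pullTransactions(from, to);
            // ... index txns ...
            from = to;
        }
    }
}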
@@ -640,6 +674,22 @@ public class MetadataTracker extends AbstractTracker implements Tracker
ArrayList<Transaction> txBatch = new ArrayList<>();
for (Transaction info : transactions.getTransactions()) {
/*
* isInIndex is used to ensure transactions that are being re-pulled due to "hole retention" are not re-indexed if
* they have already been indexed.
*
* The logic in infoSrv.txnInIndex() first checks an in-memory LRU cache for the txnId. If it doesn't find it in the cache,
* it checks the index. The LRU cache is only needed for txnIds that have been indexed but are not yet visible in the index for
* one of two reasons:
*
* 1) The commit tracker has not yet committed the transaction.
* 2) The txnId has been committed to the index but the new searcher has not yet been warmed.
*
* This means that to ensure txnIds are not needlessly reprocessed during hole retention, the LRU cache must be large
* enough to cover the time between when a txnId is indexed and when it becomes visible.
*/
boolean isInIndex = (infoSrv.txnInIndex(info.getId(), true) && info.getCommitTimeMs() <= state.getLastIndexedTxCommitTime());
if (isInIndex) {
txnsFound.add(info);
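
A sketch of the two-level check described above, with a LinkedHashMap standing in for the LRU cache (the size and helper names are hypothetical; the real logic lives in infoSrv.txnInIndex()):

import java.util.LinkedHashMap;
import java.util.Map;

public class TxnInIndexSketch
{
    static final int MAX_SIZE = 250_000; // must cover the indexed-to-visible window

    // Access-ordered LinkedHashMap acting as an LRU cache of recently indexed txnIds.
    static final Map<Long, Boolean> RECENT_TXNS =
            new LinkedHashMap<Long, Boolean>(16, 0.75f, true)
            {
                @Override
                protected boolean removeEldestEntry(Map.Entry<Long, Boolean> eldest)
                {
                    return size() > MAX_SIZE;
                }
            };

    static boolean txnInIndex(long txnId)
    {
        if (RECENT_TXNS.containsKey(txnId))
        {
            return true; // indexed, but possibly not yet visible to searchers
        }
        return txnVisibleInIndex(txnId); // fall back to checking the index itself
    }

    static boolean txnVisibleInIndex(long txnId)
    {
        return false; // stand-in for the real index lookup
    }
}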