/*
 * #%L
 * Alfresco Repository
 * %%
 * Copyright (C) 2005 - 2016 Alfresco Software Limited
 * %%
 * This file is part of the Alfresco software.
 * If the software was purchased under a paid Alfresco license, the terms of
 * the paid license agreement will prevail.  Otherwise, the software is
 * provided under the following open source license terms:
 *
 * Alfresco is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Alfresco is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
 * #L%
 */
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.util.cache.Cache;
import org.apache.lucene.util.cache.SimpleLRUCache;
import org.apache.lucene.util.CloseableThreadLocal;

/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
 * Directory.  Pairs are accessed either by Term or by ordinal position in the
 * set.  */

final class TermInfosReader {
  private Directory directory;
  private String segment;
  private FieldInfos fieldInfos;

  private CloseableThreadLocal threadResources = new CloseableThreadLocal();
  private SegmentTermEnum origEnum;
  private long size;

  private Term[] indexTerms = null;
  private ReentrantReadWriteLock indexTermsLock = new ReentrantReadWriteLock();
  private TermInfo[] indexInfos;
  private long[] indexPointers;

  private SegmentTermEnum indexEnum;

  private int indexDivisor = 1;
  private int totalIndexInterval;

  private final static int DEFAULT_CACHE_SIZE = 1024;

  /**
   * Per-thread resources managed by ThreadLocal
   */
  private static final class ThreadResources {
    SegmentTermEnum termEnum;

    // Used for caching the least recently looked-up Terms
    Cache termInfoCache;
  }

  TermInfosReader(Directory dir, String seg, FieldInfos fis)
       throws CorruptIndexException, IOException {
    this(dir, seg, fis, BufferedIndexInput.BUFFER_SIZE);
  }

  TermInfosReader(Directory dir, String seg, FieldInfos fis, int readBufferSize)
       throws CorruptIndexException, IOException {
    boolean success = false;

    try {
      directory = dir;
      segment = seg;
      fieldInfos = fis;

      origEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_EXTENSION,
          readBufferSize), fieldInfos, false);
      size = origEnum.size;
      totalIndexInterval = origEnum.indexInterval;

      indexEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_INDEX_EXTENSION,
          readBufferSize), fieldInfos, true);

      success = true;
    } finally {
      // With lock-less commits, it's entirely possible (and
      // fine) to hit a FileNotFound exception above. In
      // this case, we want to explicitly close any subset
      // of things that were opened so that we don't have to
      // wait for a GC to do so.
      if (!success) {
        close();
      }
    }
  }

  public int getSkipInterval() {
    return origEnum.skipInterval;
  }

  public int getMaxSkipLevels() {
    return origEnum.maxSkipLevels;
  }

  /**
   * <p>Sets the indexDivisor, which subsamples the number
   * of indexed terms loaded into memory.  This has a
   * similar effect as {@link
   * IndexWriter#setTermIndexInterval} except that setting
   * must be done at indexing time while this setting can be
   * set per reader.  When set to N, then one in every
   * N*termIndexInterval terms in the index is loaded into
   * memory.  By setting this to a value > 1 you can reduce
   * memory usage, at the expense of higher latency when
   * loading a TermInfo.  The default value is 1.</p>
   *
   * <b>NOTE:</b> you must call this before the term
   * index is loaded.  If the index is already loaded,
   * an IllegalStateException is thrown.
   *
   * @throws IllegalStateException if the term index has
   * already been loaded into memory.
   */
  public void setIndexDivisor(int indexDivisor) throws IllegalStateException {
    if (indexDivisor < 1)
      throw new IllegalArgumentException("indexDivisor must be > 0: got " + indexDivisor);

    if (indexTerms != null)
      throw new IllegalStateException("index terms are already loaded");

    this.indexDivisor = indexDivisor;
    totalIndexInterval = origEnum.indexInterval * indexDivisor;
  }
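
  // Illustrative sketch, not part of the original source: with the default
  // term index interval of 128 written at indexing time, setIndexDivisor(4)
  // makes totalIndexInterval = 128 * 4 = 512, so only every fourth index
  // entry is kept in memory (roughly a quarter of the full index) at the
  // cost of longer forward scans inside get(Term).  The segment name "_0"
  // and field "content" below are hypothetical:
  //
  //   TermInfosReader reader = new TermInfosReader(dir, "_0", fieldInfos);
  //   reader.setIndexDivisor(4);   // must be called before the first lookup
  //   TermInfo ti = reader.get(new Term("content", "lucene"));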

  /** Returns the indexDivisor.
   * @see #setIndexDivisor
   */
  public int getIndexDivisor() {
    return indexDivisor;
  }

  final void close() throws IOException {
    if (origEnum != null)
      origEnum.close();
    if (indexEnum != null)
      indexEnum.close();
    threadResources.close();
  }

  /** Returns the number of term/value pairs in the set. */
  final long size() {
    return size;
  }

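  // Descriptive note, not part of the original source: a SegmentTermEnum
  // carries mutable position state, so each thread gets its own clone of
  // origEnum plus a private SimpleLRUCache holding up to DEFAULT_CACHE_SIZE
  // (1024) recently looked-up terms; both are created lazily on first use.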
  private ThreadResources getThreadResources() {
    ThreadResources resources = (ThreadResources)threadResources.get();
    if (resources == null) {
      resources = new ThreadResources();
      resources.termEnum = terms();
      // Cache does not have to be thread-safe, it is only used by one thread at a time
      resources.termInfoCache = new SimpleLRUCache(DEFAULT_CACHE_SIZE);
      threadResources.set(resources);
    }
    return resources;
  }

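  // Illustrative arithmetic, not part of the original source: the index is
  // loaded lazily and subsampled by indexDivisor.  For example, if the term
  // index holds 10 entries and indexDivisor is 3, indexSize = 1 + (10 - 1) / 3
  // = 4, and the entries kept are those at index positions 0, 3, 6 and 9;
  // the inner loop below skips the indexDivisor - 1 entries in between.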
  private void ensureIndexIsRead() throws IOException {
    indexTermsLock.readLock().lock();
    try {
      if (indexTerms != null) {                                  // index already read
        return;                                                  // do nothing
      }
    }
    finally {
      indexTermsLock.readLock().unlock();
    }
    indexTermsLock.writeLock().lock();
    try {
      if (indexTerms != null) {
        return;
      }
      int indexSize = 1+((int)indexEnum.size-1)/indexDivisor;  // otherwise read index

      indexTerms = new Term[indexSize];
      indexInfos = new TermInfo[indexSize];
      indexPointers = new long[indexSize];

      for (int i = 0; indexEnum.next(); i++) {
        indexTerms[i] = indexEnum.term();
        indexInfos[i] = indexEnum.termInfo();
        indexPointers[i] = indexEnum.indexPointer;

        for (int j = 1; j < indexDivisor; j++)
          if (!indexEnum.next())
            break;
      }
    } finally {
      if (indexEnum != null) {
        indexEnum.close();
        indexEnum = null;
      }
      indexTermsLock.writeLock().unlock();
    }
  }

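  // Illustrative example, not part of the original source (field components
  // of the Terms are ignored for brevity): if indexTerms holds {"apple",
  // "mango", "zebra"}, then getIndexOffset for "peach" returns 1, the offset
  // of "mango", the greatest index entry that is less than or equal to the
  // requested term; a term that sorts before every index entry yields -1.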
  /** Returns the offset of the greatest index entry which is less than or equal to term.*/
  private final int getIndexOffset(Term term) {
    int lo = 0;                   // binary search indexTerms[]
    int hi = indexTerms.length - 1;

    while (hi >= lo) {
      int mid = (lo + hi) >>> 1;
      int delta = term.compareTo(indexTerms[mid]);
      if (delta < 0)
        hi = mid - 1;
      else if (delta > 0)
        lo = mid + 1;
      else
        return mid;
    }
    return hi;
  }

  private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
    enumerator.seek(indexPointers[indexOffset],
                   (indexOffset * totalIndexInterval) - 1,
                   indexTerms[indexOffset], indexInfos[indexOffset]);
  }

  /** Returns the TermInfo for a Term in the set, or null. */
  TermInfo get(Term term) throws IOException {
    return get(term, true);
  }

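  // Descriptive note, not part of the original source: the private get below
  // first tries a sequential fast path.  If the requested term is not behind
  // the cached per-thread enum's current position and still falls before the
  // next loaded index entry (indexTerms[enumOffset]), the enum just scans
  // forward without seeking; otherwise a binary search over indexTerms plus
  // seekEnum repositions the enum before scanning.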
  /** Returns the TermInfo for a Term in the set, or null. */
  private TermInfo get(Term term, boolean useCache) throws IOException {
    if (size == 0) return null;

    ensureIndexIsRead();

    TermInfo ti;
    ThreadResources resources = getThreadResources();
    Cache cache = null;

    if (useCache) {
      cache = resources.termInfoCache;
      // check the cache first if the term was recently looked up
      ti = (TermInfo) cache.get(term);
      if (ti != null) {
        return ti;
      }
    }

    // optimize sequential access: first try scanning cached enum w/o seeking
    SegmentTermEnum enumerator = resources.termEnum;
    if (enumerator.term() != null                 // term is at or past current
        && ((enumerator.prev() != null && term.compareTo(enumerator.prev()) > 0)
            || term.compareTo(enumerator.term()) >= 0)) {
      int enumOffset = (int)(enumerator.position/totalIndexInterval)+1;
      if (indexTerms.length == enumOffset     // but before end of block
          || term.compareTo(indexTerms[enumOffset]) < 0) {
        // no need to seek

        int numScans = enumerator.scanTo(term);
        if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
          ti = enumerator.termInfo();
          if (cache != null && numScans > 1) {
            // we only want to put this TermInfo into the cache if
            // scanEnum skipped more than one dictionary entry.
            // This prevents RangeQueries or WildcardQueries from
            // wiping out the cache when they iterate over a large
            // number of terms in order
            cache.put(term, ti);
          }
        } else {
          ti = null;
        }

        return ti;
      }
    }

    // random-access: must seek
    seekEnum(enumerator, getIndexOffset(term));
    enumerator.scanTo(term);
    if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
      ti = enumerator.termInfo();
      if (cache != null) {
        cache.put(term, ti);
      }
    } else {
      ti = null;
    }
    return ti;
  }

  /** Returns the nth term in the set. */
  final Term get(int position) throws IOException {
    if (size == 0) return null;

    SegmentTermEnum enumerator = getThreadResources().termEnum;
    if (enumerator != null && enumerator.term() != null &&
        position >= enumerator.position &&
        position < (enumerator.position + totalIndexInterval))
      return scanEnum(enumerator, position);      // can avoid seek

    seekEnum(enumerator, position/totalIndexInterval); // must seek
    return scanEnum(enumerator, position);
  }

  private final Term scanEnum(SegmentTermEnum enumerator, int position) throws IOException {
    while(enumerator.position < position)
      if (!enumerator.next())
        return null;

    return enumerator.term();
  }

  /** Returns the position of a Term in the set or -1. */
  final long getPosition(Term term) throws IOException {
    if (size == 0) return -1;

    ensureIndexIsRead();
    int indexOffset = getIndexOffset(term);

    SegmentTermEnum enumerator = getThreadResources().termEnum;
    seekEnum(enumerator, indexOffset);

    while(term.compareTo(enumerator.term()) > 0 && enumerator.next()) {}

    if (term.compareTo(enumerator.term()) == 0)
      return enumerator.position;
    else
      return -1;
  }

  /** Returns an enumeration of all the Terms and TermInfos in the set. */
  public SegmentTermEnum terms() {
    return (SegmentTermEnum)origEnum.clone();
  }

  /** Returns an enumeration of terms starting at or after the named term. */
  public SegmentTermEnum terms(Term term) throws IOException {
    // don't use the cache in this call because we want to reposition the
    // enumeration
    get(term, false);
    return (SegmentTermEnum)getThreadResources().termEnum.clone();
  }
}