// Source code of com.greplin.lucene.filter.PhraseFilter

/*
 * Copyright 2013 The greplin-lucene-utils Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package com.greplin.lucene.filter;


import com.google.common.base.Objects;
import com.google.common.collect.Sets;
import com.google.common.primitives.Ints;
import com.greplin.lucene.index.IndexReaders;
import com.greplin.lucene.util.AllDocsIntersectionProvider;
import com.greplin.lucene.util.Intersection;
import com.greplin.lucene.util.IntersectionProvider;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.FixedBitSet;


import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.SortedSet;


/**
 * Filters for documents matching a phrase.
 *
 * Differences from PhraseQuery:
 * - faster with less features
 * - supports a single field only
 * - does not compute score
 * - does not support slop
 *
 * Additional features:
 * - supports AND type intersection queries, this will perform
 *   much better than an external BooleanFilter
 *
 * Optimization notes:
 * - Using a shared TermPositions with seeking saves about 10% !
 */
public class PhraseFilter extends Filter {


  /**
   * The terms comprising the phrase.
   */
  private final Term[] terms;




  /**
   * The intersection provider.
   */
  private final IntersectionProvider intersectionProvider;




  /**
   * Construct a new phrase filter.
   * @param intersectionProvider other doc id set to intersect with
   * @param terms the terms in the phrase
   */
  public PhraseFilter(
      final IntersectionProvider intersectionProvider, final Term... terms) {
    this.terms = Arrays.copyOf(terms, terms.length);
    this.intersectionProvider = intersectionProvider;
  }




  /**
   * Construct a new phrase filter.
   * @param terms the terms in the phrase
   */
  public PhraseFilter(final Term... terms) {
    this(AllDocsIntersectionProvider.INSTANCE, terms);
  }




  /**
   * Construct a new phrase filter.
   * @param field the field to find phrases in
   * @param terms the terms in the phrase
   */
  public PhraseFilter(final String field, final String... terms) {
    this(convertToTerms(field, terms));
  }




  /**
   * Construct a new phrase filter.
   * @param intersectionProvider other doc id set to intersect with
   * @param field the field to find phrases in
   * @param terms the terms in the phrase
   */
  public PhraseFilter(
      final IntersectionProvider intersectionProvider,
      final String field,
      final String... terms) {
    this(intersectionProvider, convertToTerms(field, terms));
  }




  /**
   * Internal utility method that converts a field name and set of values to
   * an array of terms.
   * @param field the field
   * @param values the values
   * @return array of terms, one per value, each with the given field
   */
  private static Term[] convertToTerms(
      final String field, final String... values) {
    Term[] terms = new Term[values.length];
    for (int i = 0; i < values.length; i++) {
      terms[i] = new Term(field, values[i]);
    }
    return terms;
  }




  @Override
  public boolean equals(final Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }


    PhraseFilter that = (PhraseFilter) o;
    return this.intersectionProvider.equals(that.intersectionProvider)
        && Arrays.equals(this.terms, that.terms);
  }




  @Override
  public int hashCode() {
    return Objects.hashCode(
        Arrays.hashCode(this.terms), this.intersectionProvider);
  }




  @Override
  public String toString() {
    return "PhraseFilter{"
        + "terms=" + Arrays.toString(this.terms)
        + ", intersectionProvider=" + this.intersectionProvider
        + '}';
  }




  @Override
  public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {
    List<IndexReader> subReaders = IndexReaders.gatherSubReaders(reader);
    PhraseFilterMatchList[] results =
        new PhraseFilterMatchList[subReaders.size()];
    int matchCount = 0;
    int readerNumber = 0;


    for (IndexReader subReader : subReaders) {
      SortedSet<TermWithFrequency> termsOrderedByFrequency = Sets.newTreeSet();
      for (int i = 0; i < this.terms.length; i++) {
        Term t = this.terms[i];
        termsOrderedByFrequency.add(
            new TermWithFrequency(t, subReader.docFreq(t), i));
      }


      PhraseFilterMatchList matches = null;
      TermPositions termPositions = subReader.termPositions();
      try {
        for (TermWithFrequency term : termsOrderedByFrequency) {
          if (term.docFreq == 0) {
            break;
          }


          termPositions.seek(term.term);


          if (matches == null) {
            // If this is the first term, collect all matches that intersect
            // with the provided initial document set.
            Intersection intersection = this.intersectionProvider.get(reader);


            matches = new PhraseFilterMatchList(term.docFreq);
            while (intersection.advanceToNextIntersection(termPositions)) {
              int freq = termPositions.freq();
              PhraseFilterIntList list = new PhraseFilterIntList(freq);
              for (int i = 0; i < freq; i++) {
                list.add(termPositions.nextPosition() - term.offset);
              }
              matches.add(termPositions.doc(), list);
            }
          } else {
            // Otherwise, intersect with the existing matches.
            matches.intersect(termPositions, term.offset);
          }


          if (matches.getCount() == 0) {
            break;
          }
        }
      } finally {
        termPositions.close();
      }


      if (matches != null) {
        results[readerNumber] = matches;
        matchCount += matches.getCount();
      }
      readerNumber++;
    }


    final int bitsPerIntPowerLogTwo = 5; // 2^5 = 32
    if (matchCount > reader.maxDoc() >> bitsPerIntPowerLogTwo) {
      FixedBitSet result = new FixedBitSet(reader.maxDoc());
      int readerOffset = 0;
      for (int readerIndex = 0; readerIndex < results.length; readerIndex++) {
        PhraseFilterMatchList matches = results[readerIndex];
        if (matches != null) {
          int count = matches.getCount();
          int[] docIds = matches.getDocIds();
          for (int i = 0; i < count; i++) {
            result.set(docIds[i] + readerOffset);
          }
        }
        readerOffset += subReaders.get(readerIndex).maxDoc();
      }
      return result;
    } else if (matchCount == 0) {
      return DocIdSets.EMPTY;
    } else {
      int[] result = new int[matchCount];
      int base = 0;
      int readerOffset = 0;
      for (int readerIndex = 0; readerIndex < results.length; readerIndex++) {
        PhraseFilterMatchList matches = results[readerIndex];
        if (matches != null) {
          int count = matches.getCount();
          int[] docIds = matches.getDocIds();
          for (int i = 0; i < count; i++) {
            result[base + i] = docIds[i] + readerOffset;
          }
          base += count;
        }
        readerOffset += subReaders.get(readerIndex).maxDoc();
      }
      return new SortedIntArrayDocIdSet(result);
    }
  }




  /**
   * DocId set based on a sorted array of integers.
   * The integer array is not defensively copied - so don't modify it!
   */
  private static final class SortedIntArrayDocIdSet extends DocIdSet {


    /**
     * The sorted array of integers.
     */
    private final int[] ints;




    /**
     * Constructs a new doc id set.
     * @param ints sorted array of integers
     */
    private SortedIntArrayDocIdSet(final int[] ints) {
      this.ints = ints;
    }




    @Override
    public DocIdSetIterator iterator() throws IOException {
      return new SortedIntArrayDocIdSetIterator(this.ints);
    }




    @Override
    public boolean isCacheable() {
      return true;
    }


  }




  /**
   * Iterator for sorted integer array.
   */
  private static final class SortedIntArrayDocIdSetIterator
      extends DocIdSetIterator {


    /**
     * The list of integers.
     */
    private final int[] ints;


    /**
     * The active index.
     */
    private int index = -1;




    /**
     * Constructs an iterator over a sorted integer array.
     * @param ints the array of integers
     */
    private SortedIntArrayDocIdSetIterator(final int[] ints) {
      this.ints = ints;
    }




    @Override
    public int docID() {
      return this.index < this.ints.length
          ? this.ints[this.index] : NO_MORE_DOCS;
    }




    @Override
    public int nextDoc() throws IOException {
      ++this.index;
      return this.index < this.ints.length
          ? this.ints[this.index] : NO_MORE_DOCS;
    }




    @Override
    public int advance(final int target) throws IOException {
      // TODO(robbyw): Consider doing binary search here.
      // Though, in practice, the array is probably small enough that this
      // would actually be slower.
      while (docID() < target) {
        nextDoc();
      }
      return docID();
    }


  }




  /**
   * A term with a frequency and offset.
   */
  private static final class TermWithFrequency
      implements Comparable<TermWithFrequency> {


    /**
     * The term.
     */
    private final Term term;


    /**
     * Its frequency.
     */
    private final int docFreq;


    /**
     * Offset within the phrase.
     */
    private final int offset;




    /**
     * Construct a term with frequency struct.
     * @param term the term
     * @param docFreq its frequency
     * @param offset offset within the phrase
     */
    private TermWithFrequency(
        final Term term, final int docFreq, final int offset) {
      this.term = term;
      this.docFreq = docFreq;
      this.offset = offset;
    }


    @Override
    public int compareTo(final TermWithFrequency o) {
      int first = Ints.compare(this.docFreq, o.docFreq);
      return first == 0 ? Ints.compare(this.offset, o.offset) : first;
    }


  }


}
// Source code of com.greplin.lucene.filter.PhraseFilter

// Related classes of com.greplin.lucene.filter.PhraseFilter