Source Code of org.apache.lucene.facet.sortedset.SortedSetDocValuesAccumulator$TopCountPQ

package org.apache.lucene.facet.sortedset;


/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;


import org.apache.lucene.facet.params.CategoryListParams;
import org.apache.lucene.facet.params.FacetSearchParams;
import org.apache.lucene.facet.search.CountFacetRequest;
import org.apache.lucene.facet.search.FacetArrays;
import org.apache.lucene.facet.search.FacetRequest;
import org.apache.lucene.facet.search.FacetResult;
import org.apache.lucene.facet.search.FacetResultNode;
import org.apache.lucene.facet.search.FacetsAccumulator;
import org.apache.lucene.facet.search.FacetsCollector.MatchingDocs;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;


/** A {@link FacetsAccumulator} that uses previously
 *  indexed {@link SortedSetDocValuesFacetFields} to perform faceting,
 *  without require a separate taxonomy index.  Faceting is
 *  a bit slower (~25%), and there is added cost on every
 *  {@link IndexReader} open to create a new {@link
 *  SortedSetDocValuesReaderState}.  Furthermore, this does
 *  not support hierarchical facets; only flat (dimension +
 *  label) facets, but it uses quite a bit less RAM to do so. */
public class SortedSetDocValuesAccumulator extends FacetsAccumulator {


  final SortedSetDocValuesReaderState state;
  final SortedSetDocValues dv;
  final String field;
  final FacetArrays facetArrays;
  
  /** Constructor with the given facet search params. */
  public SortedSetDocValuesAccumulator(SortedSetDocValuesReaderState state, FacetSearchParams fsp) 
      throws IOException {
    this(state, fsp, null);
  }
  
  public SortedSetDocValuesAccumulator(SortedSetDocValuesReaderState state, FacetSearchParams fsp, FacetArrays arrays) 
      throws IOException {
    super(fsp);
    this.state = state;
    this.field = state.getField();
    this.facetArrays = arrays == null ? new FacetArrays(state.getSize()) : arrays;
    dv = state.getDocValues();


    // Check params:
    for (FacetRequest fr : fsp.facetRequests) {
      if (!(fr instanceof CountFacetRequest)) {
        throw new IllegalArgumentException("this accumulator only supports CountFacetRequest; got " + fr);
      }
      if (fr.categoryPath.length != 1) {
        throw new IllegalArgumentException("this accumulator only supports 1-level CategoryPath; got " + fr.categoryPath);
      }
      if (fr.getDepth() != 1) {
        throw new IllegalArgumentException("this accumulator only supports depth=1; got " + fr.getDepth());
      }
      String dim = fr.categoryPath.components[0];


      SortedSetDocValuesReaderState.OrdRange ordRange = state.getOrdRange(dim);
      if (ordRange == null) {
        throw new IllegalArgumentException("dim \"" + dim + "\" does not exist");
      }
    }
  }


  /** Keeps highest count results. */
  static class TopCountPQ extends PriorityQueue<FacetResultNode> {
    public TopCountPQ(int topN) {
      super(topN, false);
    }


    @Override
    protected boolean lessThan(FacetResultNode a, FacetResultNode b) {
      if (a.value < b.value) {
        return true;
      } else if (a.value > b.value) {
        return false;
      } else {
        return a.ordinal > b.ordinal;
      }
    }
  }


  static class SortedSetAggregator {


    private final SortedSetDocValuesReaderState state;
    private final String field;
    private final SortedSetDocValues dv;
    
    public SortedSetAggregator(String field, SortedSetDocValuesReaderState state, SortedSetDocValues dv) {
      this.field = field;
      this.state = state;
      this.dv = dv;
    }
    
    public void aggregate(MatchingDocs matchingDocs, FacetArrays facetArrays) throws IOException {


      AtomicReader reader = matchingDocs.context.reader();


      // LUCENE-5090: make sure the provided reader context "matches"
      // the top-level reader passed to the
      // SortedSetDocValuesReaderState, else cryptic
      // AIOOBE can happen:
      if (ReaderUtil.getTopLevelContext(matchingDocs.context).reader() != state.origReader) {
        throw new IllegalStateException("the SortedSetDocValuesReaderState provided to this class does not match the reader being searched; you must create a new SortedSetDocValuesReaderState every time you open a new IndexReader");
      }
      
      SortedSetDocValues segValues = reader.getSortedSetDocValues(field);
      if (segValues == null) {
        return;
      }


      final int[] counts = facetArrays.getIntArray();
      final int maxDoc = reader.maxDoc();
      assert maxDoc == matchingDocs.bits.length();


      if (dv instanceof MultiSortedSetDocValues) {
        MultiDocValues.OrdinalMap ordinalMap = ((MultiSortedSetDocValues) dv).mapping;
        int segOrd = matchingDocs.context.ord;


        int numSegOrds = (int) segValues.getValueCount();


        if (matchingDocs.totalHits < numSegOrds/10) {
          // Remap every ord to global ord as we iterate:
          int doc = 0;
          while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) {
            segValues.setDocument(doc);
            int term = (int) segValues.nextOrd();
            while (term != SortedSetDocValues.NO_MORE_ORDS) {
              counts[(int) ordinalMap.getGlobalOrd(segOrd, term)]++;
              term = (int) segValues.nextOrd();
            }
            ++doc;
          }
        } else {


          // First count in seg-ord space:
          final int[] segCounts = new int[numSegOrds];
          int doc = 0;
          while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) {
            segValues.setDocument(doc);
            int term = (int) segValues.nextOrd();
            while (term != SortedSetDocValues.NO_MORE_ORDS) {
              segCounts[term]++;
              term = (int) segValues.nextOrd();
            }
            ++doc;
          }


          // Then, migrate to global ords:
          for(int ord=0;ord<numSegOrds;ord++) {
            int count = segCounts[ord];
            if (count != 0) {
              counts[(int) ordinalMap.getGlobalOrd(segOrd, ord)] += count;
            }
          }
        }
      } else {
        // No ord mapping (e.g., single segment index):
        // just aggregate directly into counts:


        int doc = 0;
        while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) {
          segValues.setDocument(doc);
          int term = (int) segValues.nextOrd();
          while (term != SortedSetDocValues.NO_MORE_ORDS) {
            counts[term]++;
            term = (int) segValues.nextOrd();
          }
          ++doc;
        }
      }
    }


  }
  
  @Override
  public List<FacetResult> accumulate(List<MatchingDocs> matchingDocs) throws IOException {


    SortedSetAggregator aggregator = new SortedSetAggregator(field, state, dv);
    for (MatchingDocs md : matchingDocs) {
      aggregator.aggregate(md, facetArrays);
    }


    // compute top-K
    List<FacetResult> results = new ArrayList<FacetResult>();


    int[] counts = facetArrays.getIntArray();


    BytesRef scratch = new BytesRef();


    for (FacetRequest request : searchParams.facetRequests) {
      String dim = request.categoryPath.components[0];
      SortedSetDocValuesReaderState.OrdRange ordRange = state.getOrdRange(dim);
      // checked in ctor:
      assert ordRange != null;


      if (request.numResults >= ordRange.end - ordRange.start + 1) {
        // specialize this case, user is interested in all available results
        ArrayList<FacetResultNode> nodes = new ArrayList<FacetResultNode>();
        int dimCount = 0;
        for(int ord=ordRange.start; ord<=ordRange.end; ord++) {
          //System.out.println("  ord=" + ord + " count= "+ counts[ord] + " bottomCount=" + bottomCount);
          if (counts[ord] != 0) {
            dimCount += counts[ord];
            FacetResultNode node = new FacetResultNode(ord, counts[ord]);
            dv.lookupOrd(ord, scratch);
            node.label = new CategoryPath(scratch.utf8ToString().split(state.separatorRegex, 2));
            nodes.add(node);
          }
        }


        Collections.sort(nodes, new Comparator<FacetResultNode>() {
            @Override
            public int compare(FacetResultNode o1, FacetResultNode o2) {
              // First by highest count
              int value = (int) (o2.value - o1.value);
              if (value == 0) {
                // ... then by lowest ord:
                value = o1.ordinal - o2.ordinal;
              }
              return value;
            }
          });
      
        CategoryListParams.OrdinalPolicy op = searchParams.indexingParams.getCategoryListParams(request.categoryPath).getOrdinalPolicy(dim);
        if (op == CategoryListParams.OrdinalPolicy.ALL_BUT_DIMENSION) {
          dimCount = 0;
        }


        FacetResultNode rootNode = new FacetResultNode(-1, dimCount);
        rootNode.label = new CategoryPath(new String[] {dim});
        rootNode.subResults = nodes;
        results.add(new FacetResult(request, rootNode, nodes.size()));
        continue;
      }


      TopCountPQ q = new TopCountPQ(request.numResults);


      int bottomCount = 0;


      //System.out.println("collect");
      int dimCount = 0;
      int childCount = 0;
      FacetResultNode reuse = null;
      for(int ord=ordRange.start; ord<=ordRange.end; ord++) {
        //System.out.println("  ord=" + ord + " count= "+ counts[ord] + " bottomCount=" + bottomCount);
        if (counts[ord] > 0) {
          childCount++;
          if (counts[ord] > bottomCount) {
            dimCount += counts[ord];
            //System.out.println("    keep");
            if (reuse == null) {
              reuse = new FacetResultNode(ord, counts[ord]);
            } else {
              reuse.ordinal = ord;
              reuse.value = counts[ord];
            }
            reuse = q.insertWithOverflow(reuse);
            if (q.size() == request.numResults) {
              bottomCount = (int) q.top().value;
              //System.out.println("    new bottom=" + bottomCount);
            }
          }
        }
      }


      CategoryListParams.OrdinalPolicy op = searchParams.indexingParams.getCategoryListParams(request.categoryPath).getOrdinalPolicy(dim);
      if (op == CategoryListParams.OrdinalPolicy.ALL_BUT_DIMENSION) {
        dimCount = 0;
      }


      FacetResultNode rootNode = new FacetResultNode(-1, dimCount);
      rootNode.label = new CategoryPath(new String[] {dim});


      FacetResultNode[] childNodes = new FacetResultNode[q.size()];
      for(int i=childNodes.length-1;i>=0;i--) {
        childNodes[i] = q.pop();
        dv.lookupOrd(childNodes[i].ordinal, scratch);
        childNodes[i].label = new CategoryPath(scratch.utf8ToString().split(state.separatorRegex, 2));
      }
      rootNode.subResults = Arrays.asList(childNodes);
      
      results.add(new FacetResult(request, rootNode, childCount));
    }


    return results;
  }
  
  @Override
  public boolean requiresDocScores() {
    return false;
  }
  
}
Source Code of org.apache.lucene.facet.sortedset.SortedSetDocValuesAccumulator$TopCountPQ

Related Classes of org.apache.lucene.facet.sortedset.SortedSetDocValuesAccumulator$TopCountPQ