Package org.apache.lucene.facet.sortedset

Source Code of org.apache.lucene.facet.sortedset.SortedSetDocValuesAccumulator$TopCountPQ

package org.apache.lucene.facet.sortedset;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.apache.lucene.facet.params.CategoryListParams;
import org.apache.lucene.facet.params.FacetSearchParams;
import org.apache.lucene.facet.search.CountFacetRequest;
import org.apache.lucene.facet.search.FacetArrays;
import org.apache.lucene.facet.search.FacetRequest;
import org.apache.lucene.facet.search.FacetResult;
import org.apache.lucene.facet.search.FacetResultNode;
import org.apache.lucene.facet.search.FacetsAccumulator;
import org.apache.lucene.facet.search.FacetsAggregator;
import org.apache.lucene.facet.search.FacetsCollector.MatchingDocs;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;

/** A {@link FacetsAccumulator} that uses previously
*  indexed {@link SortedSetDocValuesFacetFields} to perform faceting,
*  without require a separate taxonomy index.  Faceting is
*  a bit slower (~25%), and there is added cost on every
{@link IndexReader} open to create a new {@link
*  SortedSetDocValuesReaderState}.  Furthermore, this does
*  not support hierarchical facets; only flat (dimension +
*  label) facets, but it uses quite a bit less RAM to do so. */
public class SortedSetDocValuesAccumulator extends FacetsAccumulator {

  final SortedSetDocValuesReaderState state;
  final SortedSetDocValues dv;
  final String field;

  public SortedSetDocValuesAccumulator(FacetSearchParams fsp, SortedSetDocValuesReaderState state) throws IOException {
    super(fsp, null, null, new FacetArrays(state.getSize()));
    this.state = state;
    this.field = state.getField();
    dv = state.getDocValues();

    // Check params:
    for(FacetRequest request : fsp.facetRequests) {
      if (!(request instanceof CountFacetRequest)) {
        throw new IllegalArgumentException("this collector only supports CountFacetRequest; got " + request);
      }
      if (request.categoryPath.length != 1) {
        throw new IllegalArgumentException("this collector only supports depth 1 CategoryPath; got " + request.categoryPath);
      }
      if (request.getDepth() != 1) {
        throw new IllegalArgumentException("this collector only supports depth=1; got " + request.getDepth());
      }
      String dim = request.categoryPath.components[0];

      SortedSetDocValuesReaderState.OrdRange ordRange = state.getOrdRange(dim);
      if (ordRange == null) {
        throw new IllegalArgumentException("dim \"" + dim + "\" does not exist");
      }
    }
  }

  @Override
  public FacetsAggregator getAggregator() {

    return new FacetsAggregator() {

      @Override
      public void aggregate(MatchingDocs matchingDocs, CategoryListParams clp, FacetArrays facetArrays) throws IOException {

        AtomicReader reader = matchingDocs.context.reader();

        // LUCENE-5090: make sure the provided reader context "matches"
        // the top-level reader passed to the
        // SortedSetDocValuesReaderState, else cryptic
        // AIOOBE can happen:
        if (ReaderUtil.getTopLevelContext(matchingDocs.context).reader() != state.origReader) {
          throw new IllegalStateException("the SortedSetDocValuesReaderState provided to this class does not match the reader being searched; you must create a new SortedSetDocValuesReaderState every time you open a new IndexReader");
        }
       
        SortedSetDocValues segValues = reader.getSortedSetDocValues(field);
        if (segValues == null) {
          return;
        }

        final int[] counts = facetArrays.getIntArray();
        final int maxDoc = reader.maxDoc();
        assert maxDoc == matchingDocs.bits.length();

        if (dv instanceof MultiSortedSetDocValues) {
          MultiDocValues.OrdinalMap ordinalMap = ((MultiSortedSetDocValues) dv).mapping;
          int segOrd = matchingDocs.context.ord;

          int numSegOrds = (int) segValues.getValueCount();

          if (matchingDocs.totalHits < numSegOrds/10) {
            // Remap every ord to global ord as we iterate:
            int doc = 0;
            while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) {
              segValues.setDocument(doc);
              int term = (int) segValues.nextOrd();
              while (term != SortedSetDocValues.NO_MORE_ORDS) {
                counts[(int) ordinalMap.getGlobalOrd(segOrd, term)]++;
                term = (int) segValues.nextOrd();
              }
              ++doc;
            }
          } else {

            // First count in seg-ord space:
            final int[] segCounts = new int[numSegOrds];
            int doc = 0;
            while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) {
              segValues.setDocument(doc);
              int term = (int) segValues.nextOrd();
              while (term != SortedSetDocValues.NO_MORE_ORDS) {
                segCounts[term]++;
                term = (int) segValues.nextOrd();
              }
              ++doc;
            }

            // Then, migrate to global ords:
            for(int ord=0;ord<numSegOrds;ord++) {
              int count = segCounts[ord];
              if (count != 0) {
                counts[(int) ordinalMap.getGlobalOrd(segOrd, ord)] += count;
              }
            }
          }
        } else {
          // No ord mapping (e.g., single segment index):
          // just aggregate directly into counts:

          int doc = 0;
          while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) {
            segValues.setDocument(doc);
            int term = (int) segValues.nextOrd();
            while (term != SortedSetDocValues.NO_MORE_ORDS) {
              counts[term]++;
              term = (int) segValues.nextOrd();
            }
            ++doc;
          }
        }
      }

      @Override
      public void rollupValues(FacetRequest fr, int ordinal, int[] children, int[] siblings, FacetArrays facetArrays) {
        // Nothing to do here: we only support flat (dim +
        // label) facets, and in accumulate we sum up the
        // count for the dimension.
      }

      @Override
      public boolean requiresDocScores() {
        return false;
      }
    };
  }

  /** Keeps highest count results. */
  static class TopCountPQ extends PriorityQueue<FacetResultNode> {
    public TopCountPQ(int topN) {
      super(topN, false);
    }

    @Override
    protected boolean lessThan(FacetResultNode a, FacetResultNode b) {
      if (a.value < b.value) {
        return true;
      } else if (a.value > b.value) {
        return false;
      } else {
        return a.ordinal > b.ordinal;
      }
    }
  }

  @Override
  public List<FacetResult> accumulate(List<MatchingDocs> matchingDocs) throws IOException {

    FacetsAggregator aggregator = getAggregator();
    for (CategoryListParams clp : getCategoryLists()) {
      for (MatchingDocs md : matchingDocs) {
        aggregator.aggregate(md, clp, facetArrays);
      }
    }

    // compute top-K
    List<FacetResult> results = new ArrayList<FacetResult>();

    int[] counts = facetArrays.getIntArray();

    BytesRef scratch = new BytesRef();

    for(FacetRequest request : searchParams.facetRequests) {
      String dim = request.categoryPath.components[0];
      SortedSetDocValuesReaderState.OrdRange ordRange = state.getOrdRange(dim);
      // checked in ctor:
      assert ordRange != null;

      if (request.numResults >= ordRange.end - ordRange.start + 1) {
        // specialize this case, user is interested in all available results
        ArrayList<FacetResultNode> nodes = new ArrayList<FacetResultNode>();
        int dimCount = 0;
        for(int ord=ordRange.start; ord<=ordRange.end; ord++) {
          //System.out.println("  ord=" + ord + " count= "+ counts[ord] + " bottomCount=" + bottomCount);
          if (counts[ord] != 0) {
            dimCount += counts[ord];
            FacetResultNode node = new FacetResultNode(ord, counts[ord]);
            dv.lookupOrd(ord, scratch);
            node.label = new CategoryPath(scratch.utf8ToString().split(state.separatorRegex, 2));
            nodes.add(node);
          }
        }

        Collections.sort(nodes, new Comparator<FacetResultNode>() {
            @Override
            public int compare(FacetResultNode o1, FacetResultNode o2) {
              // First by highest count
              int value = (int) (o2.value - o1.value);
              if (value == 0) {
                // ... then by lowest ord:
                value = o1.ordinal - o2.ordinal;
              }
              return value;
            }
          });
     
        CategoryListParams.OrdinalPolicy op = searchParams.indexingParams.getCategoryListParams(request.categoryPath).getOrdinalPolicy(dim);
        if (op == CategoryListParams.OrdinalPolicy.ALL_BUT_DIMENSION) {
          dimCount = 0;
        }

        FacetResultNode rootNode = new FacetResultNode(-1, dimCount);
        rootNode.label = new CategoryPath(new String[] {dim});
        rootNode.subResults = nodes;
        results.add(new FacetResult(request, rootNode, nodes.size()));
        continue;
      }

      TopCountPQ q = new TopCountPQ(request.numResults);

      int bottomCount = 0;

      //System.out.println("collect");
      int dimCount = 0;
      int childCount = 0;
      FacetResultNode reuse = null;
      for(int ord=ordRange.start; ord<=ordRange.end; ord++) {
        //System.out.println("  ord=" + ord + " count= "+ counts[ord] + " bottomCount=" + bottomCount);
        if (counts[ord] > 0) {
          childCount++;
          if (counts[ord] > bottomCount) {
            dimCount += counts[ord];
            //System.out.println("    keep");
            if (reuse == null) {
              reuse = new FacetResultNode(ord, counts[ord]);
            } else {
              reuse.ordinal = ord;
              reuse.value = counts[ord];
            }
            reuse = q.insertWithOverflow(reuse);
            if (q.size() == request.numResults) {
              bottomCount = (int) q.top().value;
              //System.out.println("    new bottom=" + bottomCount);
            }
          }
        }
      }

      CategoryListParams.OrdinalPolicy op = searchParams.indexingParams.getCategoryListParams(request.categoryPath).getOrdinalPolicy(dim);
      if (op == CategoryListParams.OrdinalPolicy.ALL_BUT_DIMENSION) {
        dimCount = 0;
      }

      FacetResultNode rootNode = new FacetResultNode(-1, dimCount);
      rootNode.label = new CategoryPath(new String[] {dim});

      FacetResultNode[] childNodes = new FacetResultNode[q.size()];
      for(int i=childNodes.length-1;i>=0;i--) {
        childNodes[i] = q.pop();
        dv.lookupOrd(childNodes[i].ordinal, scratch);
        childNodes[i].label = new CategoryPath(scratch.utf8ToString().split(state.separatorRegex, 2));
      }
      rootNode.subResults = Arrays.asList(childNodes);
     
      results.add(new FacetResult(request, rootNode, childCount));
    }

    return results;
  }
}
TOP

Related Classes of org.apache.lucene.facet.sortedset.SortedSetDocValuesAccumulator$TopCountPQ

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.