Package org.apache.lucene.facet.sortedset

Source Code of org.apache.lucene.facet.sortedset.SortedSetDocValuesAccumulator$TopCountPQ

package org.apache.lucene.facet.sortedset;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.apache.lucene.facet.params.CategoryListParams;
import org.apache.lucene.facet.params.FacetSearchParams;
import org.apache.lucene.facet.search.CountFacetRequest;
import org.apache.lucene.facet.search.FacetArrays;
import org.apache.lucene.facet.search.FacetRequest;
import org.apache.lucene.facet.search.FacetResult;
import org.apache.lucene.facet.search.FacetResultNode;
import org.apache.lucene.facet.search.FacetsAccumulator;
import org.apache.lucene.facet.search.FacetsCollector.MatchingDocs;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;

/** A {@link FacetsAccumulator} that uses previously
*  indexed {@link SortedSetDocValuesFacetFields} to perform faceting,
*  without require a separate taxonomy index.  Faceting is
*  a bit slower (~25%), and there is added cost on every
{@link IndexReader} open to create a new {@link
*  SortedSetDocValuesReaderState}.  Furthermore, this does
*  not support hierarchical facets; only flat (dimension +
*  label) facets, but it uses quite a bit less RAM to do so. */
public class SortedSetDocValuesAccumulator extends FacetsAccumulator {

  final SortedSetDocValuesReaderState state;
  final SortedSetDocValues dv;
  final String field;
  final FacetArrays facetArrays;
 
  /** Constructor with the given facet search params. */
  public SortedSetDocValuesAccumulator(SortedSetDocValuesReaderState state, FacetSearchParams fsp)
      throws IOException {
    this(state, fsp, null);
  }
 
  public SortedSetDocValuesAccumulator(SortedSetDocValuesReaderState state, FacetSearchParams fsp, FacetArrays arrays)
      throws IOException {
    super(fsp);
    this.state = state;
    this.field = state.getField();
    this.facetArrays = arrays == null ? new FacetArrays(state.getSize()) : arrays;
    dv = state.getDocValues();

    // Check params:
    for (FacetRequest fr : fsp.facetRequests) {
      if (!(fr instanceof CountFacetRequest)) {
        throw new IllegalArgumentException("this accumulator only supports CountFacetRequest; got " + fr);
      }
      if (fr.categoryPath.length != 1) {
        throw new IllegalArgumentException("this accumulator only supports 1-level CategoryPath; got " + fr.categoryPath);
      }
      if (fr.getDepth() != 1) {
        throw new IllegalArgumentException("this accumulator only supports depth=1; got " + fr.getDepth());
      }
      String dim = fr.categoryPath.components[0];

      SortedSetDocValuesReaderState.OrdRange ordRange = state.getOrdRange(dim);
      if (ordRange == null) {
        throw new IllegalArgumentException("dim \"" + dim + "\" does not exist");
      }
    }
  }

  /** Keeps highest count results. */
  static class TopCountPQ extends PriorityQueue<FacetResultNode> {
    public TopCountPQ(int topN) {
      super(topN, false);
    }

    @Override
    protected boolean lessThan(FacetResultNode a, FacetResultNode b) {
      if (a.value < b.value) {
        return true;
      } else if (a.value > b.value) {
        return false;
      } else {
        return a.ordinal > b.ordinal;
      }
    }
  }

  static class SortedSetAggregator {

    private final SortedSetDocValuesReaderState state;
    private final String field;
    private final SortedSetDocValues dv;
   
    public SortedSetAggregator(String field, SortedSetDocValuesReaderState state, SortedSetDocValues dv) {
      this.field = field;
      this.state = state;
      this.dv = dv;
    }
   
    public void aggregate(MatchingDocs matchingDocs, FacetArrays facetArrays) throws IOException {

      AtomicReader reader = matchingDocs.context.reader();

      // LUCENE-5090: make sure the provided reader context "matches"
      // the top-level reader passed to the
      // SortedSetDocValuesReaderState, else cryptic
      // AIOOBE can happen:
      if (ReaderUtil.getTopLevelContext(matchingDocs.context).reader() != state.origReader) {
        throw new IllegalStateException("the SortedSetDocValuesReaderState provided to this class does not match the reader being searched; you must create a new SortedSetDocValuesReaderState every time you open a new IndexReader");
      }
     
      SortedSetDocValues segValues = reader.getSortedSetDocValues(field);
      if (segValues == null) {
        return;
      }

      final int[] counts = facetArrays.getIntArray();
      final int maxDoc = reader.maxDoc();
      assert maxDoc == matchingDocs.bits.length();

      if (dv instanceof MultiSortedSetDocValues) {
        MultiDocValues.OrdinalMap ordinalMap = ((MultiSortedSetDocValues) dv).mapping;
        int segOrd = matchingDocs.context.ord;

        int numSegOrds = (int) segValues.getValueCount();

        if (matchingDocs.totalHits < numSegOrds/10) {
          // Remap every ord to global ord as we iterate:
          int doc = 0;
          while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) {
            segValues.setDocument(doc);
            int term = (int) segValues.nextOrd();
            while (term != SortedSetDocValues.NO_MORE_ORDS) {
              counts[(int) ordinalMap.getGlobalOrd(segOrd, term)]++;
              term = (int) segValues.nextOrd();
            }
            ++doc;
          }
        } else {

          // First count in seg-ord space:
          final int[] segCounts = new int[numSegOrds];
          int doc = 0;
          while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) {
            segValues.setDocument(doc);
            int term = (int) segValues.nextOrd();
            while (term != SortedSetDocValues.NO_MORE_ORDS) {
              segCounts[term]++;
              term = (int) segValues.nextOrd();
            }
            ++doc;
          }

          // Then, migrate to global ords:
          for(int ord=0;ord<numSegOrds;ord++) {
            int count = segCounts[ord];
            if (count != 0) {
              counts[(int) ordinalMap.getGlobalOrd(segOrd, ord)] += count;
            }
          }
        }
      } else {
        // No ord mapping (e.g., single segment index):
        // just aggregate directly into counts:

        int doc = 0;
        while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) {
          segValues.setDocument(doc);
          int term = (int) segValues.nextOrd();
          while (term != SortedSetDocValues.NO_MORE_ORDS) {
            counts[term]++;
            term = (int) segValues.nextOrd();
          }
          ++doc;
        }
      }
    }

  }
 
  @Override
  public List<FacetResult> accumulate(List<MatchingDocs> matchingDocs) throws IOException {

    SortedSetAggregator aggregator = new SortedSetAggregator(field, state, dv);
    for (MatchingDocs md : matchingDocs) {
      aggregator.aggregate(md, facetArrays);
    }

    // compute top-K
    List<FacetResult> results = new ArrayList<FacetResult>();

    int[] counts = facetArrays.getIntArray();

    BytesRef scratch = new BytesRef();

    for (FacetRequest request : searchParams.facetRequests) {
      String dim = request.categoryPath.components[0];
      SortedSetDocValuesReaderState.OrdRange ordRange = state.getOrdRange(dim);
      // checked in ctor:
      assert ordRange != null;

      if (request.numResults >= ordRange.end - ordRange.start + 1) {
        // specialize this case, user is interested in all available results
        ArrayList<FacetResultNode> nodes = new ArrayList<FacetResultNode>();
        int dimCount = 0;
        for(int ord=ordRange.start; ord<=ordRange.end; ord++) {
          //System.out.println("  ord=" + ord + " count= "+ counts[ord] + " bottomCount=" + bottomCount);
          if (counts[ord] != 0) {
            dimCount += counts[ord];
            FacetResultNode node = new FacetResultNode(ord, counts[ord]);
            dv.lookupOrd(ord, scratch);
            node.label = new CategoryPath(scratch.utf8ToString().split(state.separatorRegex, 2));
            nodes.add(node);
          }
        }

        Collections.sort(nodes, new Comparator<FacetResultNode>() {
            @Override
            public int compare(FacetResultNode o1, FacetResultNode o2) {
              // First by highest count
              int value = (int) (o2.value - o1.value);
              if (value == 0) {
                // ... then by lowest ord:
                value = o1.ordinal - o2.ordinal;
              }
              return value;
            }
          });
     
        CategoryListParams.OrdinalPolicy op = searchParams.indexingParams.getCategoryListParams(request.categoryPath).getOrdinalPolicy(dim);
        if (op == CategoryListParams.OrdinalPolicy.ALL_BUT_DIMENSION) {
          dimCount = 0;
        }

        FacetResultNode rootNode = new FacetResultNode(-1, dimCount);
        rootNode.label = new CategoryPath(new String[] {dim});
        rootNode.subResults = nodes;
        results.add(new FacetResult(request, rootNode, nodes.size()));
        continue;
      }

      TopCountPQ q = new TopCountPQ(request.numResults);

      int bottomCount = 0;

      //System.out.println("collect");
      int dimCount = 0;
      int childCount = 0;
      FacetResultNode reuse = null;
      for(int ord=ordRange.start; ord<=ordRange.end; ord++) {
        //System.out.println("  ord=" + ord + " count= "+ counts[ord] + " bottomCount=" + bottomCount);
        if (counts[ord] > 0) {
          childCount++;
          if (counts[ord] > bottomCount) {
            dimCount += counts[ord];
            //System.out.println("    keep");
            if (reuse == null) {
              reuse = new FacetResultNode(ord, counts[ord]);
            } else {
              reuse.ordinal = ord;
              reuse.value = counts[ord];
            }
            reuse = q.insertWithOverflow(reuse);
            if (q.size() == request.numResults) {
              bottomCount = (int) q.top().value;
              //System.out.println("    new bottom=" + bottomCount);
            }
          }
        }
      }

      CategoryListParams.OrdinalPolicy op = searchParams.indexingParams.getCategoryListParams(request.categoryPath).getOrdinalPolicy(dim);
      if (op == CategoryListParams.OrdinalPolicy.ALL_BUT_DIMENSION) {
        dimCount = 0;
      }

      FacetResultNode rootNode = new FacetResultNode(-1, dimCount);
      rootNode.label = new CategoryPath(new String[] {dim});

      FacetResultNode[] childNodes = new FacetResultNode[q.size()];
      for(int i=childNodes.length-1;i>=0;i--) {
        childNodes[i] = q.pop();
        dv.lookupOrd(childNodes[i].ordinal, scratch);
        childNodes[i].label = new CategoryPath(scratch.utf8ToString().split(state.separatorRegex, 2));
      }
      rootNode.subResults = Arrays.asList(childNodes);
     
      results.add(new FacetResult(request, rootNode, childCount));
    }

    return results;
  }
 
  @Override
  public boolean requiresDocScores() {
    return false;
  }
 
}
TOP

Related Classes of org.apache.lucene.facet.sortedset.SortedSetDocValuesAccumulator$TopCountPQ

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.