Package org.apache.lucene.facet.sortedset

Source Code of org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState$OrdRange

package org.apache.lucene.facet.sortedset;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.lucene.facet.params.CategoryListParams;
import org.apache.lucene.facet.params.FacetIndexingParams;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.CompositeReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.util.BytesRef;

/** Wraps an {@link IndexReader} and resolves ords
*  using existing {@link SortedSetDocValues} APIs without a
*  separate taxonomy index.  This only supports flat facets
*  (dimension + label), and it makes faceting a bit
*  slower, adds some cost at reopen time, but avoids
*  managing the separate taxonomy index.  It also requires
*  less RAM than the taxonomy index, as it manages the flat
*  (2-level) hierarchy more efficiently.  In addition, the
*  tie-break during faceting is now meaningful (in label
*  sorted order).
*
*  <p><b>NOTE</b>: creating an instance of this class is
*  somewhat costly, as it computes per-segment ordinal maps,
*  so you should create it once and re-use that one instance
*  for a given {@link IndexReader}. */

public final class SortedSetDocValuesReaderState {

  private final String field;
  private final AtomicReader topReader;
  private final int valueCount;
  final IndexReader origReader;
  final char separator;
  final String separatorRegex;

  /** Extension added to {@link CategoryListParams#field}
   *  to determin which field to read/write facet ordinals from/to. */
  public static final String FACET_FIELD_EXTENSION = "_sorted_doc_values";

  /** Holds start/end range of ords, which maps to one
   *  dimension (someday we may generalize it to map to
   *  hierarchies within one dimension). */
  static final class OrdRange {
    /** Start of range, inclusive: */
    public final int start;
    /** End of range, inclusive: */
    public final int end;

    /** Start and end are inclusive. */
    public OrdRange(int start, int end) {
      this.start = start;
      this.end = end;
    }
  }

  private final Map<String,OrdRange> prefixToOrdRange = new HashMap<String,OrdRange>();

  /** Create an instance, scanning the {@link
   *  SortedSetDocValues} from the provided reader, with
   *  default {@link FacetIndexingParams}. */
  public SortedSetDocValuesReaderState(IndexReader reader) throws IOException {
    this(FacetIndexingParams.DEFAULT, reader);
  }

  /** Create an instance, scanning the {@link
   *  SortedSetDocValues} from the provided reader and
   *  {@link FacetIndexingParams}. */
  public SortedSetDocValuesReaderState(FacetIndexingParams fip, IndexReader reader) throws IOException {

    this.field = fip.getCategoryListParams(null).field + FACET_FIELD_EXTENSION;
    this.separator = fip.getFacetDelimChar();
    this.separatorRegex = Pattern.quote(Character.toString(separator));
    this.origReader = reader;

    // We need this to create thread-safe MultiSortedSetDV
    // per collector:
    topReader = SlowCompositeReaderWrapper.wrap(reader);
    SortedSetDocValues dv = topReader.getSortedSetDocValues(field);
    if (dv == null) {
      throw new IllegalArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues");
    }
    if (dv.getValueCount() > Integer.MAX_VALUE) {
      throw new IllegalArgumentException("can only handle valueCount < Integer.MAX_VALUE; got " + dv.getValueCount());
    }
    valueCount = (int) dv.getValueCount();

    // TODO: we can make this more efficient if eg we can be
    // "involved" when OrdinalMap is being created?  Ie see
    // each term/ord it's assigning as it goes...
    String lastDim = null;
    int startOrd = -1;
    BytesRef spare = new BytesRef();

    // TODO: this approach can work for full hierarchy?;
    // TaxoReader can't do this since ords are not in
    // "sorted order" ... but we should generalize this to
    // support arbitrary hierarchy:
    for(int ord=0;ord<valueCount;ord++) {
      dv.lookupOrd(ord, spare);
      String[] components = spare.utf8ToString().split(separatorRegex, 2);
      if (components.length != 2) {
        throw new IllegalArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + spare.utf8ToString());
      }
      if (!components[0].equals(lastDim)) {
        if (lastDim != null) {
          prefixToOrdRange.put(lastDim, new OrdRange(startOrd, ord-1));
        }
        startOrd = ord;
        lastDim = components[0];
      }
    }

    if (lastDim != null) {
      prefixToOrdRange.put(lastDim, new OrdRange(startOrd, valueCount-1));
    }
  }

  SortedSetDocValues getDocValues() throws IOException {
    return topReader.getSortedSetDocValues(field);
  }

  OrdRange getOrdRange(String dim) {
    return prefixToOrdRange.get(dim);
  }

  String getField() {
    return field;
  }

  int getSize() {
    return valueCount;
  }
}
TOP

Related Classes of org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState$OrdRange

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.