Package org.apache.lucene.index

Examples of org.apache.lucene.index.SortedSetDocValues

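SortedSetDocValues exposes, for each document, a sorted set of term ordinals that can be resolved back to their BytesRef values. All of the (partial) excerpts on this page build on the same per-document iteration pattern. A minimal sketch of that pattern, assuming the Lucene 4.x API used throughout, with reader an AtomicReader and "category" a hypothetical field name:

    // (imports: org.apache.lucene.index.*, org.apache.lucene.util.BytesRef, java.io.IOException)
    // Print every value of a multi-valued doc-values field for one document.
    static void printValues(AtomicReader reader, int docId) throws IOException {
      SortedSetDocValues values = reader.getSortedSetDocValues("category");
      if (values == null) {
        return;  // field was not indexed with doc values
      }
      values.setDocument(docId);                 // position on the document
      BytesRef scratch = new BytesRef();
      long ord;
      while ((ord = values.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
        values.lookupOrd(ord, scratch);          // resolve ordinal -> term bytes
        System.out.println(scratch.utf8ToString());
      }
    }

The first excerpt is a FacetsAggregator that counts per segment; when the top-level doc values are a MultiSortedSetDocValues, segment ordinals are remapped to global ordinals through its OrdinalMap: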

    return new FacetsAggregator() {

      @Override
      public void aggregate(MatchingDocs matchingDocs, CategoryListParams clp, FacetArrays facetArrays) throws IOException {

        SortedSetDocValues segValues = matchingDocs.context.reader().getSortedSetDocValues(field);
        if (segValues == null) {
          return;
        }

        final int[] counts = facetArrays.getIntArray();
        final int maxDoc = matchingDocs.context.reader().maxDoc();
        assert maxDoc == matchingDocs.bits.length();

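        // dv is the top-level SortedSetDocValues held by the enclosing class;
        // for a multi-segment index it is a MultiSortedSetDocValues carrying
        // the segment-to-global OrdinalMap: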
        if (dv instanceof MultiSortedSetDocValues) {
          MultiDocValues.OrdinalMap ordinalMap = ((MultiSortedSetDocValues) dv).mapping;
          int segOrd = matchingDocs.context.ord;

          int numSegOrds = (int) segValues.getValueCount();

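          // Heuristic: with few hits relative to the number of segment ords,
          // remap each ord to global space on the fly; otherwise count in
          // segment-ord space first and remap once per distinct ord below.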
          if (matchingDocs.totalHits < numSegOrds/10) {
            // Remap every ord to global ord as we iterate:
            int doc = 0;
            while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) {
              segValues.setDocument(doc);
              int term = (int) segValues.nextOrd();
              while (term != SortedSetDocValues.NO_MORE_ORDS) {
                counts[(int) ordinalMap.getGlobalOrd(segOrd, term)]++;
                term = (int) segValues.nextOrd();
              }
              ++doc;
            }
          } else {

            // First count in seg-ord space:
            final int[] segCounts = new int[numSegOrds];
            int doc = 0;
            while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) {
              segValues.setDocument(doc);
              int term = (int) segValues.nextOrd();
              while (term != SortedSetDocValues.NO_MORE_ORDS) {
                segCounts[term]++;
                term = (int) segValues.nextOrd();
              }
              ++doc;
            }

            // Then, migrate to global ords:
            for (int ord = 0; ord < numSegOrds; ord++) {
              int count = segCounts[ord];
              if (count != 0) {
                counts[(int) ordinalMap.getGlobalOrd(segOrd, ord)] += count;
              }
            }
          }
        } else {
          // No ord mapping (e.g., single segment index):
          // just aggregate directly into counts:

          int doc = 0;
          while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) {
            segValues.setDocument(doc);
            int term = (int) segValues.nextOrd();
            while (term != SortedSetDocValues.NO_MORE_ORDS) {
              counts[term]++;
              term = (int) segValues.nextOrd();
            }
            ++doc;
          }
        }
      }
      // ...


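Building a reader state for sorted-set faceting: the reader is narrowed to a single AtomicReader, the field's doc values are validated, and every ordinal's value is split into a dim/value pair: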
    if (reader instanceof AtomicReader) {
      topReader = (AtomicReader) reader;
    } else {
      topReader = new SlowCompositeReaderWrapper((CompositeReader) reader);
    }
    SortedSetDocValues dv = topReader.getSortedSetDocValues(field);
    if (dv == null) {
      throw new IllegalArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues");
    }
    if (dv.getValueCount() > Integer.MAX_VALUE) {
      throw new IllegalArgumentException("can only handle valueCount < Integer.MAX_VALUE; got " + dv.getValueCount());
    }
    valueCount = (int) dv.getValueCount();

    // TODO: we can make this more efficient if eg we can be
    // "involved" when OrdinalMap is being created?  Ie see
    // each term/ord it's assigning as it goes...
    String lastDim = null;
    int startOrd = -1;
    BytesRef spare = new BytesRef();

    // TODO: this approach can work for full hierarchy?;
    // TaxoReader can't do this since ords are not in
    // "sorted order" ... but we should generalize this to
    // support arbitrary hierarchy:
    for (int ord = 0; ord < valueCount; ord++) {
      dv.lookupOrd(ord, spare);
      String[] components = spare.utf8ToString().split(separatorRegex, 2);
      if (components.length != 2) {
        throw new IllegalArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + spare.utf8ToString());
      }
      if (!components[0].equals(lastDim)) {
      // ...

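On the codec side, SortedSetDocValues can be served from an FST over the terms, with per-document ordinals decoded from a byte stream: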
    final Arc<Long> scratchArc = new Arc<Long>();
    final IntsRef scratchInts = new IntsRef();
    final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
    final BytesRef ref = new BytesRef();
    final ByteArrayDataInput input = new ByteArrayDataInput();
    return new SortedSetDocValues() {
      long currentOrd;

      @Override
      public long nextOrd() {
        if (input.eof()) {
          // ...

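DocTermOrdsRangeFilter translates BytesRef range bounds into inclusive ordinal bounds once per segment, so each document is then tested with cheap long comparisons: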
  /**
   * Creates a range filter over a multi-valued field's BytesRef values, backed
   * by FieldCache.DEFAULT.getDocTermOrds.
   */
  public static DocTermOrdsRangeFilter newBytesRefRange(String field, BytesRef lowerVal, BytesRef upperVal, boolean includeLower, boolean includeUpper) {
    return new DocTermOrdsRangeFilter(field, lowerVal, upperVal, includeLower, includeUpper) {
      @Override
      public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
        final SortedSetDocValues docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), field);
        final long lowerPoint = lowerVal == null ? -1 : docTermOrds.lookupTerm(lowerVal);
        final long upperPoint = upperVal == null ? -1 : docTermOrds.lookupTerm(upperVal);

        final long inclusiveLowerPoint, inclusiveUpperPoint;

        // Hints:
        // * lowerPoint/upperPoint is -1 if the corresponding value was null.
        // * lookupTerm returns a value < 0 if no exact hit was found; the
        //   returned value is (-(insertion point) - 1)
        if (lowerPoint == -1 && lowerVal == null) {
          inclusiveLowerPoint = 0;
        } else if (includeLower && lowerPoint >= 0) {
          inclusiveLowerPoint = lowerPoint;
        } else if (lowerPoint >= 0) {
          inclusiveLowerPoint = lowerPoint + 1;
        } else {
          inclusiveLowerPoint = Math.max(0, -lowerPoint - 1);
        }
       
        if (upperPoint == -1 && upperVal == null) {
          inclusiveUpperPoint = Long.MAX_VALUE; 
        } else if (includeUpper && upperPoint >= 0) {
          inclusiveUpperPoint = upperPoint;
        } else if (upperPoint >= 0) {
          inclusiveUpperPoint = upperPoint - 1;
        } else {
          inclusiveUpperPoint = -upperPoint - 2;
        }     

        if (inclusiveUpperPoint < 0 || inclusiveLowerPoint > inclusiveUpperPoint) {
          return null;
        }
       
        assert inclusiveLowerPoint >= 0 && inclusiveUpperPoint >= 0;
       
        return new FieldCacheDocIdSet(context.reader().maxDoc(), acceptDocs) {
          @Override
          protected final boolean matchDoc(int doc) {
            docTermOrds.setDocument(doc);
            long ord;
            while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
              if (ord > inclusiveUpperPoint) {
                return false;
              } else if (ord >= inclusiveLowerPoint) {
                return true;
              }
              // ...
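At search time the filter plugs in like any other Filter. A minimal usage sketch, assuming a 4.x IndexSearcher named searcher and a hypothetical multi-valued field "tag":

    // Match documents whose "tag" field holds at least one value in ["alpha", "omega"].
    Filter filter = DocTermOrdsRangeFilter.newBytesRefRange(
        "tag", new BytesRef("alpha"), new BytesRef("omega"), true, true);
    TopDocs hits = searcher.search(new MatchAllDocsQuery(), filter, 10);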

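FieldCache.getDocTermOrds returns the field's own SortedSetDocValues when present, and otherwise falls back to the field's single-valued SortedDocValues: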
  // TODO: if DocTermsIndex was already created, we
  // should share it...
  public SortedSetDocValues getDocTermOrds(AtomicReader reader, String field) throws IOException {
    SortedSetDocValues dv = reader.getSortedSetDocValues(field);
    if (dv != null) {
      return dv;
    }
   
    SortedDocValues sdv = reader.getSortedDocValues(field);
    // ...

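DocTermOrdsRewriteMethod collects the ordinals of all terms matching a MultiTermQuery into a LongBitSet, then tests each document's ordinals against that set: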
    /**
     * Returns a DocIdSet with documents that should be permitted in search
     * results.
     */
    @Override
    public DocIdSet getDocIdSet(AtomicReaderContext context, final Bits acceptDocs) throws IOException {
      final SortedSetDocValues docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), query.field);
      // Cannot use FixedBitSet because we require long index (ord):
      final LongBitSet termSet = new LongBitSet(docTermOrds.getValueCount());
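      // Expose the doc-values term dictionary through the Terms interface so
      // the query can enumerate matching terms; corpus statistics are not
      // available here, hence the -1 returns below.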
      TermsEnum termsEnum = query.getTermsEnum(new Terms() {
       
        @Override
        public Comparator<BytesRef> getComparator() {
          return BytesRef.getUTF8SortedAsUnicodeComparator();
        }
       
        @Override
        public TermsEnum iterator(TermsEnum reuse) {
          return docTermOrds.termsEnum();
        }

        @Override
        public long getSumTotalTermFreq() {
          return -1;
        }

        @Override
        public long getSumDocFreq() {
          return -1;
        }

        @Override
        public int getDocCount() {
          return -1;
        }

        @Override
        public long size() {
          return -1;
        }

        @Override
        public boolean hasFreqs() {
          return false;
        }

        @Override
        public boolean hasOffsets() {
          return false;
        }

        @Override
        public boolean hasPositions() {
          return false;
        }
       
        @Override
        public boolean hasPayloads() {
          return false;
        }
      });
     
      assert termsEnum != null;
      if (termsEnum.next() != null) {
        // fill into a bitset
        do {
          termSet.set(termsEnum.ord());
        } while (termsEnum.next() != null);
      } else {
        return null;
      }
     
      return new FieldCacheDocIdSet(context.reader().maxDoc(), acceptDocs) {
        @Override
        protected final boolean matchDoc(int doc) throws ArrayIndexOutOfBoundsException {
          docTermOrds.setDocument(doc);
          long ord;
          // TODO: we could track max bit set and early terminate (since they come in sorted order)
          while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
            if (termSet.get(ord)) {
              return true;
            }
          }
          return false;
          // ...


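Counting facets over matching documents (hits is one segment's MatchingDocs), using the same two counting strategies as the aggregator above: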
      // Make sure this reader matches the one used to build the reader state,
      // else a cryptic AIOOBE can happen:
      if (ReaderUtil.getTopLevelContext(hits.context).reader() != origReader) {
        throw new IllegalStateException("the SortedSetDocValuesReaderState provided to this class does not match the reader being searched; you must create a new SortedSetDocValuesReaderState every time you open a new IndexReader");
      }
     
      SortedSetDocValues segValues = reader.getSortedSetDocValues(field);
      if (segValues == null) {
        continue;
      }

      DocIdSetIterator docs = hits.bits.iterator();

      // TODO: yet another option is to count all segs
      // first, only in seg-ord space, and then do a
      // merge-sort-PQ in the end to only "resolve to
      // global" those seg ords that can compete, if we know
      // we just want top K?  ie, this is the same algo
      // that'd be used for merging facets across shards
      // (distributed faceting).  but this has much higher
      // temp ram req'ts (sum of number of ords across all
      // segs)
      if (ordinalMap != null) {
        int segOrd = hits.context.ord;

        int numSegOrds = (int) segValues.getValueCount();

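        // Same heuristic as the aggregator above: few hits => remap per hit;
        // many hits => count per segment first, then migrate once per ord.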
        if (hits.totalHits < numSegOrds/10) {
          //System.out.println("    remap as-we-go");
          // Remap every ord to global ord as we iterate:
          int doc;
          while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            //System.out.println("    doc=" + doc);
            segValues.setDocument(doc);
            int term = (int) segValues.nextOrd();
            while (term != SortedSetDocValues.NO_MORE_ORDS) {
              //System.out.println("      segOrd=" + segOrd + " ord=" + term + " globalOrd=" + ordinalMap.getGlobalOrd(segOrd, term));
              counts[(int) ordinalMap.getGlobalOrd(segOrd, term)]++;
              term = (int) segValues.nextOrd();
            }
          }
        } else {
          //System.out.println("    count in seg ord first");

          // First count in seg-ord space:
          final int[] segCounts = new int[numSegOrds];
          int doc;
          while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            //System.out.println("    doc=" + doc);
            segValues.setDocument(doc);
            int term = (int) segValues.nextOrd();
            while (term != SortedSetDocValues.NO_MORE_ORDS) {
              //System.out.println("      ord=" + term);
              segCounts[term]++;
              term = (int) segValues.nextOrd();
            }
          }

          // Then, migrate to global ords:
          for (int ord = 0; ord < numSegOrds; ord++) {
            int count = segCounts[ord];
            if (count != 0) {
              //System.out.println("    migrate segOrd=" + segOrd + " ord=" + ord + " globalOrd=" + ordinalMap.getGlobalOrd(segOrd, ord));
              counts[(int) ordinalMap.getGlobalOrd(segOrd, ord)] += count;
            }
          }
        }
      } else {
        // No ord mapping (e.g., single segment index):
        // just aggregate directly into counts:
        int doc;
        while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          segValues.setDocument(doc);
          int term = (int) segValues.nextOrd();
          while (term != SortedSetDocValues.NO_MORE_ORDS) {
            counts[term]++;
            term = (int) segValues.nextOrd();
          }
        }
      }
    }
  }
  // ...

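A variant of the reader-state setup above that wraps the reader with SlowCompositeReaderWrapper.wrap and parses each ordinal's value with FacetsConfig.stringToPath: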
    this.origReader = reader;

    // We need this to create thread-safe MultiSortedSetDV
    // per collector:
    topReader = SlowCompositeReaderWrapper.wrap(reader);
    SortedSetDocValues dv = topReader.getSortedSetDocValues(field);
    if (dv == null) {
      throw new IllegalArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues");
    }
    if (dv.getValueCount() > Integer.MAX_VALUE) {
      throw new IllegalArgumentException("can only handle valueCount < Integer.MAX_VALUE; got " + dv.getValueCount());
    }
    valueCount = (int) dv.getValueCount();

    // TODO: we can make this more efficient if eg we can be
    // "involved" when OrdinalMap is being created?  Ie see
    // each term/ord it's assigning as it goes...
    String lastDim = null;
    int startOrd = -1;
    BytesRef spare = new BytesRef();

    // TODO: this approach can work for full hierarchy?;
    // TaxoReader can't do this since ords are not in
    // "sorted order" ... but we should generalize this to
    // support arbitrary hierarchy:
    for (int ord = 0; ord < valueCount; ord++) {
      dv.lookupOrd(ord, spare);
      String[] components = FacetsConfig.stringToPath(spare.utf8ToString());
      if (components.length != 2) {
        throw new IllegalArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.toString(components) + " " + spare.utf8ToString());
      }
      if (!components[0].equals(lastDim)) {
        // ...

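SortedSetSortField builds its comparator on getDocTermOrds, rejecting fields with too many unique terms and unwrapping a single-valued view when possible: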
  @Override
  public FieldComparator<?> getComparator(int numHits, int sortPos) throws IOException {
    return new FieldComparator.TermOrdValComparator(numHits, getField(), missingValue == STRING_LAST) {
      @Override
      protected SortedDocValues getSortedDocValues(AtomicReaderContext context, String field) throws IOException {
        SortedSetDocValues sortedSet = FieldCache.DEFAULT.getDocTermOrds(context.reader(), field);
       
        if (sortedSet.getValueCount() >= Integer.MAX_VALUE) {
          throw new UnsupportedOperationException("fields containing more than " + (Integer.MAX_VALUE-1) + " unique terms are unsupported");
        }
       
        SortedDocValues singleton = DocValues.unwrapSingleton(sortedSet);
        if (singleton != null) {
          // ...
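The sort field is then used like any other SortField. A hypothetical sketch, assuming an IndexSearcher named searcher and a multi-valued field "tags":

    // Sort by the minimum value of the multi-valued "tags" field (the default
    // selector), ascending.
    Sort sort = new Sort(new SortedSetSortField("tags", false));
    TopDocs hits = searcher.search(new MatchAllDocsQuery(), 10, sort);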
