  // TODO: really TermsHashPerField should take over most
  // of this loop, including merge sort of terms from
  // multiple threads and interacting with the
  // TermsConsumer, only calling out to us (passing us the
  // DocsConsumer) to handle delivery of docs/positions
  final PostingsConsumer postingsConsumer = termsConsumer.startTerm(text);
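
  // segDeletes (if present) maps each term deleted while this segment
  // was buffered to the docID upto which that delete applies; any doc
  // for this term with an ID below that limit must be marked deleted
  // when we flush it below.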
  final int delDocLimit;
  if (segDeletes != null) {
    protoTerm.bytes = text;
    final Integer docIDUpto = segDeletes.get(protoTerm);
    if (docIDUpto != null) {
      delDocLimit = docIDUpto;
    } else {
      delDocLimit = 0;
    }
  } else {
    delDocLimit = 0;
  }
  // Replay the buffered postings for this term: decode the docID
  // stream and hand each doc (and its positions) to the consumer.
  int docFreq = 0;
  long totalTermFreq = 0;
  int docID = 0;
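
  // The postings for this term live in two in-memory byte streams:
  // "freq" (docID deltas, plus term freqs when tracked) and "prox"
  // (position deltas, payloads and offsets). Both are read back
  // sequentially here and re-encoded through the codec's consumers.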
  while (true) {
    //System.out.println(" cycle");
    final int termFreq;
    if (freq.eof()) {
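      // The writer holds each term's most recent doc back in
      // postings.lastDocIDs/termFreqs (its pending delta code sits in
      // lastDocCodes), so the freq stream never contains the final
      // doc; pull it from those arrays and mark it consumed with -1.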
      if (postings.lastDocCodes[termID] != -1) {
        // Return last doc
        docID = postings.lastDocIDs[termID];
        if (readTermFreq) {
          termFreq = postings.termFreqs[termID];
        } else {
          termFreq = -1;
        }
        postings.lastDocCodes[termID] = -1;
      } else {
        // EOF
        break;
      }
    } else {
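      // Doc deltas share their low bit with the freq when freqs are
      // tracked: delta<<1 with bit 0 set means freq == 1, otherwise
      // the freq follows as a separate vInt. E.g. delta 3 with freq 1
      // is written as the single vInt 7; delta 3 with freq 5 is the
      // vInt 6 followed by the vInt 5.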
      final int code = freq.readVInt();
      if (!readTermFreq) {
        docID += code;
        termFreq = -1;
      } else {
        docID += code >>> 1;
        if ((code & 1) != 0) {
          termFreq = 1;
        } else {
          termFreq = freq.readVInt();
        }
      }
      assert docID != postings.lastDocIDs[termID];
    }
    docFreq++;
    assert docID < state.segmentInfo.getDocCount() : "doc=" + docID + " maxDoc=" + state.segmentInfo.getDocCount();
    // NOTE: we could check here if the docID was
    // deleted, and skip it. However, this is somewhat
    // dangerous because it can yield non-deterministic
    // behavior since we may see the docID before we see
    // the term that caused it to be deleted. This
    // would mean some (but not all) of its postings may
    // make it into the index, which'd alter the docFreq
    // for those terms. We could fix this by doing two
    // passes, ie first sweep marks all del docs, and
    // 2nd sweep does the real flush, but I suspect
    // that'd add too much time to flush.
    visitedDocs.set(docID);
    postingsConsumer.startDoc(docID, writeTermFreq ? termFreq : -1);
    if (docID < delDocLimit) {
      // Mark it deleted. TODO: we could also skip
      // writing its postings; this would be
      // deterministic (just for this Term's docs).
      // TODO: can we do this reach-around in a cleaner way????
      if (state.liveDocs == null) {
        state.liveDocs = docState.docWriter.codec.liveDocsFormat().newLiveDocs(state.segmentInfo.getDocCount());
      }
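      // Only count the delete if the doc is still live, so a doc hit
      // by several deleted terms is counted once in delCountOnFlush.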
      if (state.liveDocs.get(docID)) {
        state.delCountOnFlush++;
        state.liveDocs.clear(docID);
      }
    }
    totalTermFreq += termFreq;
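    // NOTE: when freqs aren't tracked, termFreq is -1 here, so this
    // sum is meaningless; finishTerm below reports -1 in that case.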
    // Carefully copy over the prox + payload info,
    // changing the format to match Lucene's segment
    // format.
    if (readPositions || readOffsets) {
      // we did record positions (& maybe payload) and/or offsets
      int position = 0;
      int offset = 0;
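      // Positions and offsets are delta-encoded within each doc, so
      // both accumulators restart from 0 for every document.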
      for (int j = 0; j < termFreq; j++) {
        final BytesRef thisPayload;
        if (readPositions) {
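          // Position deltas use the same low-bit trick as doc deltas:
          // bit 0 of the vInt flags an inline payload (length, then
          // bytes) immediately following the position.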
          final int code = prox.readVInt();
          position += code >>> 1;
          if ((code & 1) != 0) {
            // This position has a payload
            final int payloadLength = prox.readVInt();
            if (payload == null) {
              payload = new BytesRef();
              payload.bytes = new byte[payloadLength];
            } else if (payload.bytes.length < payloadLength) {
              payload.grow(payloadLength);
            }
            prox.readBytes(payload.bytes, 0, payloadLength);
            payload.length = payloadLength;
            thisPayload = payload;
          } else {
            thisPayload = null;
          }
          if (readOffsets) {
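            // Offsets are also deltas: startOffset from the previous
            // position's startOffset (tracked in "offset" and updated
            // below), endOffset from this startOffset.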
            final int startOffset = offset + prox.readVInt();
            final int endOffset = startOffset + prox.readVInt();
            if (writePositions) {
              if (writeOffsets) {
                assert startOffset >= 0 && endOffset >= startOffset : "startOffset=" + startOffset + ",endOffset=" + endOffset + ",offset=" + offset;
                postingsConsumer.addPosition(position, thisPayload, startOffset, endOffset);
              } else {
                postingsConsumer.addPosition(position, thisPayload, -1, -1);
              }
            }
            offset = startOffset;
          } else if (writePositions) {
            postingsConsumer.addPosition(position, thisPayload, -1, -1);
          }
        }
      }
    }
    postingsConsumer.finishDoc();
  }
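
  // Per-term stats go to the consumer; the field-level sums below
  // feed this field's aggregate statistics.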
  termsConsumer.finishTerm(text, new TermStats(docFreq, writeTermFreq ? totalTermFreq : -1));
  sumTotalTermFreq += totalTermFreq;
  sumDocFreq += docFreq;
}