// Source code of org.apache.solr.request.uninverted.MakeUnivertedFieldByIndex

package org.apache.solr.request.uninverted;


import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.PriorityQueue;
import java.util.zip.CRC32;


import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.solr.request.mdrill.MdrillUtils;
import org.apache.solr.request.uninverted.UnInvertedFieldUtils.FieldDatatype;
import org.apache.solr.request.uninverted.UnInvertedFieldUtils.MixTermInfo;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.search.BitDocSet;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.DocSetCollector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;




/**
 * 历史的旧方法，通过遍历倒排表来实现，效率不高
 * @author yannian.mu
 *
 */
public class MakeUnivertedFieldByIndex {
  public static Logger log = LoggerFactory.getLogger(MakeUnivertedFieldByIndex.class);


  private static int SKIP_STEP = 32;
  private static int SKIP_MIN = 64;
  private static int MAX_SKIP_COUNT = 102400;
  
  private static long TD_MAX_CMP_COUNT = 64000000l;
  private static int TD_LIMIT_MAX = 10240;
  private static int TD_LIMIT_MIN = 8;
  
  private int[] docs = new int[1000];
  private int[] freqs = new int[1000];
  
  private UnInvertedField uni;


  public MakeUnivertedFieldByIndex(UnInvertedField uni) throws IOException {
    this.uni=uni;
  }
  
  public void makeInit(BitDocSet baseAdvanceDocs,String field, IndexSchema schema,IndexReader reader) throws IOException
  {
    uni.init(field, reader, schema);
    uni.baseAdvanceDocs=UnInvertedField.ajustBase(48,baseAdvanceDocs, reader);


    if(this.uni.checkEmpty())
    {
      return ;
    }
    
    log.info(" makeInit  begin " + this.uni.field + " field " +",baseAdvanceDocs="+(this.uni.baseAdvanceDocs==null?"null":this.uni.baseAdvanceDocs.size())+"@"+(baseAdvanceDocs==null?"null":baseAdvanceDocs.size()));
      
    TermNumEnumerator te = uni.ti.getEnumerator(reader);


    int maxDoc = reader.maxDoc();


    this.uni.startRamDocValue(maxDoc, reader, true);
    
    int limitsize=this.getLimitSize();


    PriorityQueue<MixTermInfo> termDocslist=new PriorityQueue<MixTermInfo>(limitsize,Collections.reverseOrder(UnInvertedFieldUtils.TD_CMP));


    TermDocs tdreader=reader.termDocs(1024);
    int maxTermNum=0;
    for (;;) {
      Term t = te.term();
      if (t == null) {
        break;
      }


      int termNum = te.getTermNumber();
      
      if(termNum%100000==0)
      {
        log.info("termsInverted " +termNum+"@"+ this.uni.field + ",limitsize=" + limitsize);
      }
      
      int df=te.docFreq();
      
      if(!this.isFinish())
      {
        TermDocs td = te.getTermDocs();
        if(df<=SKIP_MIN||this.uni.baseAdvanceDocs==null)
        {
          td.seek(te);
          this.set_Doc2TermNum_NonSkip(td, docs, freqs, termNum,true,maxDoc);
        } else {


          MixTermInfo cl = new MixTermInfo(df, termNum, tdreader,  new Term(t.field(), t.text()));
          if (termDocslist.size() < limitsize) {
            termDocslist.add(cl);
          } else {
            MixTermInfo peek = termDocslist.peek();
            if (UnInvertedFieldUtils.TD_CMP.compare(peek, cl) > 0 && cl.getCount() / (peek.getCount() + 1) > 1.5) {
              termDocslist.add(cl);
              MixTermInfo cl_old = termDocslist.poll();
              this.set_Doc2TermNum_NonSkip(cl_old.getTd(), docs,freqs, cl_old.getTermNum(), true, maxDoc);
            } else {
              td.seek(te);
              this.set_Doc2TermNum_NonSkip(td, docs, freqs,termNum, true, maxDoc);
            }
          }
        }
      }
      
      maxTermNum=Math.max(maxTermNum, termNum);


      this.setTermNumValue(t, termNum);
      te.next();
    }
    
    this.PriorityQueue_skip_set(termDocslist, maxDoc,true);




    this.uni.endRamDocValue(true,maxTermNum);


    tdreader.close();
    te.close();


    this.uni.tnr = this.uni.ramDocValue.getDocReader();
  
    this.setTdIndex_NULL();
  }
  
  
  public void setTdIndex_NULL() throws IOException {
    if (this.uni.baseAdvanceDocs == null) {
      return;
    }


    log.info("setTdIndex_NULL :"+this.uni.baseAdvanceDocs.size());
    DocIterator iter = this.uni.baseAdvanceDocs.iterator();
    while (iter.hasNext()) {
      int doc = iter.nextDoc();
      this.uni.bits.add(doc);
      this.uni.markDocTm(doc, this.uni.getNullTm(),false);
    }
    
    this.uni.baseAdvanceDocs=null;
  }
  
  
  public void addDoclist(BitDocSet baseAdvanceDocs,String field, 
      IndexReader reader) throws IOException {
    
    if (uni.checkEmpty()) {
      return ;
    }


    BitDocSet tmp=null;
    if(baseAdvanceDocs!=null)
    {
      tmp=(BitDocSet) baseAdvanceDocs.andNot(this.uni.bits);


      if(tmp!=null&&tmp.size()<=0)
      {
        return ;
      }
    }
  
    this.uni.baseAdvanceDocs=UnInvertedField.ajustBase(48,tmp, reader);
    TermNumEnumerator te = uni.ti.getEnumerator(reader);


    log.info("addDoclist start " + this.uni.field +",baseAdvanceDocs="+(this.uni.baseAdvanceDocs==null?"null":this.uni.baseAdvanceDocs.size())+"@"+(baseAdvanceDocs==null?"null":baseAdvanceDocs.size()));


    int maxDoc=reader.maxDoc();
    int limitsize=this.getLimitSize();


    PriorityQueue<MixTermInfo> termDocslist=new PriorityQueue<MixTermInfo>(limitsize,Collections.reverseOrder(UnInvertedFieldUtils.TD_CMP));
    TermDocs tdreader=reader.termDocs(1024);


    for (;;) {
      Term t = te.term();
      if (t == null) {
        break;
      }


      if (this.isFinish()) {
        break;
      }


      int termNum = te.getTermNumber();


      if (termNum%10000==0) {
        log.info("termsInverted " +termNum+"@"+ this.uni.field + ",limitsize=" + limitsize);
      }
      TermDocs td = te.getTermDocs();


      int df = te.docFreq();
      if (df <= SKIP_MIN || this.uni.baseAdvanceDocs == null) {
        td.seek(te);
        this.set_Doc2TermNum_NonSkip(td, docs, freqs, termNum, false, maxDoc);
      } else {
        MixTermInfo cl = new MixTermInfo(df, termNum, tdreader,  new Term(t.field(), t.text()));
        if (termDocslist.size() < limitsize) {
          termDocslist.add(cl);
        } else {
          MixTermInfo peek = termDocslist.peek();
          if (UnInvertedFieldUtils.TD_CMP.compare(peek, cl) > 0 && cl.getCount() / (peek.getCount() + 1) > 1.5) {
            termDocslist.add(cl);
            MixTermInfo cl_old = termDocslist.poll();
            this.set_Doc2TermNum_NonSkip(cl_old.getTd(), docs,  freqs, cl_old.getTermNum(), false, maxDoc);
          } else {
            td.seek(te);
            this.set_Doc2TermNum_NonSkip(td, docs, freqs, termNum, false, maxDoc);
          }
        }
      }
      te.next();
    }
    
    this.PriorityQueue_skip_set(termDocslist, maxDoc,false);
    
    te.close();
    this.setTdIndex_NULL();
  }
  
  private boolean isFinish()
  {
    if(this.uni.baseAdvanceDocs==null)
    {
      return false;
    }
    
    return this.uni.baseAdvanceDocs.size()<=0;
  }
  
  private void cleanBase(DocSetCollector collect)
  {
    DocSet docsit=collect.getDocSet();
    DocIterator toremove = docsit.iterator();
    while (toremove.hasNext()) {
      int doc = toremove.nextDoc();
      this.uni.baseAdvanceDocs.clear(doc);
    }
    
    


  }


  private int getLimitSize()
  {
    int limitsize=TD_LIMIT_MAX;
    if(this.uni.baseAdvanceDocs!=null)
    {
      limitsize=(int) Math.min(limitsize, TD_MAX_CMP_COUNT/(1+this.uni.baseAdvanceDocs.size()));
    }
    limitsize=Math.max(limitsize, TD_LIMIT_MIN);
    return limitsize;
  }
  
  private void set_Doc2TermNum_NonSkip(TermDocs td,int[] docs,int[] freqs,int termNum,boolean isinit,int maxDoc) throws IOException
  {
    if(this.uni.baseAdvanceDocs==null)
    {
      for (;;) {
        int n = td.read(docs, freqs);
        if (n <= 0) {
          break;
        }
        for (int i = 0; i < n; i++) {
          int docid=docs[i];
          this.uni.bits.add(docid);
          this.uni.markDocTm(docid, termNum, isinit);
        }
      }
      return ;
    }
    
    DocSetCollector collect=new DocSetCollector(10240, maxDoc);
    for (;;) {
      int n = td.read(docs, freqs);
      if (n <= 0) {
        break;
      }
      for (int i = 0; i < n; i++) {
        int docid=docs[i];
        collect.collect(docid);
        this.uni.bits.add(docid);
        this.uni.markDocTm(docid, termNum, isinit);
      }
    }


    this.cleanBase(collect);


  }


  private int set_Doc2TermNum_Skip(TermDocs td,int[] docs,int[] freqs,int termNum,boolean isinit,int maxDoc) throws IOException
  {
    int skipcount=0;
      
    DocSetCollector collect=new DocSetCollector(10240, maxDoc);
    DocIterator iter = this.uni.baseAdvanceDocs.iterator();
    int doc=-1;
    int baseDoc=-1;
    
    while (iter.hasNext()) {
      doc = iter.nextDoc();
      if(doc<baseDoc)
      {
        continue;
      }
      
      
      if(doc>baseDoc)
      {
        int diff=doc-baseDoc;
        if(diff>=SKIP_STEP)
        {
          if(baseDoc>=0)
          {
            skipcount++;
          }
          baseDoc=UnInvertedFieldUtils.advance(td, doc);
          if(UnInvertedFieldUtils.NO_MORE_DOCS==baseDoc)
          {
            break;
          }
        }else{
          boolean is_no_more_docs=false;
          for(int i=0;i<=SKIP_STEP;i++)
          {
            if(td.next())
            {
              baseDoc=td.doc();
              if(baseDoc>=doc)
              {
                break;
              }
              collect.collect(baseDoc);
              this.uni.bits.add(baseDoc);
              this.uni.markDocTm(baseDoc, termNum, isinit);
              
            }else{
              is_no_more_docs=true;
              break;
            }
          }
          if(is_no_more_docs)
          {
            break;
          }
        }
      }
      
      if(baseDoc==doc)
      {
        this.uni.bits.add(doc);
        collect.collect(doc);
        this.uni.markDocTm(doc, termNum, isinit);
      }
    }
    
    
    this.cleanBase(collect);
    
    return skipcount;
  }
  
  private int PriorityQueue_skip_set(PriorityQueue<MixTermInfo> termDocslist,int maxDoc,boolean isinit) throws IOException
  {
    int skipcount=0;
    ArrayList<MixTermInfo> sorted=new ArrayList<MixTermInfo>(termDocslist.size());
    sorted.addAll(termDocslist);
    Collections.sort(sorted, UnInvertedFieldUtils.TD_CMP_TM);
    for(MixTermInfo cl:sorted)
    {
      if(this.isFinish())
      {
        break;
      }


      if(skipcount>MAX_SKIP_COUNT)
      {
        this.set_Doc2TermNum_NonSkip(cl.getTd(), docs, freqs,cl.getTermNum(),isinit,maxDoc);
      }else{
        skipcount+=this.set_Doc2TermNum_Skip(cl.getTd(), docs, freqs, cl.getTermNum(),isinit,maxDoc);
      }
    
    }
    
    return skipcount;
  }
  
  private void setTermNumValue(Term t,int termNum)
  {
    if (this.uni.fieldDataType == FieldDatatype.d_long) {
      this.uni.setTmValueLong(termNum,Long.parseLong(this.uni.ft.indexedToReadable(t.text()))) ;
    } else if (this.uni.fieldDataType == FieldDatatype.d_double) {
      this.uni.setTmValueDouble(termNum, MdrillUtils.ParseDouble(this.uni.ft.indexedToReadable(t.text())));
    } else if (this.uni.fieldDataType == FieldDatatype.d_string) {// for dist
      CRC32 crc32 = new CRC32();
      crc32.update(new String(this.uni.ft.indexedToReadable(t.text())).getBytes());
      this.uni.setTmValueLong(termNum, crc32.getValue());
    }
  }


}
// Source code of org.apache.solr.request.uninverted.MakeUnivertedFieldByIndex

// Related classes of org.apache.solr.request.uninverted.MakeUnivertedFieldByIndex