Package org.terrier.indexing

Examples of org.terrier.indexing.Document


      }
      assert documentCollection != null;
      while (documentCollection.nextDocument())
      {
        // Get Document
        Document tempDoc = documentCollection.getDocument();
        if (tempDoc != null)
        {
          // Retrieve Document's Unique ID
          if (tempDoc.getProperty("docno") == null)
          {
            throw new IOException("Collection returned null as docno");
          }
          else
          {
            DocID.set(tempDoc.getProperty("docno"));
          }
          document.setObject(tempDoc);
          currentDocument++;
          return true;
        }       
View Full Code Here


    throws IOException
  {
    final String docno = key.toString();
    currentReporter = reporter;
    reporter.setStatus("Currently indexing "+docno);
    final Document doc = value.getObject();
   
    if (start) {
      splitnum = value.getSplitIndex();
      System.out.println(splitnum);
      //RunData.writeInt(splitnum);
      start = false;
    }
   
    this.outputPostingListCollector = _outputPostingListCollector;
   
    /* setup for parsing */
    createDocumentPostings();
    String term;//term we're currently processing
    numOfTokensInDocument = 0;
    //numberOfDocuments++;
    //get each term in the document
    while (!doc.endOfDocument()) {
      reporter.progress();
      if ((term = doc.getNextTerm())!=null && !term.equals("")) {
        termFields = doc.getFields();
        /* pass term into TermPipeline (stop, stem etc) */
        pipeline_first.processTerm(term);

        /* the term pipeline will eventually add the term to this object. */
      }
      if (MAX_TOKENS_IN_DOCUMENT > 0 &&
          numOfTokensInDocument > MAX_TOKENS_IN_DOCUMENT)
        break;
    }
   
    //if we didn't index all tokens from document,
    //we need to get to the end of the document.
    while (!doc.endOfDocument()){
      doc.getNextTerm();
    }
    /* we now have all terms in the DocumentTree, so we save the document tree */
    if (termsInDocument.getDocumentLength() == 0)
    {  /* this document is empty, add the minimum to the document index */
      // Nothing in the ifile
      indexEmpty(doc.getAllProperties());
    }
    else
    /* index this document */
      try{
        indexDocument(doc.getAllProperties(), termsInDocument);
        numberOfTokens += numOfTokensInDocument;
        reporter.incrCounter(Counters.INDEXED_TOKENS, numOfTokensInDocument);
        reporter.incrCounter(Counters.INDEXED_POINTERS, termsInDocument.getNumberOfPointers());
      } catch (IOException ioe) {
        throw ioe;       
View Full Code Here

TOP

Related Classes of org.terrier.indexing.Document

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.