Package org.terrier.structures

Examples of org.terrier.structures.DirectInvertedOutputStream


      ? new LexiconBuilder(currentIndex, "lexicon", new FieldLexiconMap(FieldScore.FIELDS_COUNT), FieldLexiconEntry.class.getName())
      : new LexiconBuilder(currentIndex, "lexicon", new LexiconMap(), BasicLexiconEntry.class.getName());
    // Open the direct-index output stream for the current index: the field-aware
    // variant when FieldScore.FIELDS_COUNT > 0, the plain variant otherwise.
    // Both target <path><FILE_SEPARATOR><prefix>.direct<BitIn.USUAL_EXTENSION>.
    try{
      directIndexBuilder = FieldScore.FIELDS_COUNT > 0
        ? new FieldDirectInvertedOutputStream(currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR + currentIndex.getPrefix() + "." + "direct" + BitIn.USUAL_EXTENSION)
        : new DirectInvertedOutputStream(currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR + currentIndex.getPrefix() + "." + "direct" + BitIn.USUAL_EXTENSION);
    } catch (IOException ioe) {
      // NOTE(review): the IOException is silently swallowed (the logging call below
      // is commented out). If the stream cannot be opened, directIndexBuilder is
      // never assigned here and execution simply continues - confirm this is intended.
  //    logger.error("Cannot make DirectInvertedOutputStream:", ioe);
    }
      //  new DirectIndexBuilder(currentIndex, "direct");
    docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document");
View Full Code Here


      // Counter used to hand out term ids to terms that appear only in the second
      // index; it starts at the number of unique terms reported by the first index.
      int newCodes = (int)srcIndex1.getCollectionStatistics().getNumberOfUniqueTerms();
     
      // Inverted indices of the two source indices; postings are read from these.
      PostingIndex inverted1 = srcIndex1.getInvertedIndex();
      PostingIndex inverted2 = srcIndex2.getInvertedIndex();
     
      // Reflectively instantiate the output stream for the merged inverted file.
      // The field-aware class is chosen when fieldCount > 0; either class must
      // expose a constructor taking the target filename as a single String.
      DirectInvertedOutputStream invOS =null;
      try{
        invOS = (fieldCount > 0 ? fieldInvertedFileOutputStreamClass : invertedFileOutputStreamClass)
          .getConstructor(String.class)
          .newInstance(destIndex.getPath() + ApplicationSetup.FILE_SEPARATOR + 
            destIndex.getPrefix() + ".inverted"+ BitIn.USUAL_EXTENSION);
       
      } catch (Exception e) {
        // Any reflection or I/O failure aborts the merge: log and leave the method.
        logger.error("Couldn't create specified DirectInvertedOutputStream", e);
        return;
      }


      // Two-way merge of the lexicon streams of both source indices. Each stream is
      // advanced one entry at a time and the current terms are compared with
      // String.compareTo; the smaller term (or both, when equal) is written out.
      // NOTE(review): this only merges correctly if both lexicon streams yield terms
      // in the same ascending String order - confirm against the lexicon format.
      boolean hasMore1 = false;
      boolean hasMore2 = false;
      String term1;
      String term2;
      Map.Entry<String,LexiconEntry> lee1 = null;
      Map.Entry<String,LexiconEntry> lee2 = null;
      // Prime both cursors with their first entry, if any.
      hasMore1 = lexInStream1.hasNext();
      if (hasMore1)
        lee1 = lexInStream1.next();
      hasMore2 = lexInStream2.hasNext();
      if (hasMore2)
        lee2 = lexInStream2.next();
      // Runs while BOTH streams still have an entry; leftovers are handled afterwards.
      while (hasMore1 && hasMore2) {
   
        term1 = lee1.getKey();
        term2 = lee2.getKey();
       
        int lexicographicalCompare = term1.compareTo(term2);
        if (lexicographicalCompare < 0) {
          //write to inverted file postings for the term that only occurs in 1st index
          BitIndexPointer newPointer = invOS.writePostings(inverted1.getPostings(lee1.getValue()));
          // the entry keeps its term id; only its pointer is redirected to the new file
          lee1.getValue().setPointer(newPointer);
          numberOfPointers+=newPointer.getNumberOfEntries();
          lexOutStream.writeNextEntry(term1, lee1.getValue());
          hasMore1 = lexInStream1.hasNext();
          if (hasMore1)
            lee1 = lexInStream1.next();
       
        } else if (lexicographicalCompare > 0) {
          //write to inverted file postings for the term that only occurs in 2nd index
          //docids are transformed as we go.
          // NOTE(review): the second argument looks like the "previous docid" baseline
          // used for gap encoding, so -(numberOfDocs1+1) would shift every docid of the
          // 2nd index by numberOfDocs1 - confirm against writePostings.
          BitIndexPointer newPointer =
            invOS.writePostings(inverted2.getPostings(lee2.getValue()), -(numberOfDocs1+1));
          lee2.getValue().setPointer(newPointer);
          numberOfPointers+=newPointer.getNumberOfEntries();
         
          // the term is new to the merged lexicon: give it the next unused term id
          int newCode = newCodes++;
          // remember old 2nd-index term id -> new term id when a mapping is requested
          if (keepTermCodeMap)
            termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
          lee2.getValue().setTermId(newCode);
          lexOutStream.writeNextEntry(term2, lee2.getValue());
          hasMore2 = lexInStream2.hasNext();
          if (hasMore2)
            lee2 = lexInStream2.next();
        } else {
          //write to postings for a term that occurs in both indices
         
          //1. postings from the first index are unchanged
          IterablePosting ip1 = inverted1.getPostings(lee1.getValue());
          BitIndexPointer newPointer1 = invOS.writePostings(ip1);
         
          //2. postings from the 2nd index have their docids transformed
          // NOTE(review): ip1 was handed to writePostings above, so ip1.getId() is
          // presumably the last docid written for this term; the 2nd list would then
          // continue the same gap sequence - confirm.
          IterablePosting ip2 = inverted2.getPostings(lee2.getValue());
          BitIndexPointer newPointer2 = invOS.writePostings(ip2, ip1.getId() - numberOfDocs1);
         
          numberOfPointers+= newPointer1.getNumberOfEntries() + newPointer2.getNumberOfEntries();
           
          //don't set numberOfEntries, as LexiconEntry.add() will take care of this.
          // only newPointer1 is stored: the merged entry points at the start of the
          // first list, which the second list was written directly after
          lee1.getValue().setPointer(newPointer1);
          // the shared term keeps the 1st index's term id; map the 2nd index's id onto it
          if (keepTermCodeMap)
            termcodeHashmap.put(lee2.getValue().getTermId(), lee1.getValue().getTermId());
         
          lee1.getValue().add(lee2.getValue());
          lexOutStream.writeNextEntry(term1, lee1.getValue());
         
          // both cursors advance, since both current entries were consumed
          hasMore1 = lexInStream1.hasNext();
          if (hasMore1)
            lee1 = lexInStream1.next();
         
          hasMore2 = lexInStream2.hasNext();
          if (hasMore2)
            lee2 = lexInStream2.next();
        }
      }
     
      // Drain whichever lexicon stream still has entries. At most one of the two
      // branches runs, because they are joined by else-if.
      if (hasMore1) {
        lee2 = null;
        // Remaining terms occur only in the 1st index: postings are copied with
        // the single-argument writePostings and the entry keeps its term id.
        while (hasMore1) {
          //write to inverted file as well.
          BitIndexPointer newPointer = invOS.writePostings(
              inverted1.getPostings(lee1.getValue()));
          lee1.getValue().setPointer(newPointer);
          numberOfPointers+=newPointer.getNumberOfEntries();
          lexOutStream.writeNextEntry(lee1.getKey(), lee1.getValue());
          hasMore1 = lexInStream1.hasNext();
          if (hasMore1)
            lee1 = lexInStream1.next();
        }
      } else if (hasMore2) {
        lee1 = null;
        // Remaining terms occur only in the 2nd index: each gets a fresh term id
        // from newCodes and its postings are written with the -(numberOfDocs1+1)
        // argument.
        // NOTE(review): that argument presumably offsets the docids by
        // numberOfDocs1 - confirm against writePostings.
        while (hasMore2) {
          //write to inverted file as well.
          BitIndexPointer newPointer = invOS.writePostings(
              inverted2.getPostings(lee2.getValue()), -(numberOfDocs1+1));
          lee2.getValue().setPointer(newPointer);
          numberOfPointers+=newPointer.getNumberOfEntries();
          int newCode = newCodes++;
          // record old 2nd-index term id -> new term id when a mapping is requested
          if (keepTermCodeMap)
            termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
          lee2.getValue().setTermId(newCode);
          lexOutStream.writeNextEntry(lee2.getKey(), lee2.getValue());
          hasMore2 = lexInStream2.hasNext();
          if (hasMore2)
            lee2 = lexInStream2.next();
        }   
      }
      // Both lexicon input streams are exhausted; release them.
      IndexUtil.close(lexInStream1);
      IndexUtil.close(lexInStream2);
     

      // Close the two source inverted indices and the merged output stream.
      // NOTE(review): these closes are not in a finally block, so an exception
      // thrown during the merge above would skip them.
      inverted1.close();
      inverted2.close();
      invOS.close();
     
      destIndex.setIndexProperty("num.Documents", ""+numberOfDocuments);
      destIndex.addIndexStructure(
            "inverted",
            invertedFileInputClass,
View Full Code Here

      // Copy the direct-index field properties from the first source index to the
      // destination; a property missing from the source is looked up with a null
      // default and that value is passed on to setIndexProperty.
      for(String property : new String[] {"index.direct.fields.names","index.direct.fields.count" } )
      {
        destIndex.setIndexProperty(property, srcIndex1.getIndexProperty(property, null));
      }
     
      // Reflectively instantiate the output stream for the merged direct file.
      // The field-aware class is chosen when fieldCount > 0; either class must
      // expose a constructor taking the target filename as a single String.
      DirectInvertedOutputStream dfOutput = null;
      try{
        dfOutput =
          (fieldCount > 0 ? fieldDirectFileOutputStreamClass : directFileOutputStreamClass)
          .getConstructor(String.class)
          .newInstance(destIndex.getPath() + ApplicationSetup.FILE_SEPARATOR + 
                destIndex.getPrefix() + ".direct" + BitIn.USUAL_EXTENSION);
      } catch (Exception e) {
        // Any reflection or I/O failure aborts the merge: log and leave the method.
        logger.error("Couldn't create specified DirectInvertedOutputStream", e);
        return;
      }
     
     
      // Open the first source index's document index, direct index and metadata.
      // The casts rely on the "document" and "direct" structures having these stream types.
      final Iterator<DocumentIndexEntry> docidInput1 = (Iterator<DocumentIndexEntry>)srcIndex1.getIndexStructureInputStream("document");
      final PostingIndexInputStream dfInput1 = (PostingIndexInputStream)srcIndex1.getIndexStructureInputStream("direct");
      final MetaIndex metaInput1 = srcIndex1.getMetaIndex();
     
      // Position of the current document within the first source index; used to
      // fetch its metadata.
      int sourceDocid = 0;
      //traversing the direct index, without any change
      while(docidInput1.hasNext())
      {
        // Documents with length 0 keep emptyPointer; the direct stream is advanced
        // only for documents whose length is > 0.
        BitIndexPointer pointerDF = emptyPointer;
        DocumentIndexEntry die = docidInput1.next();
        if (die.getDocumentLength() > 0)
        {
          pointerDF = dfOutput.writePostings(dfInput1.next());
        }
        // Point the document entry at its postings in the merged direct file, then
        // append the entry and its metadata to the destination structures.
        die.setBitIndexPointer(pointerDF);
        docidOutput.addEntryToBuffer(die);
        metaBuilder.writeDocumentEntry(metaInput1.getAllItems(sourceDocid));
        sourceDocid++;
      }
      // Finished with the first source index's inputs.
      dfInput1.close();
      metaInput1.close();
      IndexUtil.close(docidInput1);
      // Open the second source index's document index, direct index and metadata.
      final Iterator<DocumentIndexEntry> docidInput2 = (Iterator<DocumentIndexEntry>)srcIndex2.getIndexStructureInputStream("document");
      final PostingIndexInputStream dfInput2 = (PostingIndexInputStream)srcIndex2.getIndexStructureInputStream("direct");
      final MetaIndex metaInput2 = srcIndex2.getMetaIndex();
     
      // Restart the per-index document counter for the second source index.
      sourceDocid = 0;
      while (docidInput2.hasNext())
      {
        DocumentIndexEntry die = docidInput2.next();
     
        // Documents with length 0 keep emptyPointer; the direct stream is advanced
        // only for documents whose length is > 0.
        BitIndexPointer pointerDF = emptyPointer;
        if (die.getDocumentLength() > 0)
        {
          final IterablePosting postings = dfInput2.next();
         
          // Rewrite every posting's id (a term id in the direct index) through
          // termcodeHashmap, collecting writable copies so they can be re-sorted.
          // NOTE(review): termcodeHashmap.get(...) is unboxed by setId, so an id with
          // no mapping would raise a NullPointerException - confirm the map is
          // always fully populated before this loop runs.
          List<Posting> postingList = new ArrayList<Posting>();
          while(postings.next() != IterablePosting.EOL)
          {
            final Posting p = postings.asWritablePosting();
            p.setId(termcodeHashmap.get(postings.getId()));
            postingList.add(p);
          }
          // Remapped ids are presumably no longer in ascending order, so sort by
          // id before writing.
          Collections.sort(postingList, new PostingIdComparator());
          pointerDF = dfOutput.writePostings(postingList.iterator());
        }
        // Point the document entry at its postings in the merged direct file, then
        // append the entry and its metadata to the destination structures.
        die.setBitIndexPointer(pointerDF);
        docidOutput.addEntryToBuffer(die);
        metaBuilder.writeDocumentEntry(metaInput2.getAllItems(sourceDocid));
        sourceDocid++;
      }
      // Finished with the second source index's inputs.
      dfInput2.close();
      IndexUtil.close(docidInput2);
      metaInput2.close();
     
      // Flush and close the destination metadata, direct file and document index.
      metaBuilder.close();
      dfOutput.close();
      docidOutput.finishedCollections();
      docidOutput.close();

      destIndex.addIndexStructure(
          "direct",
View Full Code Here

TOP

Related Classes of org.terrier.structures.DirectInvertedOutputStream

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.