Package org.terrier.structures

Examples of org.terrier.structures.PostingIndex


      // Output stream for the merged lexicon, opened against destIndex under the name "lexicon".
      // NOTE(review): lvf is declared outside this excerpt; the generic cast of lvf.getClass() is unchecked.
      LexiconOutputStream<String> lexOutStream =
        new FSOMapFileLexiconOutputStream(destIndex, "lexicon", (Class <FixedSizeWriteableFactory<LexiconEntry>>) lvf.getClass());

      // Next term id to assign to a term that is found only in the second index; numbering
      // starts at the number of unique terms reported by the first index.
      // NOTE(review): the (int) cast would truncate if that count ever exceeded Integer.MAX_VALUE.
      int newCodes = (int)srcIndex1.getCollectionStatistics().getNumberOfUniqueTerms();
     
      // Inverted (posting) structures of the two source indices; postings are read from these below.
      PostingIndex inverted1 = srcIndex1.getInvertedIndex();
      PostingIndex inverted2 = srcIndex2.getInvertedIndex();
     
      // Output stream for the merged inverted file. The concrete class is chosen at runtime
      // (the field-aware class when fieldCount > 0) and instantiated reflectively through its
      // single-String constructor, which receives the target filename:
      //   <destIndex path><file separator><destIndex prefix>.inverted<BitIn.USUAL_EXTENSION>
      DirectInvertedOutputStream invOS =null;
      try{
        invOS = (fieldCount > 0 ? fieldInvertedFileOutputStreamClass : invertedFileOutputStreamClass)
          .getConstructor(String.class)
          .newInstance(destIndex.getPath() + ApplicationSetup.FILE_SEPARATOR + 
            destIndex.getPrefix() + ".inverted"+ BitIn.USUAL_EXTENSION);
       
      } catch (Exception e) {
        // Any reflection or construction failure aborts the merge: log it and leave the method.
        // NOTE(review): lexOutStream has already been opened and is not closed on this path
        // within the visible code — confirm whether an enclosing finally handles it.
        logger.error("Couldn't create specified DirectInvertedOutputStream", e);
        return;
      }


      // Two-way merge of the two lexicon input streams. hasMoreN tells whether leeN currently
      // holds an entry from stream N that has not been written out yet.
      // NOTE(review): this presumes both streams yield terms in ascending String.compareTo order —
      // confirm against the lexicon implementation.
      boolean hasMore1 = false;
      boolean hasMore2 = false;
      String term1;
      String term2;
      Map.Entry<String,LexiconEntry> lee1 = null;
      Map.Entry<String,LexiconEntry> lee2 = null;
      // Prime both cursors with their first entry, if there is one.
      hasMore1 = lexInStream1.hasNext();
      if (hasMore1)
        lee1 = lexInStream1.next();
      hasMore2 = lexInStream2.hasNext();
      if (hasMore2)
        lee2 = lexInStream2.next();
      // Runs while both streams still have a pending entry; the leftovers of whichever
      // stream is longer are handled after this loop.
      while (hasMore1 && hasMore2) {
   
        term1 = lee1.getKey();
        term2 = lee2.getKey();
       
        int lexicographicalCompare = term1.compareTo(term2);
        if (lexicographicalCompare < 0) {
          //write to inverted file postings for the term that only occurs in 1st index
          // The entry keeps its term id; only its pointer is replaced with the position
          // returned by the output stream.
          BitIndexPointer newPointer = invOS.writePostings(inverted1.getPostings(lee1.getValue()));
          lee1.getValue().setPointer(newPointer);
          numberOfPointers+=newPointer.getNumberOfEntries();
          lexOutStream.writeNextEntry(term1, lee1.getValue());
          // advance only the first stream
          hasMore1 = lexInStream1.hasNext();
          if (hasMore1)
            lee1 = lexInStream1.next();
       
        } else if (lexicographicalCompare > 0) {
          //write to inverted file postings for the term that only occurs in 2nd index
          //docids are transformed as we go.
          // NOTE(review): the second argument -(numberOfDocs1+1) presumably serves as the
          // "previous docid" baseline so that the written docids end up offset by
          // numberOfDocs1 — confirm against DirectInvertedOutputStream.writePostings.
          BitIndexPointer newPointer =
            invOS.writePostings(inverted2.getPostings(lee2.getValue()), -(numberOfDocs1+1));
          lee2.getValue().setPointer(newPointer);
          numberOfPointers+=newPointer.getNumberOfEntries();
         
          // The term gets a fresh id from the newCodes counter; when keepTermCodeMap is set,
          // the mapping from its old id (in index 2) to the new id is recorded.
          int newCode = newCodes++;
          if (keepTermCodeMap)
            termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
          lee2.getValue().setTermId(newCode);
          lexOutStream.writeNextEntry(term2, lee2.getValue());
          // advance only the second stream
          hasMore2 = lexInStream2.hasNext();
          if (hasMore2)
            lee2 = lexInStream2.next();
        } else {
          //write to postings for a term that occurs in both indices
         
          //1. postings from the first index are unchanged
          IterablePosting ip1 = inverted1.getPostings(lee1.getValue());
          BitIndexPointer newPointer1 = invOS.writePostings(ip1);
         
          //2. postings from the 2nd index have their docids transformed
          // NOTE(review): ip1.getId() is read after ip1 was handed to writePostings; presumably
          // it then reports the last docid written for index 1, which is used as the baseline
          // for the first posting of index 2 — TODO confirm.
          IterablePosting ip2 = inverted2.getPostings(lee2.getValue());
          BitIndexPointer newPointer2 = invOS.writePostings(ip2, ip1.getId() - numberOfDocs1);
         
          numberOfPointers+= newPointer1.getNumberOfEntries() + newPointer2.getNumberOfEntries();
           
          //don't set numberOfEntries, as LexiconEntry.add() will take care of this.
          // The merged entry points at newPointer1; newPointer2 is used only for the
          // pointer count above.
          lee1.getValue().setPointer(newPointer1);
          // The term keeps its id from index 1; optionally record old-id(index 2) -> id(index 1).
          if (keepTermCodeMap)
            termcodeHashmap.put(lee2.getValue().getTermId(), lee1.getValue().getTermId());
         
          lee1.getValue().add(lee2.getValue());
          lexOutStream.writeNextEntry(term1, lee1.getValue());
         
          // advance both streams
          hasMore1 = lexInStream1.hasNext();
          if (hasMore1)
            lee1 = lexInStream1.next();
         
          hasMore2 = lexInStream2.hasNext();
          if (hasMore2)
            lee2 = lexInStream2.next();
        }
      }
     
      // At most one of the two streams still has a pending entry at this point, because the
      // merge loop above only exits when hasMore1 or hasMore2 is false. Drain whichever remains.
      if (hasMore1) {
        // Only the first index has terms left: copy their postings as-is, keeping term ids.
        lee2 = null;
        while (hasMore1) {
          //write to inverted file as well.
          BitIndexPointer newPointer = invOS.writePostings(
              inverted1.getPostings(lee1.getValue()));
          lee1.getValue().setPointer(newPointer);
          numberOfPointers+=newPointer.getNumberOfEntries();
          lexOutStream.writeNextEntry(lee1.getKey(), lee1.getValue());
          hasMore1 = lexInStream1.hasNext();
          if (hasMore1)
            lee1 = lexInStream1.next();
        }
      } else if (hasMore2) {
        // Only the second index has terms left: same handling as the "only in 2nd index"
        // branch of the merge loop (same writePostings baseline argument, fresh term id
        // from newCodes, optional old-id -> new-id mapping).
        lee1 = null;
        while (hasMore2) {
          //write to inverted file as well.
          BitIndexPointer newPointer = invOS.writePostings(
              inverted2.getPostings(lee2.getValue()), -(numberOfDocs1+1));
          lee2.getValue().setPointer(newPointer);
          numberOfPointers+=newPointer.getNumberOfEntries();
          int newCode = newCodes++;
          if (keepTermCodeMap)
            termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
          lee2.getValue().setTermId(newCode);
          lexOutStream.writeNextEntry(lee2.getKey(), lee2.getValue());
          hasMore2 = lexInStream2.hasNext();
          if (hasMore2)
            lee2 = lexInStream2.next();
        }   
      }
      // Merging is finished: release both lexicon input streams, both source posting
      // structures and the merged inverted-file output stream.
      // NOTE(review): lexOutStream is not closed within the visible excerpt — presumably it is
      // closed further down; confirm.
      IndexUtil.close(lexInStream1);
      IndexUtil.close(lexInStream2);
     

      inverted1.close();
      inverted2.close();
      invOS.close();
     
      // Store the document count on the destination index as a string property.
      // numberOfDocuments is defined outside this excerpt.
      destIndex.setIndexProperty("num.Documents", ""+numberOfDocuments);
      destIndex.addIndexStructure(
            "inverted",
View Full Code Here

TOP

Related Classes of org.terrier.structures.PostingIndex

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.