Package: org.terrier.structures

Examples of org.terrier.structures.BitIndexPointer


    //number of splits per file, for logging only
    final int[] splitsPerFile = new int[fileCount];
   
    Arrays.fill(firstEntryOfNextSplit, Integer.MAX_VALUE);

    BitIndexPointer currentPointer = null;
    //iterate through the lookup iterator
    //split the target bit posting index structure into chunks of size bitPostingStructureFSBlockSize
    while(offsetIterator.hasNext())
    {     
      //ok, where is the next pointer to
      currentPointer = offsetIterator.next();
      final byte fileId = currentPointer.getFileNumber();
     
      //what is the first entry of the next split of this file?
      firstEntryOfNextSplit[fileId] = Math.min(currentId, firstEntryOfNextSplit[fileId]);
      //this split will have one more entry
      entriesInBlock[fileId]++;
     
      //what is our current offset?
      long offset = currentPointer.getOffset();
      //System.err.println("Offset" + offset);
      //if we made the split here, how big would it be?
      blockSize[fileId] = offset - bitPostingStructureSplitEndOffsets[fileId];
      //is this block large enough?
      if (blockSize[fileId] > bitPostingStructureFSBlockSizes[fileId])
View Full Code Here


        dupPointers++;
      }
    }
       
    Collections.sort(postingList, new PostingIdComparator());
    BitIndexPointer pointer = postingOutputStream.writePostings(postingList.iterator());
    pointer.write(pointerOutputStream);
    actualDocid++;
  }
View Full Code Here

      boolean[] blocksfields, final int numberOfReducers, final int numberOfReduceTaskLimits)
      throws IOException, Exception
  {
    Iterator<DocumentIndexEntry> diis = (Iterator<DocumentIndexEntry>)index.getIndexStructureInputStream("document");
    DocumentIndexBuilder dios = new DocumentIndexBuilder(index, "document-df");
    BitIndexPointer pointer = new SimpleBitIndexPointer();
   
    final boolean blocks = blocksfields[0];
    final boolean fields = blocksfields[1];
   
    if (numberOfReducers == 1)
    {
      String outputPrefix = "-0";
      DataInputStream currentStream = new DataInputStream(Files.openFileStream(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers"));
      //logger.info("Adding pointers to the document index");
      while(diis.hasNext())
      {
        DocumentIndexEntry die =  diis.next();
        pointer.readFields(currentStream);
        DocumentIndexEntry newDIentry = fields
          ? new FieldDocumentIndexEntry(die)
          : new BasicDocumentIndexEntry(die);
        newDIentry.setOffset(pointer);
        newDIentry.setNumberOfEntries(pointer.getNumberOfEntries());
        dios.addEntryToBuffer(newDIentry);
      }
      //logger.info("Renaming reducer output as direct file");
      Files.delete(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+ BitIn.USUAL_EXTENSION);
      Files.rename(
          index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+outputPrefix + BitIn.USUAL_EXTENSION,
          index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+ BitIn.USUAL_EXTENSION);
      currentStream.close();
      Files.delete(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers");
    }
    else if (numberOfReducers <= numberOfReduceTaskLimits)
    {
      //logger.info("Merging direct index pointers from "+ numberOfReducers + " reducers");
      final int partitionSize = (int)Math.ceil( (double)(index.getCollectionStatistics().getNumberOfDocuments()) / (double)numberOfReducers);     
      for(byte reduce = 0; reduce < numberOfReducers; reduce++)
      {
        //logger.info("Merging in pointers from reduce task " + reduce);
        String outputPrefix = "-" + reduce;
        DataInputStream currentStream = new DataInputStream(Files.openFileStream(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers"));
        for(int docOffset = 0; docOffset < partitionSize && diis.hasNext(); docOffset++)
        {
          DocumentIndexEntry die =  diis.next();
          pointer.readFields(currentStream);
          DocumentIndexEntry newDIentry = fields
            ? new FieldDocumentIndexEntry(die)
            : new BasicDocumentIndexEntry(die);
          newDIentry.setOffset(pointer);
          newDIentry.setFileNumber(reduce);
          newDIentry.setNumberOfEntries(pointer.getNumberOfEntries());
          dios.addEntryToBuffer(newDIentry);
        }
        currentStream.close();
        Files.delete(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers");
        //logger.info("Renaming direct file part for reduce task " + reduce);
        String sourcePartDFfilename = index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+outputPrefix + BitIn.USUAL_EXTENSION;
        String destPartDFfilename = index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+ BitIn.USUAL_EXTENSION + reduce;       
        Files.rename(sourcePartDFfilename, destPartDFfilename);
      }
      index.setIndexProperty("index."+targetStructureName+".data-files", ""+numberOfReducers);
      index.flush();
      IndexUtil.close(diis);
    }
    else
    {
      //logger.info("Merging direct index output from "+ numberOfReducers + " reducers");
     
      final int partitionSize = (int)Math.ceil( (double)(index.getCollectionStatistics().getNumberOfDocuments()) / (double)numberOfReducers);
      final OutputStream DFout = Files.writeFileStream(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+ BitIn.USUAL_EXTENSION);
      long finalFileOffset = 0;
     
      for(int reduce = 0; reduce < numberOfReducers; reduce++)
      {
        //logger.info("Copying document index part for reduce task " + reduce);
        String outputPrefix = "-" + reduce;
        DataInputStream currentStream = new DataInputStream(Files.openFileStream(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers"));
        for(int docOffset = 0; docOffset < partitionSize && diis.hasNext(); docOffset++)
        {
          DocumentIndexEntry die =  diis.next();
          pointer.readFields(currentStream);
          DocumentIndexEntry newDIentry = fields
            ? new FieldDocumentIndexEntry(die)
            : new BasicDocumentIndexEntry(die);
          newDIentry.setOffset(finalFileOffset + pointer.getOffset(), pointer.getOffsetBits());
          newDIentry.setNumberOfEntries(pointer.getNumberOfEntries());
          dios.addEntryToBuffer(newDIentry);
        }
        currentStream.close();
        Files.delete(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers");
        //logger.info("Copying direct file part for reduce task " + reduce);
View Full Code Here

      LexiconOutputStream<String> los = getLexOutputStream("tmplexicon");
     
      //the temporary data containing the offsets
      DataInputStream dis = new DataInputStream(Files.openFileStream(LexiconFilename.concat(".tmp2")));

      BitIndexPointer pin = new SimpleBitIndexPointer();
      while(lexiconStream.hasNext())
      {
        Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
        LexiconEntry value = lee.getValue();
        pin.readFields(dis);
        value.setPointer(pin);
        los.writeNextEntry(lee.getKey(), value);
      }
      los.close();
      dis.close();
View Full Code Here

    // file should be updated as well with the term frequency and
    // the startOffset pointer
   
    int frequency;
    long numTokens = 0;
    BitIndexPointer p = new SimpleBitIndexPointer();
   
    for (int j = 0; j < processTerms; j++) {
      frequency = 0; // the term frequency
     
      final int[][] tmpMatrix = new int[4+fieldCount][];
      for(int k=0;k<4+fieldCount;k++)
      {
        tmpMatrix[k] = tmpStorage[j][k].toNativeArray();
      }
      tmpStorage[j] = null;
      final int[] tmpMatrix_docids = tmpMatrix[0];
      final int[] tmpMatrix_freqs = tmpMatrix[1];
      final int[] tmpMatrix_blockFreq = tmpMatrix[2+fieldCount];
      final int[] tmpMatrix_blockIds = tmpMatrix[3+fieldCount];
     
      p.setOffset(file.getByteOffset(), file.getBitOffset());
      p.setNumberOfEntries(tmpMatrix_docids.length);
      p.write(dos);

      // write the first entry
      int docid = tmpMatrix_docids[0];
      file.writeGamma(docid + 1);
      int termfreq = tmpMatrix_freqs[0];
View Full Code Here

      //the updated lexicon
      LexiconOutputStream<String> los = getLexOutputStream("tmplexicon");
     
      //the temporary data containing the offsets
      DataInputStream dis = new DataInputStream(Files.openFileStream(LexiconFilename.concat(".tmp2")));
      BitIndexPointer pin = new SimpleBitIndexPointer();
      while(lexiconStream.hasNext())
      {
        Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
        LexiconEntry value = lee.getValue();
        pin.readFields(dis);
        value.setPointer(pin);
        los.writeNextEntry(lee.getKey(), value);
      }
      IndexUtil.close(lexiconStream);
      los.close();
View Full Code Here

    final DataOutputStream dos,
    TIntArrayList[][] tmpStorage,
    final int _processTerms)
    throws IOException
  {
    BitIndexPointer p = new SimpleBitIndexPointer();
    //write to the inverted file. We should note that the lexicon
    //should be updated with the start bit and byte offset for this
    //set of postings.
    int frequency; long numTokens = 0;
    for (int j = 0; j < _processTerms; j++) {

     
      frequency = 0; //the term frequency
     
      final int[][] tmpMatrix = new int[2+fieldCount][];
      for(int k=0;k<2+fieldCount;k++)
      {
        tmpMatrix[k] = tmpStorage[j][k].toNativeArray();
      }
      tmpStorage[j] = null;
     
      final int[] tmpMatrix0 = tmpMatrix[0];
      final int[] tmpMatrix1 = tmpMatrix[1];
     
      p.setOffset(file.getByteOffset(), file.getBitOffset());
      p.setNumberOfEntries(tmpMatrix0.length);
      p.write(dos);

      //THIS IS ALWAYS AN ERROR
      /*
      if (tmpMatrix[0].length == 0)
      {
View Full Code Here

  protected void indexDocument(Map<String,String> docProperties, DocumentPostingList _termsInDocument) throws Exception
  {
    /* add words to lexicontree */
    lexiconBuilder.addDocumentTerms(_termsInDocument);
    /* add doc postings to the direct index */
    BitIndexPointer dirIndexPost = directIndexBuilder.writePostings(_termsInDocument.getPostings2());
      //.addDocument(termsInDocument.getPostings());
    /* add doc to documentindex */
    DocumentIndexEntry die = _termsInDocument.getDocumentStatistics();
    die.setBitIndexPointer(dirIndexPost);
    docIndexBuilder.addEntryToBuffer(die);
View Full Code Here

  protected void indexDocument(Map<String,String> docProperties, DocumentPostingList _termsInDocument) throws Exception
  {
    /* add words to lexicontree */
    lexiconBuilder.addDocumentTerms(_termsInDocument);
    /* add doc postings to the direct index */
    BitIndexPointer dirIndexPost = directIndexBuilder.writePostings(_termsInDocument.getPostings2());
    /* add doc to documentindex */
    DocumentIndexEntry die = _termsInDocument.getDocumentStatistics();
    die.setBitIndexPointer(dirIndexPost);
    docIndexBuilder.addEntryToBuffer(die);
    /* add doc metadata to index */
 
View Full Code Here

        term2 = lee2.getKey();
       
        int lexicographicalCompare = term1.compareTo(term2);
        if (lexicographicalCompare < 0) {
          //write to inverted file postings for the term that only occurs in 1st index
          BitIndexPointer newPointer = invOS.writePostings(inverted1.getPostings(lee1.getValue()));
          lee1.getValue().setPointer(newPointer);
          numberOfPointers+=newPointer.getNumberOfEntries();
          lexOutStream.writeNextEntry(term1, lee1.getValue());
          hasMore1 = lexInStream1.hasNext();
          if (hasMore1)
            lee1 = lexInStream1.next();
       
        } else if (lexicographicalCompare > 0) {
          //write to inverted file postings for the term that only occurs in 2nd index
          //docids are transformed as we go.
          BitIndexPointer newPointer =
            invOS.writePostings(inverted2.getPostings(lee2.getValue()), -(numberOfDocs1+1));
          lee2.getValue().setPointer(newPointer);
          numberOfPointers+=newPointer.getNumberOfEntries();
         
          int newCode = newCodes++;
          if (keepTermCodeMap)
            termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
          lee2.getValue().setTermId(newCode);
          lexOutStream.writeNextEntry(term2, lee2.getValue());
          hasMore2 = lexInStream2.hasNext();
          if (hasMore2)
            lee2 = lexInStream2.next();
        } else {
          //write to postings for a term that occurs in both indices
         
          //1. postings from the first index are unchanged
          IterablePosting ip1 = inverted1.getPostings(lee1.getValue());
          BitIndexPointer newPointer1 = invOS.writePostings(ip1);
         
          //2. postings from the 2nd index have their docids transformed
          IterablePosting ip2 = inverted2.getPostings(lee2.getValue());
          BitIndexPointer newPointer2 = invOS.writePostings(ip2, ip1.getId() - numberOfDocs1);
         
          numberOfPointers+= newPointer1.getNumberOfEntries() + newPointer2.getNumberOfEntries();
           
          //don't set numberOfEntries, as LexiconEntry.add() will take care of this.
          lee1.getValue().setPointer(newPointer1);
          if (keepTermCodeMap)
            termcodeHashmap.put(lee2.getValue().getTermId(), lee1.getValue().getTermId());
         
          lee1.getValue().add(lee2.getValue());
          lexOutStream.writeNextEntry(term1, lee1.getValue());
         
          hasMore1 = lexInStream1.hasNext();
          if (hasMore1)
            lee1 = lexInStream1.next();
         
          hasMore2 = lexInStream2.hasNext();
          if (hasMore2)
            lee2 = lexInStream2.next();
        }
      }
     
      if (hasMore1) {
        lee2 = null;
        while (hasMore1) {
          //write to inverted file as well.
          BitIndexPointer newPointer = invOS.writePostings(
              inverted1.getPostings(lee1.getValue()));
          lee1.getValue().setPointer(newPointer);
          numberOfPointers+=newPointer.getNumberOfEntries();
          lexOutStream.writeNextEntry(lee1.getKey(), lee1.getValue());
          hasMore1 = lexInStream1.hasNext();
          if (hasMore1)
            lee1 = lexInStream1.next();
        }
      } else if (hasMore2) {
        lee1 = null;
        while (hasMore2) {
          //write to inverted file as well.
          BitIndexPointer newPointer = invOS.writePostings(
              inverted2.getPostings(lee2.getValue()), -(numberOfDocs1+1));
          lee2.getValue().setPointer(newPointer);
          numberOfPointers+=newPointer.getNumberOfEntries();
          int newCode = newCodes++;
          if (keepTermCodeMap)
            termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
          lee2.getValue().setTermId(newCode);
          lexOutStream.writeNextEntry(lee2.getKey(), lee2.getValue());
View Full Code Here

TOP

Related Classes of org.terrier.structures.BitIndexPointer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.