Package org.terrier.structures

Examples of org.terrier.structures.SimpleBitIndexPointer
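SimpleBitIndexPointer records where a postings list begins inside a bit-compressed index file (a byte offset plus a bit offset, and optionally a file number) and how many entries the list holds. Every excerpt below follows the same pattern: set the offset and entry count, persist the pointer with write(DataOutput), and later recover it with readFields(DataInput). The excerpts are truncated fragments of larger Terrier methods; as a self-contained orientation, here is a minimal round-trip sketch (the offset, bit offset and entry count are made-up values):

    import java.io.*;
    import org.terrier.structures.BitIndexPointer;
    import org.terrier.structures.SimpleBitIndexPointer;

    public class PointerRoundTrip {
      public static void main(String[] args) throws IOException {
        BitIndexPointer out = new SimpleBitIndexPointer();
        out.setOffset(1024L, (byte) 3);  // made-up (byte, bit) position in a bit file
        out.setNumberOfEntries(42);      // made-up posting count

        // serialise exactly as the excerpts do: write(DataOutput)
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        out.write(new DataOutputStream(buffer));

        // ...and recover the pointer with readFields(DataInput)
        BitIndexPointer in = new SimpleBitIndexPointer();
        in.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
        System.out.println(in.getOffset() + ":" + in.getOffsetBits()
            + " entries=" + in.getNumberOfEntries());
      }
    }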


      boolean[] blocksfields, final int numberOfReducers, final int numberOfReduceTaskLimits)
      throws IOException, Exception
  {
    Iterator<DocumentIndexEntry> diis = (Iterator<DocumentIndexEntry>)index.getIndexStructureInputStream("document");
    DocumentIndexBuilder dios = new DocumentIndexBuilder(index, "document-df");
    BitIndexPointer pointer = new SimpleBitIndexPointer();
   
    final boolean blocks = blocksfields[0];
    final boolean fields = blocksfields[1];
   
    if (numberOfReducers == 1)
    {
      String outputPrefix = "-0";
      DataInputStream currentStream = new DataInputStream(Files.openFileStream(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers"));
      //logger.info("Adding pointers to the document index");
      while(diis.hasNext())
      {
        DocumentIndexEntry die =  diis.next();
        pointer.readFields(currentStream);
        DocumentIndexEntry newDIentry = fields
          ? new FieldDocumentIndexEntry(die)
          : new BasicDocumentIndexEntry(die);
        newDIentry.setOffset(pointer);
        newDIentry.setNumberOfEntries(pointer.getNumberOfEntries());
        dios.addEntryToBuffer(newDIentry);
      }
      //logger.info("Renaming reducer output as direct file");
      Files.delete(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+ BitIn.USUAL_EXTENSION);
      Files.rename(
          index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+outputPrefix + BitIn.USUAL_EXTENSION,
          index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+ BitIn.USUAL_EXTENSION);
      currentStream.close();
      Files.delete(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers");
    }
    else if (numberOfReducers <= numberOfReduceTaskLimits)
    {
      //logger.info("Merging direct index pointers from "+ numberOfReducers + " reducers");
      final int partitionSize = (int)Math.ceil( (double)(index.getCollectionStatistics().getNumberOfDocuments()) / (double)numberOfReducers);     
      for(byte reduce = 0; reduce < numberOfReducers; reduce++)
      {
        //logger.info("Merging in pointers from reduce task " + reduce);
        String outputPrefix = "-" + reduce;
        DataInputStream currentStream = new DataInputStream(Files.openFileStream(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers"));
        for(int docOffset = 0; docOffset < partitionSize && diis.hasNext(); docOffset++)
        {
          DocumentIndexEntry die =  diis.next();
          pointer.readFields(currentStream);
          DocumentIndexEntry newDIentry = fields
            ? new FieldDocumentIndexEntry(die)
            : new BasicDocumentIndexEntry(die);
          newDIentry.setOffset(pointer);
          newDIentry.setFileNumber(reduce);
          newDIentry.setNumberOfEntries(pointer.getNumberOfEntries());
          dios.addEntryToBuffer(newDIentry);
        }
        currentStream.close();
        Files.delete(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers");
        //logger.info("Renaming direct file part for reduce task " + reduce);
        String sourcePartDFfilename = index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+outputPrefix + BitIn.USUAL_EXTENSION;
        String destPartDFfilename = index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+ BitIn.USUAL_EXTENSION + reduce;       
        Files.rename(sourcePartDFfilename, destPartDFfilename);
      }
      index.setIndexProperty("index."+targetStructureName+".data-files", ""+numberOfReducers);
      index.flush();
      IndexUtil.close(diis);
    }
    else
    {
      //logger.info("Merging direct index output from "+ numberOfReducers + " reducers");
     
      final int partitionSize = (int)Math.ceil( (double)(index.getCollectionStatistics().getNumberOfDocuments()) / (double)numberOfReducers);
      final OutputStream DFout = Files.writeFileStream(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+ BitIn.USUAL_EXTENSION);
      long finalFileOffset = 0;
     
      for(int reduce = 0; reduce < numberOfReducers; reduce++)
      {
        //logger.info("Copying document index part for reduce task " + reduce);
        String outputPrefix = "-" + reduce;
        DataInputStream currentStream = new DataInputStream(Files.openFileStream(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers"));
        for(int docOffset = 0; docOffset < partitionSize && diis.hasNext(); docOffset++)
        {
          DocumentIndexEntry die =  diis.next();
          pointer.readFields(currentStream);
          DocumentIndexEntry newDIentry = fields
            ? new FieldDocumentIndexEntry(die)
            : new BasicDocumentIndexEntry(die);
          newDIentry.setOffset(finalFileOffset + pointer.getOffset(), pointer.getOffsetBits());
          newDIentry.setNumberOfEntries(pointer.getNumberOfEntries());
          dios.addEntryToBuffer(newDIentry);
        }
        currentStream.close();
        Files.delete(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers");
        //logger.info("Copying direct file part for reduce task " + reduce);
View Full Code Here
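The three branches above differ only in how the per-reducer outputs are stitched together: with a single reducer the part file is simply renamed; with at most numberOfReduceTaskLimits reducers each part remains a separate data file and the pointer is tagged with its file number; with more reducers the parts are concatenated into one file and each pointer's offset is rebased by the length of what has already been copied. The two pointer adjustments in isolation (die, pointer, reduce and finalFileOffset as in the excerpt):

      // one data file per reducer: keep the offset, record which file to open
      DocumentIndexEntry perFile = new BasicDocumentIndexEntry(die);
      perFile.setOffset(pointer);
      perFile.setFileNumber(reduce);

      // concatenated data files: shift the byte offset by the bytes already copied
      DocumentIndexEntry rebased = new BasicDocumentIndexEntry(die);
      rebased.setOffset(finalFileOffset + pointer.getOffset(), pointer.getOffsetBits());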


    }
    while(actualDocid < targetDocid)
    { 
      //if (logger.isDebugEnabled())
      //  logger.debug("moving forward: target="+targetDocid + " actual="+actualDocid );
      SimpleBitIndexPointer p = new SimpleBitIndexPointer();
      p.setOffset(postingOutputStream.getOffset());
      p.setNumberOfEntries(0);
      p.write(pointerOutputStream);
      //System.err.println("actualDocid="+ actualDocid + " writing empty pointer");
      actualDocid++;
      reporter.progress();
    }
   
    /* this implementation loads all postings for a given document into memory, then sorts them by
     * term id. This is acceptable, as documents are assumed to have sufficiently small postings that
     * they can fit in memory */
   
    List<Posting> postingList = new ArrayList<Posting>();
    int doclen = 0;
    TIntHashSet foundIds = new TIntHashSet();
    while(documentPostings.hasNext())
    {
      final Posting p = documentPostings.next().asWritablePosting();
      //check for duplicate pointers
      if (! foundIds.contains(p.getId()) )
      {
        postingList.add(p);
        doclen += p.getFrequency();
        reporter.progress();
        foundIds.add(p.getId());
      }
      else
      {
        dupPointers++;
      }
View Full Code Here
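The loop above deduplicates the postings of one document via a Trove TIntHashSet before they are written out. The excerpt is cut off before the sort by term id that the opening comment promises; that step could look like this (a hedged sketch, not necessarily Terrier's own comparator):

      // sort the collected postings by term id before writing them out
      Collections.sort(postingList, new Comparator<Posting>() {
        public int compare(Posting a, Posting b) {
          return a.getId() - b.getId(); // term ids are small non-negative ints
        }
      });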

    if (dupPointers > 0)
    {
      //logger.warn("Received a total of " + dupPointers + " duplicate postings");
    }
    //add trailing entries to the pointers file
    while(actualDocid <= lastDocidInPartion)
    {
      SimpleBitIndexPointer p = new SimpleBitIndexPointer();
      p.setOffset(postingOutputStream.getOffset());
      p.setNumberOfEntries(0);
      p.write(pointerOutputStream);
      actualDocid++;
    }
    postingOutputStream.close();
    pointerOutputStream.close();
  }
View Full Code Here
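Both while loops in this excerpt keep the pointers file aligned with docids: a document with no postings still receives a pointer, one that points at the current end of the postings stream and declares zero entries, so the i-th pointer always belongs to docid i. The padding pattern in isolation (pointerOutputStream and postingOutputStream as in the excerpt; lastDocid is a stand-in for whichever docid bound applies):

      // emit an empty pointer for every docid that has no postings
      while (actualDocid <= lastDocid) {
        SimpleBitIndexPointer empty = new SimpleBitIndexPointer();
        empty.setOffset(postingOutputStream.getOffset()); // current end of the postings stream
        empty.setNumberOfEntries(0);                      // no postings for this document
        empty.write(pointerOutputStream);
        actualDocid++;
      }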

      LexiconOutputStream<String> los = getLexOutputStream("tmplexicon");
     
      //the temporary data containing the offsets
      DataInputStream dis = new DataInputStream(Files.openFileStream(LexiconFilename.concat(".tmp2")));

      BitIndexPointer pin = new SimpleBitIndexPointer();
      while(lexiconStream.hasNext())
      {
        Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
        LexiconEntry value = lee.getValue();
        pin.readFields(dis);
        value.setPointer(pin);
        los.writeNextEntry(lee.getKey(), value);
      }
      los.close();
      dis.close();
View Full Code Here

    // file should be updated as well with the term frequency and
    // the startOffset pointer
   
    int frequency;
    long numTokens = 0;
    BitIndexPointer p = new SimpleBitIndexPointer();
   
    for (int j = 0; j < processTerms; j++) {
      frequency = 0; // the term frequency
     
      final int[][] tmpMatrix = new int[4+fieldCount][];
      for(int k=0;k<4+fieldCount;k++)
      {
        tmpMatrix[k] = tmpStorage[j][k].toNativeArray();
      }
      tmpStorage[j] = null;
      final int[] tmpMatrix_docids = tmpMatrix[0];
      final int[] tmpMatrix_freqs = tmpMatrix[1];
      final int[] tmpMatrix_blockFreq = tmpMatrix[2+fieldCount];
      final int[] tmpMatrix_blockIds = tmpMatrix[3+fieldCount];
     
      p.setOffset(file.getByteOffset(), file.getBitOffset());
      p.setNumberOfEntries(tmpMatrix_docids.length);
      p.write(dos);

      // write the first entry
      int docid = tmpMatrix_docids[0];
      file.writeGamma(docid + 1);
      int termfreq = tmpMatrix_freqs[0];
View Full Code Here
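Note the ordering in this excerpt: the pointer is written before the postings it describes. The current (byte, bit) position of the bit file becomes the pointer's offset, the number of docids becomes its entry count, the pointer goes to the offsets stream dos, and only then are the gamma-coded postings emitted (the excerpt stops after the first docid; the +1 is needed because gamma coding cannot represent zero). Reduced to its skeleton:

      BitIndexPointer p = new SimpleBitIndexPointer();
      p.setOffset(file.getByteOffset(), file.getBitOffset()); // where the postings will start
      p.setNumberOfEntries(tmpMatrix_docids.length);          // how many postings follow
      p.write(dos);                                           // pointer goes to the offsets file
      file.writeGamma(tmpMatrix_docids[0] + 1);               // first posting, gamma-coded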

      //the updated lexicon
      LexiconOutputStream<String> los = getLexOutputStream("tmplexicon");
     
      //the temporary data containing the offsets
      DataInputStream dis = new DataInputStream(Files.openFileStream(LexiconFilename.concat(".tmp2")));
      BitIndexPointer pin = new SimpleBitIndexPointer();
      while(lexiconStream.hasNext())
      {
        Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
        LexiconEntry value = lee.getValue();
        pin.readFields(dis);
        value.setPointer(pin);
        los.writeNextEntry(lee.getKey(), value);
      }
      IndexUtil.close(lexiconStream);
      los.close();
View Full Code Here

    final DataOutputStream dos,
    TIntArrayList[][] tmpStorage,
    final int _processTerms)
    throws IOException
  {
    BitIndexPointer p = new SimpleBitIndexPointer();
    //write to the inverted file. We should note that the lexicon
    //should be updated with the start bit and byte offset for this
    //set of postings.
    int frequency; long numTokens = 0;
    for (int j = 0; j < _processTerms; j++) {

     
      frequency = 0; //the term frequency
     
      final int[][] tmpMatrix = new int[2+fieldCount][];
      for(int k=0;k<2+fieldCount;k++)
      {
        tmpMatrix[k] = tmpStorage[j][k].toNativeArray();
      }
      tmpStorage[j] = null;
     
      final int[] tmpMatrix0 = tmpMatrix[0];
      final int[] tmpMatrix1 = tmpMatrix[1];
     
      p.setOffset(file.getByteOffset(), file.getBitOffset());
      p.setNumberOfEntries(tmpMatrix0.length);
      p.write(dos);

      //THIS IS ALWAYS AN ERROR
      /*
      if (tmpMatrix[0].length == 0)
      {
View Full Code Here

   
      if (! srcIndex1.getIndexProperty("index.meta.key-names", "docno").equals(srcIndex2.getIndexProperty("index.meta.key-names", "docno")))
      {
        throw new Error("Meta fields in source indices must match");
      }
      final BitIndexPointer emptyPointer = new SimpleBitIndexPointer();
     
       
      final int srcFieldCount1 = srcIndex1.getIntIndexProperty("index.direct.fields.count", 0);
      final int srcFieldCount2 = srcIndex2.getIntIndexProperty("index.direct.fields.count", 0);
      if (srcFieldCount1 != srcFieldCount2)
View Full Code Here

      String term1;
      String term2;

      int termId = 0;
     
      Pointer p = new SimpleBitIndexPointer();
   
      hasMore1 = lexInStream1.hasNext();
      hasMore2 = lexInStream2.hasNext();
      Map.Entry<String,LexiconEntry> lee1 = lexInStream1.next();
      Map.Entry<String,LexiconEntry> lee2 = lexInStream2.next();
View Full Code Here
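This last excerpt is the prologue of a two-way merge of sorted lexicon streams: both streams are advanced to their first entry before the merge loop compares terms. The loop itself is truncated; its usual shape is sketched below (a generic merge skeleton under the assumption that both streams are sorted by term, not Terrier's actual merge code):

      while (hasMore1 && hasMore2) {
        final int cmp = lee1.getKey().compareTo(lee2.getKey());
        if (cmp < 0) {
          // term only in the first lexicon: write lee1, then advance stream 1
          hasMore1 = lexInStream1.hasNext();
          if (hasMore1) lee1 = lexInStream1.next();
        } else if (cmp > 0) {
          // term only in the second lexicon: write lee2, then advance stream 2
          hasMore2 = lexInStream2.hasNext();
          if (hasMore2) lee2 = lexInStream2.next();
        } else {
          // same term in both: combine the two LexiconEntry objects, advance both
          hasMore1 = lexInStream1.hasNext();
          if (hasMore1) lee1 = lexInStream1.next();
          hasMore2 = lexInStream2.hasNext();
          if (hasMore2) lee2 = lexInStream2.next();
        }
      }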
