Package org.terrier.structures

Examples of org.terrier.structures.SimpleBitIndexPointer$Factory


      boolean[] blocksfields, final int numberOfReducers, final int numberOfReduceTaskLimits)
      throws IOException, Exception
  {
    // NOTE(review): fragment — the method name and leading parameters are cut off
    // above this excerpt. Visible behavior: re-reads the existing "document" index
    // structure and writes a new "document-df" structure in which each entry also
    // carries the bit-file pointer emitted by the MapReduce reduce task(s).
    Iterator<DocumentIndexEntry> diis = (Iterator<DocumentIndexEntry>)index.getIndexStructureInputStream("document");
    DocumentIndexBuilder dios = new DocumentIndexBuilder(index, "document-df");
    // Reusable pointer holder; refilled from the ".pointers" side file per document.
    BitIndexPointer pointer = new SimpleBitIndexPointer();
   
    final boolean blocks = blocksfields[0];
    final boolean fields = blocksfields[1];
    // NOTE(review): `blocks` is never read in the visible portion of this method —
    // presumably used in the truncated tail; confirm against the full source.
   
    if (numberOfReducers == 1)
    {
      // Single reducer: one ".pointers" file and one partial bit file ("-0"),
      // which is simply renamed into place as the final structure file.
      String outputPrefix = "-0";
      DataInputStream currentStream = new DataInputStream(Files.openFileStream(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers"));
      //logger.info("Adding pointers to the document index");
      while(diis.hasNext())
      {
        DocumentIndexEntry die =  diis.next();
        // Pointer stream is assumed to be aligned 1:1 with the document stream.
        pointer.readFields(currentStream);
        DocumentIndexEntry newDIentry = fields
          ? new FieldDocumentIndexEntry(die)
          : new BasicDocumentIndexEntry(die);
        newDIentry.setOffset(pointer);
        newDIentry.setNumberOfEntries(pointer.getNumberOfEntries());
        dios.addEntryToBuffer(newDIentry);
      }
      //logger.info("Renaming reducer output as direct file");
      Files.delete(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+ BitIn.USUAL_EXTENSION);
      Files.rename(
          index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+outputPrefix + BitIn.USUAL_EXTENSION,
          index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+ BitIn.USUAL_EXTENSION);
      // NOTE(review): the pointer stream is closed only after the delete/rename
      // above, and is not closed at all if an earlier IOException is thrown —
      // a try-with-resources (or try/finally) would be safer here.
      currentStream.close();
      Files.delete(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers");
    }
    else if (numberOfReducers <= numberOfReduceTaskLimits)
    {
      //logger.info("Merging direct index pointers from "+ numberOfReducers + " reducers");
      // Few enough reducers: keep one bit file per reducer, renamed with a numeric
      // suffix, and record which file each document's postings live in.
      final int partitionSize = (int)Math.ceil( (double)(index.getCollectionStatistics().getNumberOfDocuments()) / (double)numberOfReducers);     
      for(byte reduce = 0; reduce < numberOfReducers; reduce++)
      {
        //logger.info("Merging in pointers from reduce task " + reduce);
        String outputPrefix = "-" + reduce;
        DataInputStream currentStream = new DataInputStream(Files.openFileStream(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers"));
        // Each reducer handled a contiguous docid partition of (at most) partitionSize docs.
        for(int docOffset = 0; docOffset < partitionSize && diis.hasNext(); docOffset++)
        {
          DocumentIndexEntry die =  diis.next();
          pointer.readFields(currentStream);
          DocumentIndexEntry newDIentry = fields
            ? new FieldDocumentIndexEntry(die)
            : new BasicDocumentIndexEntry(die);
          newDIentry.setOffset(pointer);
          // File number selects which per-reducer bit file this pointer refers to.
          newDIentry.setFileNumber(reduce);
          newDIentry.setNumberOfEntries(pointer.getNumberOfEntries());
          dios.addEntryToBuffer(newDIentry);
        }
        currentStream.close();
        Files.delete(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers");
        //logger.info("Renaming direct file part for reduce task " + reduce);
        String sourcePartDFfilename = index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+outputPrefix + BitIn.USUAL_EXTENSION;
        String destPartDFfilename = index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+ BitIn.USUAL_EXTENSION + reduce;       
        Files.rename(sourcePartDFfilename, destPartDFfilename);
      }
      // Record the number of data files so readers know to open the suffixed parts.
      index.setIndexProperty("index."+targetStructureName+".data-files", ""+numberOfReducers);
      index.flush();
      IndexUtil.close(diis);
    }
    else
    {
      //logger.info("Merging direct index output from "+ numberOfReducers + " reducers");
      // Too many reducers for separate files: concatenate all partial bit files
      // into a single structure file, shifting each pointer by the byte offset at
      // which its source part was appended (finalFileOffset).
      final int partitionSize = (int)Math.ceil( (double)(index.getCollectionStatistics().getNumberOfDocuments()) / (double)numberOfReducers);
      final OutputStream DFout = Files.writeFileStream(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName+ BitIn.USUAL_EXTENSION);
      long finalFileOffset = 0;
     
      for(int reduce = 0; reduce < numberOfReducers; reduce++)
      {
        //logger.info("Copying document index part for reduce task " + reduce);
        String outputPrefix = "-" + reduce;
        DataInputStream currentStream = new DataInputStream(Files.openFileStream(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers"));
        for(int docOffset = 0; docOffset < partitionSize && diis.hasNext(); docOffset++)
        {
          DocumentIndexEntry die =  diis.next();
          pointer.readFields(currentStream);
          DocumentIndexEntry newDIentry = fields
            ? new FieldDocumentIndexEntry(die)
            : new BasicDocumentIndexEntry(die);
          // Rebase the offset into the concatenated file.
          newDIentry.setOffset(finalFileOffset + pointer.getOffset(), pointer.getOffsetBits());
          newDIentry.setNumberOfEntries(pointer.getNumberOfEntries());
          dios.addEntryToBuffer(newDIentry);
        }
        currentStream.close();
        Files.delete(index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + targetStructureName +outputPrefix+ ".pointers");
        //logger.info("Copying direct file part for reduce task " + reduce);
View Full Code Here


    }
    while(actualDocid < targetDocid)
    { 
      //if (logger.isDebugEnabled())
      //  logger.debug("moving forward: target="+targetDocid + " actual="+actualDocid );
      // Documents with no postings still need a pointer record so that the
      // pointers file stays aligned with docids: write an empty (0-entry)
      // pointer at the current bit-file offset for each skipped docid.
      SimpleBitIndexPointer p = new SimpleBitIndexPointer();
      p.setOffset(postingOutputStream.getOffset());
      p.setNumberOfEntries(0);
      p.write(pointerOutputStream);
      //System.err.println("actualDocid="+ actualDocid + " writing empty pointer");
      actualDocid++;
      // Hadoop liveness signal — prevents the task being killed as hung.
      reporter.progress();
    }
   
    /* this implementation loads all postings for a given document into memory, then sorts them by
     * term id. This is acceptable, as documents are assumed to have sufficiently small postings that
     * they can fit in memory */
   
    List<Posting> postingList = new ArrayList<Posting>();
    int doclen = 0;
    // Term ids already seen for this document, used to drop duplicate pointers.
    TIntHashSet foundIds = new TIntHashSet();
    while(documentPostings.hasNext())
    {
      final Posting p = documentPostings.next().asWritablePosting();
      //check for duplicate pointers
      if (! foundIds.contains(p.getId()) )
      {
        postingList.add(p);
        // doclen accumulates the document length as the sum of term frequencies.
        doclen += p.getFrequency();
        reporter.progress();
        foundIds.add(p.getId());
      }
      else
      {
        // Duplicate term id for this document — count it; a warning is emitted
        // later (see the dupPointers check below this excerpt).
        dupPointers++;
      }
View Full Code Here

    if (dupPointers > 0)
      //logger.warn("Received a total of " + dupPointers + " duplicate postings");
    //add trailing entries to the pointers file
    while(actualDocid <= lastDocidInPartion)
    {
      SimpleBitIndexPointer p = new SimpleBitIndexPointer();
      p.setOffset(postingOutputStream.getOffset());
      p.setNumberOfEntries(0);
      p.write(pointerOutputStream);
      actualDocid++;
    }
    postingOutputStream.close();
    pointerOutputStream.close();
  }
View Full Code Here

      // Rewrite the lexicon, attaching to each entry the bit-index pointer
      // (offset + number of entries) read from the temporary ".tmp2" side file.
      // Assumes the ".tmp2" stream is aligned 1:1 with the lexicon entries.
      LexiconOutputStream<String> los = getLexOutputStream("tmplexicon");
     
      //the temporary data containing the offsets
      DataInputStream dis = new DataInputStream(Files.openFileStream(LexiconFilename.concat(".tmp2")));

      // Reusable pointer holder, refilled per lexicon entry.
      BitIndexPointer pin = new SimpleBitIndexPointer();
      while(lexiconStream.hasNext())
      {
        Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
        LexiconEntry value = lee.getValue();
        pin.readFields(dis);
        value.setPointer(pin);
        los.writeNextEntry(lee.getKey(), value);
      }
      // NOTE(review): streams are not closed on an exception path — a
      // try-finally (or try-with-resources) would be more robust here.
      los.close();
      dis.close();
View Full Code Here

    // file should be updated as well with the term frequency and
    // the startOffset pointer
   
    int frequency;
    long numTokens = 0;
    // Reusable pointer: records where each term's posting list starts in the
    // bit file, and how many postings it has; serialized to `dos`.
    BitIndexPointer p = new SimpleBitIndexPointer();
   
    for (int j = 0; j < processTerms; j++) {
      frequency = 0; // the term frequency
     
      // tmpStorage[j] holds, per term: docids, freqs, then fieldCount field-freq
      // arrays, then block frequencies and block ids (hence 4+fieldCount rows).
      final int[][] tmpMatrix = new int[4+fieldCount][];
      for(int k=0;k<4+fieldCount;k++)
      {
        tmpMatrix[k] = tmpStorage[j][k].toNativeArray();
      }
      // Release the term's buffered postings as soon as they are copied out.
      tmpStorage[j] = null;
      final int[] tmpMatrix_docids = tmpMatrix[0];
      final int[] tmpMatrix_freqs = tmpMatrix[1];
      final int[] tmpMatrix_blockFreq = tmpMatrix[2+fieldCount];
      final int[] tmpMatrix_blockIds = tmpMatrix[3+fieldCount];
     
      // Record where this term's postings begin, then persist the pointer.
      p.setOffset(file.getByteOffset(), file.getBitOffset());
      p.setNumberOfEntries(tmpMatrix_docids.length);
      p.write(dos);

      // write the first entry
      // Docids are gamma-encoded; +1 because gamma codes cannot represent 0.
      // NOTE(review): assumes tmpMatrix_docids is non-empty — an empty posting
      // list would throw ArrayIndexOutOfBoundsException here; confirm upstream
      // guarantees at least one posting per term.
      int docid = tmpMatrix_docids[0];
      file.writeGamma(docid + 1);
      int termfreq = tmpMatrix_freqs[0];
View Full Code Here

      //the updated lexicon: each entry is re-written with the bit-index pointer
      //read from the temporary ".tmp2" offsets file (assumed aligned 1:1 with
      //the lexicon entry stream)
      LexiconOutputStream<String> los = getLexOutputStream("tmplexicon");
     
      //the temporary data containing the offsets
      DataInputStream dis = new DataInputStream(Files.openFileStream(LexiconFilename.concat(".tmp2")));
      // Reusable pointer holder, refilled per lexicon entry.
      BitIndexPointer pin = new SimpleBitIndexPointer();
      while(lexiconStream.hasNext())
      {
        Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
        LexiconEntry value = lee.getValue();
        pin.readFields(dis);
        value.setPointer(pin);
        los.writeNextEntry(lee.getKey(), value);
      }
      // NOTE(review): `dis` is not closed in the visible code — presumably closed
      // below the excerpt; confirm, or wrap in try-with-resources.
      IndexUtil.close(lexiconStream);
      los.close();
View Full Code Here

    final DataOutputStream dos,
    TIntArrayList[][] tmpStorage,
    final int _processTerms)
    throws IOException
  {
    // NOTE(review): fragment — the method name and leading parameters are cut
    // off above this excerpt. Non-block variant of the posting writer: per term,
    // dumps buffered (docid, freq[, field freqs]) postings to the bit file and
    // writes a BitIndexPointer (start offset + entry count) to `dos`.
    BitIndexPointer p = new SimpleBitIndexPointer();
    //write to the inverted file. We should note that the lexicon
    //should be updated with the start bit and byte offset for this
    //set of postings.
    int frequency; long numTokens = 0;
    for (int j = 0; j < _processTerms; j++) {

     
      frequency = 0; //the term frequency
     
      // Rows: docids, freqs, then fieldCount per-field frequency arrays
      // (hence 2+fieldCount — no block rows in this variant).
      final int[][] tmpMatrix = new int[2+fieldCount][];
      for(int k=0;k<2+fieldCount;k++)
      {
        tmpMatrix[k] = tmpStorage[j][k].toNativeArray();
      }
      // Release the term's buffered postings as soon as they are copied out.
      tmpStorage[j] = null;
     
      final int[] tmpMatrix0 = tmpMatrix[0];
      final int[] tmpMatrix1 = tmpMatrix[1];
     
      // Record where this term's postings begin, then persist the pointer.
      p.setOffset(file.getByteOffset(), file.getBitOffset());
      p.setNumberOfEntries(tmpMatrix0.length);
      p.write(dos);

      //THIS IS ALWAYS AN ERROR
      /*
      if (tmpMatrix[0].length == 0)
      {
View Full Code Here

   
      if (! srcIndex1.getIndexProperty("index.meta.key-names", "docno").equals(srcIndex2.getIndexProperty("index.meta.key-names", "docno")))
      {
        throw new Error("Meta fields in source indices must match");
      }
      final BitIndexPointer emptyPointer = new SimpleBitIndexPointer();
     
       
      final int srcFieldCount1 = srcIndex1.getIntIndexProperty("index.direct.fields.count", 0);
      final int srcFieldCount2 = srcIndex1.getIntIndexProperty("index.direct.fields.count", 0);
      if (srcFieldCount1 != srcFieldCount2)
View Full Code Here

      // Prologue of a two-way lexicon merge: prime one entry from each input
      // stream; the merge loop (below this excerpt) advances whichever stream
      // holds the smaller term.
      String term1;
      String term2;

      int termId = 0;
     
      Pointer p = new SimpleBitIndexPointer();
   
      hasMore1 = lexInStream1.hasNext();
      hasMore2 = lexInStream2.hasNext();
      // NOTE(review): next() is called unconditionally even when hasNext()
      // returned false — an empty source lexicon would fail here (or return
      // null, depending on the iterator contract); confirm inputs are non-empty.
      Map.Entry<String,LexiconEntry> lee1 = lexInStream1.next();
      Map.Entry<String,LexiconEntry> lee2 = lexInStream2.next();
View Full Code Here

TOP

Related Classes of org.terrier.structures.SimpleBitIndexPointer$Factory

Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact: coftware [at] gmail.com.