//NOTE(review): this chunk is the body of a split-generation method (signature not visible here);
//it partitions a bit posting index structure into Hadoop InputSplits, each sized roughly to the
//filesystem block size of the underlying file, using a lookup structure to find entry boundaries.
HadoopUtility.loadTerrierJob(job);
//job configuration names the lookup structure (per-entry pointers) and the target posting structure
final String lookupStructureName = job.get(BITPOSTING_LOOKUP_STRUCTURE_KEY);
final String bitPostingStructureName = job.get(BITPOSTING_STRUCTURE_KEY);
//load the index with the non-retrieval profile, presumably to avoid loading retrieval-only structures -- TODO confirm
Index.setIndexLoadingProfileAsRetrieval(false);
final Index index = HadoopUtility.fromHConfiguration(job);
//the posting structure may be spread over several data files; default is a single file
final byte fileCount = Byte.parseByte(index.getIndexProperty("index." + bitPostingStructureName + ".data-files", "1"));
final Path bitPostingStructureFiles[] = new Path[fileCount];
final FileStatus[] fss = new FileStatus[fileCount];
//target size for each split, per file (taken from the filesystem block size)
final long[] bitPostingStructureFSBlockSizes = new long[fileCount];
//logger.info("Calculating splits of structure " + bitPostingStructureName);
FileSystem fs = FileSystem.get(job);
//resolve each data file's path, status and block size up front
for(byte i=0;i<fileCount;i++)
{
bitPostingStructureFiles[i] = new Path(BitPostingIndexInputStream.getFilename(index, bitPostingStructureName, fileCount, i));
fss[i] = fs.getFileStatus(bitPostingStructureFiles[i]);
bitPostingStructureFSBlockSizes[i] = getBlockSize(bitPostingStructureFiles[i], fss[i]);
//logger.info("File " + i + " approx splits=" + ((double)fss[i].getLen() /(double)bitPostingStructureFSBlockSizes[i]));
}
//this smells of a hack, because we dont have a strategy for naming various index structures streams
//prefer the "-entry" variant of the lookup stream if present, otherwise fall back to the plain name
final Iterator<? extends BitIndexPointer> offsetIterator =
index.hasIndexStructureInputStream(lookupStructureName+ "-entry")
? (Iterator<? extends BitIndexPointer>)index.getIndexStructureInputStream(lookupStructureName+ "-entry")
: (Iterator<? extends BitIndexPointer>)index.getIndexStructureInputStream(lookupStructureName);
if (offsetIterator == null)
throw new IOException("No such stream structure called " + lookupStructureName+ "-entry or "+lookupStructureName+" found in index");
final List<InputSplit> splitList = new ArrayList<InputSplit>();
//id (ordinal position in the lookup stream) of the entry currently being examined
int currentId = 0;
//size of the current split of each file
final long[] blockSize = new long[fileCount];
//location of the last split for each file
final long[] bitPostingStructureSplitEndOffsets = new long[fileCount];
//how many entries will be in this split, for each file
final int[] entriesInBlock = new int[fileCount];
//what is the starting id of the next entry split, for each file
final int[] firstEntryOfNextSplit = new int[fileCount];
//number of splits per file, for logging only
final int[] splitsPerFile = new int[fileCount];
//MAX_VALUE sentinel means "no entry seen yet for the in-progress split of this file"
Arrays.fill(firstEntryOfNextSplit, Integer.MAX_VALUE);
BitIndexPointer currentPointer = null;
//iterate through the lookup iterator
//split the target bit posting index structure into chunks of size bitPostingStructureFSBlockSize
while(offsetIterator.hasNext())
{
//ok, where is the next pointer to
currentPointer = offsetIterator.next();
//each pointer says which data file and at what byte offset this entry's postings start
final byte fileId = currentPointer.getFileNumber();
//what is the first entry of the next split of this file?
//(Math.min with the MAX_VALUE sentinel records the first id that lands in this split)
firstEntryOfNextSplit[fileId] = Math.min(currentId, firstEntryOfNextSplit[fileId]);
//this split will have one more entry
entriesInBlock[fileId]++;
//what is our current offset?
long offset = currentPointer.getOffset();
//System.err.println("Offset" + offset);
//if we made the split here, how big would it be?
blockSize[fileId] = offset - bitPostingStructureSplitEndOffsets[fileId];
//is this block is large enough
if (blockSize[fileId] > bitPostingStructureFSBlockSizes[fileId])
{
//yes, its big enough
//block will be from bitPostingStructureSplitEndOffsets[fileId] to offset, which is blockSize[fileId]
//ask HDFS which hosts hold this byte range, so the split can be scheduled locally
BlockLocation[] blkLocations = fs.getFileBlockLocations(
fss[fileId],
bitPostingStructureSplitEndOffsets[fileId],
blockSize[fileId]);
//NOTE(review): only blkLocations[0] is consulted; for a range spanning several HDFS blocks
//the hosts of the later blocks are ignored -- presumably acceptable as a locality hint
splitList.add(
new BitPostingIndexInputSplit(
bitPostingStructureFiles[fileId], //path
bitPostingStructureSplitEndOffsets[fileId], //start
blockSize[fileId], //length
blkLocations[0].getHosts(), //hosts
firstEntryOfNextSplit[fileId], //first entry in this split
entriesInBlock[fileId]) //number of entries in this split
);
//logger.info("File "+ fileId + " split " +(splitList.size()-1)
//	+ " "+ splitList.get(splitList.size() -1).toString());
//record another split for this file (for logging only)
splitsPerFile[fileId]++;
//update recording of last offset for this file
bitPostingStructureSplitEndOffsets[fileId] = offset;
//reset size of split for this file
blockSize[fileId] = 0;
//reset counter of entries in split of this file
entriesInBlock[fileId] = 0;
//reset the first offset of this split
firstEntryOfNextSplit[fileId] = Integer.MAX_VALUE;
}
//ids always increment
currentId++;
}
//find any files which have trailing blocks
//(entries seen since the last emitted split that never reached a full block size)
for(byte fileId=0;fileId<fileCount;fileId++)
{
if (entriesInBlock[fileId] == 0)
continue;
//if entries were counted, at least one id must have been recorded
assert(firstEntryOfNextSplit[fileId] != Integer.MAX_VALUE);
//block will be from bitPostingStructureSplitEndOffsets[fileId], with length blockSize[fileId]
//NOTE(review): blockSize here is measured up to the LAST entry's start offset, so the final
//entry's own payload appears to lie beyond the split's stated length -- presumably the reader
//treats length as a hint and reads entriesInBlock entries regardless; verify against the reader
BlockLocation[] blkLocations = fs.getFileBlockLocations(fss[fileId], bitPostingStructureSplitEndOffsets[fileId], blockSize[fileId]);
splitList.add(
new BitPostingIndexInputSplit(
bitPostingStructureFiles[fileId], //path of file for split
bitPostingStructureSplitEndOffsets[fileId], //start offset of this split
blockSize[fileId], //size of this split
blkLocations[0].getHosts(), //hosts for this split
firstEntryOfNextSplit[fileId], //first entry id for this split
entriesInBlock[fileId]) //number of entries in this split
);
//logger.info("File "+ fileId + " trailing split "+ (splitList.size() -1)
//	+ " " + splitList.get(splitList.size() -1).toString());
//record another split for this file (for logging only)
splitsPerFile[fileId]++;
}
//logger.info("Split "+ bitPostingStructureName+ " (of "+currentId+" entries) into " + splitList.size() + " splits");
if (fileCount > 1)
{
//logger.info("Multiple files of " + bitPostingStructureName + " were split as follows: " + ArrayUtils.join(splitsPerFile, ","));
}
assert(splitList.size() > 0);
//NOTE(review): index.close() is not in a finally block, so the index leaks if an IOException
//is thrown above -- consider try/finally; left unchanged here
index.close();
return splitList.toArray(new InputSplit[splitList.size()]);
}