Configuration conf = getConf();
FileSystem fs = FileSystem.get(conf);
String indexPath = conf.get(Constants.IndexPath);

// Check the index path FIRST: the RetrievalEnvironment reads collection
// metadata from under this directory, so touching it before this guard
// would fail with an obscure error instead of skipping cleanly.
if (!fs.exists(new Path(indexPath))) {
  LOG.info("index path doesn't exist: skipping!");
  return 0;
}

RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
int reduceTasks = 10;
String collectionName = env.readCollectionName();
String termDocVectorsPath = env.getTermDocVectorsDirectory();
String termDfCfPath = env.getTermDfCfDirectory();

LOG.info("PowerTool: " + ComputeGlobalTermStatistics.class.getCanonicalName());
LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));

// Output already present: treat as done (idempotent re-runs).
Path outputPath = new Path(termDfCfPath);
if (fs.exists(outputPath)) {
  LOG.info("TermDfCf directory exists: skipping!");
  return 0;
}

Job job = new Job(getConf(), ComputeGlobalTermStatistics.class.getSimpleName() + ":"
    + collectionName);
job.setJarByClass(ComputeGlobalTermStatistics.class);
job.setNumReduceTasks(reduceTasks);

FileInputFormat.setInputPaths(job, new Path(termDocVectorsPath));
FileOutputFormat.setOutputPath(job, outputPath);

job.setInputFormatClass(SequenceFileInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);

job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(PairOfIntLong.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(PairOfIntLong.class);

job.setMapperClass(MyMapper.class);
job.setCombinerClass(MyCombiner.class);
job.setReducerClass(MyReducer.class);

long startTime = System.currentTimeMillis();
// waitForCompletion returns false on job failure; don't ignore it, or we
// would persist statistics computed from a failed (possibly empty) job.
boolean success = job.waitForCompletion(true);
LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

if (!success) {
  LOG.error("Job failed: collection statistics not written.");
  return 1;
}

Counters counters = job.getCounters();
// Write out the term count and collection length. NOTE(review): the term
// count is presumably NOT the number of unique terms in the raw collection,
// because postings for non-English terms are discarded, or dropped as a
// result of the df cut — confirm against MyMapper/MyReducer.
env.writeCollectionTermCount((int) counters.findCounter(Statistics.Terms).getValue());
env.writeCollectionLength(counters.findCounter(Statistics.SumOfDocLengths).getValue());
return 0;
}