Examples of ivory.core.RetrievalEnvironment

ivory.core.RetrievalEnvironment
@author Don Metzler @author Jimmy Lin

        fs = FileSystem.get(conf);
      } catch (IOException e) {
        throw new RuntimeException("Error opening the FileSystem!");
      }
      
      RetrievalEnvironment env;
      try {
        env = new RetrievalEnvironment(conf.get(Constants.IndexPath), fs);
      } catch (IOException e) {
        throw new RuntimeException("Unable to create RetrievalEnvironment!");
      }


      String termsFile = env.getIndexTermsData();
      String idsFile = env.getIndexTermIdsData();
      String idToTermFile = env.getIndexTermIdMappingData();


      String dfByTermFile = env.getDfByTermData();
      String cfByTermFile = env.getCfByTermData();
      String dfByIntFile = env.getDfByIntData();
      String cfByIntFile = env.getCfByIntData();


      nTerms = conf.getInt(Constants.CollectionTermCount, 0);
      window = conf.getInt(Constants.TermIndexWindow, 8);


      seqNums = new int[nTerms];

View Full Code Here

    if (!fs.exists(p)) {
      LOG.info("index directory doesn't exist, creating...");
      fs.mkdirs(p);
    }


    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);


    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-number integer). If it doesn't exist create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();


    if (!fs.exists(mappingFile)) {
      LOG.info("docno-mapping.dat doesn't exist, creating...");
      String[] arr = new String[] { collection, mappingDir.toString(),
          mappingFile.toString(), "100" };

View Full Code Here

    LOG.info(" - Index path: " + indexPath);
    LOG.info(" - Collections: " + collection);


    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);


    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
      LOG.error("Error: index path doesn't exist!");
      return 0;
    }


    if (!fs.exists(env.getDocnoMappingData())) {
      LOG.error("Error: docno mapping data doesn't exist!");
      return 0;
    }


    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);


    conf.set(Constants.CollectionName, "ClueWeb:English");
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexPath);
    conf.set(Constants.InputFormat, org.apache.hadoop.mapred.SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.Tokenizer, ivory.core.tokenize.GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, edu.umd.cloud9.collection.clue.ClueWarcDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());


    conf.setInt(Constants.DocnoOffset, 0);
    conf.setInt(Constants.MinDf, 50);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);

View Full Code Here

    LOG.info(" - Index path: " + indexPath);
    LOG.info(" - segement number: " + segment);


    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);


    Path p = new Path(indexPath);
    if (!fs.exists(p)) {
      LOG.error("Error: index path doesn't exist!");
      return 0;
    }


    if (!fs.exists(env.getDocnoMappingData())) {
      LOG.error("Error: docno mapping data doesn't exist!");
      return 0;
    }


    conf.set(Constants.CollectionName, "ClueWeb:English:Segment" + segment);
    conf.set(Constants.CollectionPath, collection);
    conf.set(Constants.IndexPath, indexPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.Tokenizer, GalagoTokenizer.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, ClueWarcDocnoMapping.class.getCanonicalName());
    conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());


    conf.setInt(Constants.DocnoOffset, DocnoOffsets[segment]);
    conf.setInt(Constants.MinDf, 10);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.TermIndexWindow, 8);

View Full Code Here

    if (!fs.exists(p)) {
      LOG.info("index directory doesn't exist, creating...");
      fs.mkdirs(p);
    }


    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);


    // Look for the docno mapping, which maps from docid (String) to docno
    // (sequentially-number integer). If it doesn't exist create it.
    Path mappingFile = env.getDocnoMappingData();
    Path mappingDir = env.getDocnoMappingDirectory();


    if (!fs.exists(mappingFile)) {
      LOG.info("docno-mapping.dat doesn't exist, creating...");
      String[] arr = new String[] { collection, mappingDir.toString(),
          mappingFile.toString(), "100" };

View Full Code Here

    Path p = new Path(indexRootPath);
    if (!fs.exists(p)) {
      LOG.info("Index path doesn't exist, creating...");
      fs.mkdirs(p);
    }
    RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);


    // Build docno mapping from raw collection
    Path mappingFile = env.getDocnoMappingData();
    if (!fs.exists(mappingFile)) {
      LOG.info(mappingFile + " doesn't exist, creating...");
      String[] arr = new String[] {
          "-input=" + rawCollection,
          "-output_path=" + indexRootPath + "/wiki-docid-tmp",
          "-output_file=" + mappingFile.toString(),
          "-wiki_language=" + collectionLang };
      LOG.info("Running BuildWikipediaDocnoMapping with args " + Arrays.toString(arr));


      BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
      tool.setConf(conf);
      tool.run(arr);


      fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
    } else {
      LOG.info("Docno mapping already exists at: " + mappingFile);
    }


    // Repack Wikipedia into sequential compressed block
    if (!fs.exists(new Path(seqCollection + "/part-00000"))) {
      LOG.info(seqCollection + " doesn't exist, creating...");
      String[] arr = new String[] { "-input=" + rawCollection,
          "-output=" + seqCollection,
          "-mapping_file=" + mappingFile.toString(),
          "-compression_type=block",
          "-wiki_language=" + collectionLang };
      LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));


      RepackWikipedia tool = new RepackWikipedia();
      tool.setConf(conf);
      tool.run(arr);
    } else {
      LOG.info("Repacked collection already exists at: " + seqCollection);      
    }


    conf.set(Constants.CollectionName, "Wikipedia-"+collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
    conf.set(Constants.IndexPath, indexRootPath);
    conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
    conf.set(Constants.DocnoMappingClass, WikipediaDocnoMapping.class.getCanonicalName());
    conf.set(Constants.Tokenizer, tokenizerClass);      //"ivory.tokenize.OpenNLPTokenizer"
    conf.setInt(Constants.MinDf, MinDF);
    conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
    conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
    conf.setInt(Constants.TermIndexWindow, TermIndexWindow);


    // Builds term doc vectors from document collection, and filters the terms that are not included
    // in Ivory.SrcVocab.
    long startTime = System.currentTimeMillis();
    long preprocessStartTime = System.currentTimeMillis();
    LOG.info("Building term doc vectors...");
    int exitCode = new BuildTermDocVectors(conf).run();
    if (exitCode >= 0) {
      LOG.info("Job BuildTermDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    }else {
      LOG.info("Error: BuildTermDocVectors. Terminating...");
      return -1;
    }


    // Get CF and DF counts.
    startTime = System.currentTimeMillis();
    LOG.info("Counting terms...");
    exitCode = new ComputeGlobalTermStatistics(conf).run();
    LOG.info("TermCount = " + env.readCollectionTermCount());
    if (exitCode >= 0) {
      LOG.info("Job ComputeGlobalTermStatistics finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    }else {
      LOG.info("Error: ComputeGlobalTermStatistics. Terminating...");
      return -1;
    }
    // Build a map from terms to sequentially generated integer term ids.
    startTime = System.currentTimeMillis();
    LOG.info("Building term-to-integer id mapping...");
    exitCode = new BuildDictionary(conf).run();
    if (exitCode >= 0) {
      LOG.info("Job BuildDictionary finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    }else{
      LOG.info("Error: BuildDictionary. Terminating...");
      return -1;
    }


    // Compute term weights, and output weighted term doc vectors.
    LOG.info("Building weighted term doc vectors...");
    startTime = System.currentTimeMillis();


    conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    conf.setInt("Ivory.MinNumTerms",MinNumTermsPerArticle);


    if (mode == CROSS_LINGUAL_F) {
      // Translate term doc vectors into English.
      exitCode = new BuildTranslatedTermDocVectors(conf).run();
    } else {
      // Build weighted term doc vectors.
      exitCode = new BuildWeightedTermDocVectors(conf).run();
    }
    if (exitCode >= 0) {
      LOG.info("Job BuildTranslated/WeightedTermDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    }else {
      LOG.info("Error: BuildTranslated/WeightedTermDocVectors. Terminating...");
      return -1;
    }


    // normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency
    startTime = System.currentTimeMillis();
    LOG.info("Building weighted integer doc vectors...");
    conf.setBoolean("Ivory.Normalize", IsNormalized);
    if (mode == MONO_LINGUAL) {
      exitCode = new BuildIntDocVectors(conf).run();
      exitCode = new BuildWeightedIntDocVectors(conf).run();
      if (exitCode >= 0) {
        LOG.info("Job BuildWeightedIntDocVectors finished in "+(System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
      }else {
        LOG.info("Error: BuildWeightedIntDocVectors. Terminating...");
        return -1;
      }
    } else {
      BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool =
        new BuildTargetLangWeightedIntDocVectors(conf);


      int finalNumDocs = weightedIntVectorsTool.run();


      LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in " +
          (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
      if (finalNumDocs > 0) {
        LOG.info("Changed doc count: " + env.readCollectionDocumentCount() +" => " + finalNumDocs);
        env.writeCollectionDocumentCount(finalNumDocs);
      }else {
        LOG.info("No document output! Terminating...");
        return -1;
      }
      // set Property.CollectionTermCount to the size of the target vocab. since all docs are translated into that vocab. This property is read by WriteRandomVectors via RunComputeSignatures.
      Vocab engVocabH = null;
      try {
        engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
      } catch (IOException e) {
        e.printStackTrace();
      }  
      LOG.info("Changed term count: " + env.readCollectionTermCount() + " => " + engVocabH.size());
      env.writeCollectionTermCount(engVocabH.size());
    }


    LOG.info("Preprocessing job finished in " + (System.currentTimeMillis() - preprocessStartTime) / 1000.0 + " seconds");


    return 0;

View Full Code Here

    String documentVectorClass = options.getOptionValue(OptionManager.DOCUMENT_VECTOR_CLASS);
    String outputPath = options.getOptionValue(OptionManager.OUTPUT_PATH);
    String qrelPath = options.getOptionValue(OptionManager.JUDGMENT_PATH);


    FileSystem fs = FileSystem.get(new Configuration());
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    env.initialize(true);


    //Parse queries, judgemnts and features
    HMapIV<int[]> qrels = QrelUtility.parseQrelsFromTabDelimited(qrelPath);


    FSDataOutputStream output = fs.create(new Path(outputPath));
    Set<Integer> docidHistory = Sets.newHashSet();


    //Evaluate queries and/or write the results to an output file
    for(int qid: qrels.keySet()) {
      for(int docid: qrels.get(qid)) {
        if(!docidHistory.contains(docid)) {
          docidHistory.add(docid);


          IntDocVector vector = env.documentVectors(new int[]{docid})[0];
          output.writeInt(docid);
          DocumentVectorUtility.newInstance(documentVectorClass, vector).write(output);
        }
      }
      LOGGER.info("Compressed query " + qid);

View Full Code Here

    String qrelPath = options.getOptionValue(OptionManager.JUDGMENT_PATH);
    String featurePath = options.getOptionValue(OptionManager.FEATURE_PATH);
    boolean writeOutput = options.foundOption(OptionManager.OUTPUT_PATH);


    FileSystem fs = FileSystem.get(new Configuration());
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    env.initialize(true);


    DocumentVectorSlidingWindow generator = new DocumentVectorSlidingWindow(env, fs);


    //Parse queries, judgemnts and features
    HMapIV<String> parsedQueries = QueryUtility.loadQueries(queryPath);

View Full Code Here

    String qrelPath = options.getOptionValue(OptionManager.JUDGMENT_PATH);
    String featurePath = options.getOptionValue(OptionManager.FEATURE_PATH);
    boolean writeOutput = options.foundOption(OptionManager.OUTPUT_PATH);


    FileSystem fs = FileSystem.get(new Configuration());
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    env.initialize(true);


    DocumentVectorOnTheFlyIndexing generator = new DocumentVectorOnTheFlyIndexing(env, fs);


    //Parse queries, judgemnts and features
    HMapIV<String> parsedQueries = QueryUtility.loadQueries(queryPath);

View Full Code Here

    if(options.foundOption(OptionManager.HITS)) {
      hits = Integer.parseInt(options.getOptionValue(OptionManager.HITS));
    }


    FileSystem fs = FileSystem.get(new Configuration());
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    env.initialize(true);


    RankAndFeaturesSmallAdaptive generator = new RankAndFeaturesSmallAdaptive(env, fs);


    //Parse queries and find integer codes for the query terms.
    HMapIV<String> parsedQueries = QueryUtility.loadQueries(queryPath);

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of ivory.core.RetrievalEnvironment

edu.umd.cloud9.collection.DocnoMapping

ivory.app.PreprocessClueWebEnglish

ivory.app.PreprocessClueWebEnglishMultipleSegments

ivory.app.PreprocessCollection

ivory.app.PreprocessMedline

ivory.app.PreprocessTextCollection

ivory.app.PreprocessTrecForeign

ivory.app.PreprocessWikipedia

ivory.bloomir.preprocessing.GenerateBloomFilters

ivory.bloomir.preprocessing.GenerateCompressedPostings

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.