Path p = new Path(indexRootPath);
if (!fs.exists(p)) {
LOG.info("Index path doesn't exist, creating...");
fs.mkdirs(p);
}
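// Set up the retrieval environment rooted at the index directory.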
RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);
// Build docno mapping from raw collection
Path mappingFile = env.getDocnoMappingData();
if (!fs.exists(mappingFile)) {
LOG.info(mappingFile + " doesn't exist, creating...");
String[] arr = new String[] { rawCollection, indexRootPath + "/wiki-docid-tmp",
mappingFile.toString(), Integer.toString(numMappers) };
BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
tool.setConf(conf);
tool.run(arr);
fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
} else {
LOG.info(mappingFile + " already exists");
}
// Repack Wikipedia into sequential compressed block
p = new Path(seqCollection);
if (!fs.exists(p)) {
LOG.info(seqCollection + " doesn't exist, creating...");
String[] arr = new String[] { rawCollection, seqCollection, mappingFile.toString(), "block"};
RepackWikipedia tool = new RepackWikipedia();
tool.setConf(conf);
tool.run(arr);
}
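// Common job configuration shared by the preprocessing steps below.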
conf.set("Ivory.CollectionName", "Wikipedia-"+collectionLang);
conf.setInt("Ivory.NumMapTasks", numMappers);
conf.setInt("Ivory.NumReduceTasks", numReducers);
conf.set("Ivory.CollectionPath", seqCollection);
conf.set("Ivory.IndexPath", indexRootPath);
conf.set("Ivory.InputFormat", "org.apache.hadoop.mapred.SequenceFileInputFormat");
conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.wikipedia.WikipediaDocnoMapping");
conf.set("Ivory.Tokenizer", tokenizerClass); //"ivory.tokenize.OpenNLPTokenizer"
conf.setInt("Ivory.MinDf", MinDF);
conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
// Builds term doc vectors from document collection, and filters the terms that are not included in Ivory.SrcVocab
long startTime = System.currentTimeMillis();
long preprocessStartTime = System.currentTimeMillis();
LOG.info("Building term doc vectors...");
BuildTermDocVectors termDocVectorsTool = new BuildTermDocVectors(conf);
termDocVectorsTool.run();
LOG.info("Job finished in "+(System.currentTimeMillis()-startTime)/1000.0+" seconds");
// Get CF and DF counts
startTime = System.currentTimeMillis();
LOG.info("Counting terms...");
GetTermCount termCountWithDfAndCfTool = new GetTermCount(conf);
termCountWithDfAndCfTool.run();
LOG.info("TermCount = "+env.readCollectionTermCount()+"\nJob finished in "+(System.currentTimeMillis()-startTime)/1000.0+" seconds");
// Build a map from terms to sequentially generated integer term ids
startTime = System.currentTimeMillis();
conf.setInt("Ivory.TermIndexWindow", TermIndexWindow);
LOG.info("Building term-to-integer id mapping...");
BuildTermIdMap termIDsDfCfTool = new BuildTermIdMap(conf);
termIDsDfCfTool.run();
LOG.info("Job finished in "+(System.currentTimeMillis()-startTime)/1000.0+" seconds");
// Compute term weights, and output weighted term doc vectors
startTime = System.currentTimeMillis();
LOG.info("Building weighted term doc vectors...");
conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);
conf.setBoolean("Ivory.Normalize", false);
if (mode == CROSS_LINGUAL_F) {
// Translate term doc vectors into English.
BuildTranslatedTermDocVectors weightedTermVectorsTool = new BuildTranslatedTermDocVectors(conf);
weightedTermVectorsTool.run();
} else {
// Compute weighted term doc vectors directly (no translation).
BuildWeightedTermDocVectors weightedTermVectorsTool = new BuildWeightedTermDocVectors(conf);
weightedTermVectorsTool.run();
}
LOG.info("Job finished in "+(System.currentTimeMillis()-startTime)/1000.0+" seconds");
// normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency
startTime = System.currentTimeMillis();
LOG.info("Building weighted integer doc vectors...");
conf.setBoolean("Ivory.Normalize", IsNormalized);
if (mode == MONO_LINGUAL) {
new BuildIntDocVectors(conf).run();
new BuildWeightedIntDocVectors(conf).run();
LOG.info("Job BuildWeightedIntDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
} else {
BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool = new BuildTargetLangWeightedIntDocVectors(conf);
int finalNumDocs = weightedIntVectorsTool.run();
LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
if (finalNumDocs > 0) {
LOG.info("Changed doc count from " + env.readCollectionDocumentCount() + " to " + finalNumDocs);
env.writeCollectionDocumentCount(finalNumDocs);
}
// Set CollectionTermCount to the size of the target vocabulary, since all docs are translated into that vocabulary. This property is read by WriteRandomVectors via RunComputeSignatures.
Vocab engVocabH = null;
try {
engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
} catch (IOException e) {
e.printStackTrace();
}
if (engVocabH != null) {
LOG.info("Changed term count from " + env.readCollectionTermCount() + " to " + engVocabH.size());
env.writeCollectionTermCount(engVocabH.size());
}
}
LOG.info("Preprocessing job finished in "+(System.currentTimeMillis()-preprocessStartTime)/1000.0+" seconds");
return 0;