/*
* Ivory: A Hadoop toolkit for Web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package ivory.core.driver;
import ivory.core.RetrievalEnvironment;
import ivory.core.preprocess.BuildDictionary;
import ivory.core.preprocess.BuildIntDocVectors;
import ivory.core.preprocess.BuildTargetLangWeightedIntDocVectors;
import ivory.core.preprocess.BuildTermDocVectors;
import ivory.core.preprocess.BuildTranslatedTermDocVectors;
import ivory.core.preprocess.BuildWeightedIntDocVectors;
import ivory.core.preprocess.BuildWeightedTermDocVectors;
import ivory.core.preprocess.ComputeGlobalTermStatistics;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import edu.umd.cloud9.collection.wikipedia.BuildWikipediaDocnoMapping;
import edu.umd.cloud9.collection.wikipedia.RepackWikipedia;
import edu.umd.hooka.Vocab;
import edu.umd.hooka.alignment.HadoopAlign;
/**
* Driver class that preprocesses a Wikipedia collection in any language.
*
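* <p>Example invocation for the mono-lingual case (the jar name and paths below are
* illustrative placeholders, not values prescribed by the toolkit):</p>
*
* <pre>
* hadoop jar ivory.jar ivory.core.driver.PreprocessWikipedia \
*   /user/me/index/enwiki \
*   /user/me/collections/enwiki-pages-articles.xml \
*   /user/me/collections/enwiki.block \
*   ivory.tokenize.OpenNLPTokenizer
* </pre>
*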
* @author ferhanture
*
*/
public class PreprocessWikipedia extends Configured implements Tool {
private static final Logger LOG = Logger.getLogger(PreprocessWikipedia.class);
/*
 * Parameters defined here:
 */
static final int MinDF = 2, MinNumTermsPerArticle = 5, TermIndexWindow = 8;
static final boolean IsNormalized = true;
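// Modes are distinguished by the number of command-line arguments: run() infers the mode from args.length.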
static final int MONO_LINGUAL = 4, CROSS_LINGUAL_E = 7, CROSS_LINGUAL_F = 12;
private static int printUsage() {
System.out.println("\nThis program can be run in three different \"modes\":\n=====================\nInput: English Wikipedia collection\nOutput: English weighted document vectors" +
"\nusage: [index-path] [raw-path] [compressed-path] [tokenizer-class]" +
"\n\nInput: English side of cross-lingual Wikipedia collection\nOutput: English weighted document vectors (comparable with the document vectors generated from non-English side)" +
"\nusage: [index-path] [raw-path] [compressed-path] [tokenizer-class] [collection-lang] [tokenizer-model] [collection-vocab]" +
"\n\nInput: Non-English side of cross-lingual Wikipedia collection\nOutput: English weighted document vectors (comparable with the document vectors generated from English side)" +
"\nusage: [index-path] [raw-path] [compressed-path] [tokenizer-class] [collection-lang] [tokenizer-model] [src-vocab_f] [trg-vocab_e] [prob-table_f-->e] [src-vocab_e] [trg-vocab_f] [prob-table_e-->f])");
return -1;
}
/**
* Runs this tool.
*/
public int run(String[] args) throws Exception {
int mode = args.length;
if (mode != MONO_LINGUAL && mode != CROSS_LINGUAL_E && mode != CROSS_LINGUAL_F){
printUsage();
return -1;
}
String indexRootPath = args[0];
String rawCollection = args[1]; //"/shared/Wikipedia/raw/dewiki-20100117-pages-articles.xml";
String seqCollection = args[2]; //"/umd-lin/fture/pwsim/de-wikipedia/compressed.block/de-20100117";
String tokenizerClass = args[3];
Configuration conf = new Configuration();
String collectionLang = null, tokenizerModel = null, collectionVocab = null;
String fVocab_f2e = null, eVocab_f2e = null, fVocab_e2f = null, eVocab_e2f = null, ttable_f2e = null, ttable_e2f = null;
if(mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F){ // CROSS-LINGUAL CASE
collectionLang = args[4];
tokenizerModel = args[5];
collectionVocab = args[6];
conf.set("Ivory.Lang", collectionLang);
conf.set("Ivory.TokenizerModel", tokenizerModel);
conf.set("Ivory.CollectionVocab", collectionVocab);
conf.set("Ivory.FinalVocab", collectionVocab);
if(mode == CROSS_LINGUAL_F){ // non-English side, needs to be translated
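// The six additional arguments supply the f-->e vocabulary pair and translation table, followed by the e-->f pair and table (see printUsage()).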
fVocab_f2e = args[6]; // same as collection vocab
eVocab_f2e = args[7];
ttable_f2e = args[8];
eVocab_e2f = args[9];
fVocab_e2f = args[10];
ttable_e2f = args[11];
conf.set("Ivory.F_Vocab_F2E", fVocab_f2e);
conf.set("Ivory.E_Vocab_F2E", eVocab_f2e);
conf.set("Ivory.TTable_F2E", ttable_f2e);
conf.set("Ivory.E_Vocab_E2F", eVocab_e2f);
conf.set("Ivory.F_Vocab_E2F", fVocab_e2f);
conf.set("Ivory.TTable_E2F", ttable_e2f);
conf.set("Ivory.FinalVocab", eVocab_e2f);
}
}
int numMappers = 100;
int numReducers = 100;
LOG.info("Tool name: WikipediaDriver");
LOG.info(" - Index path: " + indexRootPath);
LOG.info(" - Raw collection path: " + rawCollection);
LOG.info(" - Compressed collection path: " + seqCollection);
LOG.info(" - Tokenizer class: " + tokenizerClass);
LOG.info(" - Minimum # terms per article : " + MinNumTermsPerArticle);
if(mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F){
LOG.info("Cross-lingual collection : Preprocessing "+collectionLang+" side.");
LOG.info(" - Collection vocab file: " + collectionVocab);
LOG.info(" - Tokenizer model: " + tokenizerModel);
if(mode == CROSS_LINGUAL_F){
LOG.info(" - TTable file "+collectionLang+" --> English : " + ttable_f2e);
LOG.info(" - Source vocab file: " + fVocab_f2e);
LOG.info(" - Target vocab file: " + eVocab_f2e);
LOG.info(" - TTable file "+"English --> "+collectionLang+" : " + ttable_e2f);
LOG.info(" - Source vocab file: " + fVocab_f2e);
LOG.info(" - Target vocab file: " + eVocab_f2e);
}
}
LOG.info("Launching with " + numMappers + " mappers, " + numReducers + " reducers...");
FileSystem fs = FileSystem.get(conf);
Path p = new Path(indexRootPath);
if (!fs.exists(p)) {
LOG.info("Index path doesn't exist, creating...");
fs.mkdirs(p);
}
RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);
// Build docno mapping from raw collection
Path mappingFile = env.getDocnoMappingData();
if (!fs.exists(mappingFile)) {
LOG.info(mappingFile + " doesn't exist, creating...");
String[] arr = new String[] { rawCollection, indexRootPath + "/wiki-docid-tmp",
mappingFile.toString(), Integer.toString(numMappers) };
BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
tool.setConf(conf);
tool.run(arr);
fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
} else {
LOG.info(mappingFile + " already exists.");
}
// Repack Wikipedia into sequential compressed block
p = new Path(seqCollection);
if (!fs.exists(p)) {
LOG.info(seqCollection + " doesn't exist, creating...");
String[] arr = new String[] { rawCollection, seqCollection, mappingFile.toString(), "block"};
RepackWikipedia tool = new RepackWikipedia();
tool.setConf(conf);
tool.run(arr);
}
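// Common job configuration shared by all of the preprocessing steps below.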
conf.set("Ivory.CollectionName", "Wikipedia-"+collectionLang);
conf.setInt("Ivory.NumMapTasks", numMappers);
conf.setInt("Ivory.NumReduceTasks", numReducers);
conf.set("Ivory.CollectionPath", seqCollection);
conf.set("Ivory.IndexPath", indexRootPath);
conf.set("Ivory.InputFormat", "org.apache.hadoop.mapred.SequenceFileInputFormat");
conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.wikipedia.WikipediaDocnoMapping");
conf.set("Ivory.Tokenizer", tokenizerClass); //"ivory.tokenize.OpenNLPTokenizer"
conf.setInt("Ivory.MinDf", MinDF);
conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
// Build term-doc vectors from the document collection, filtering out terms that are not included in Ivory.SrcVocab.
long startTime = System.currentTimeMillis();
long preprocessStartTime = System.currentTimeMillis();
LOG.info("Building term doc vectors...");
BuildTermDocVectors termDocVectorsTool = new BuildTermDocVectors(conf);
termDocVectorsTool.run();
LOG.info("Job finished in "+(System.currentTimeMillis()-startTime)/1000.0+" seconds");
// Get CF and DF counts
startTime = System.currentTimeMillis();
LOG.info("Counting terms...");
ComputeGlobalTermStatistics termCountWithDfAndCfTool = new ComputeGlobalTermStatistics(conf);
termCountWithDfAndCfTool.run();
LOG.info("TermCount = "+env.readCollectionTermCount()+"\nJob finished in "+(System.currentTimeMillis()-startTime)/1000.0+" seconds");
// Build a map from terms to sequentially generated integer term ids
startTime = System.currentTimeMillis();
conf.setInt("Ivory.TermIndexWindow", TermIndexWindow);
LOG.info("Building term-to-integer id mapping...");
BuildDictionary termIDsDfCfTool = new BuildDictionary(conf);
termIDsDfCfTool.run();
LOG.info("Job finished in "+(System.currentTimeMillis()-startTime)/1000.0+" seconds");
// Compute term weights, and output weighted term doc vectors
startTime = System.currentTimeMillis();
LOG.info("Building weighted term doc vectors...");
conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);
conf.setBoolean("Ivory.Normalize", false);
if (mode == CROSS_LINGUAL_F) {
// translate term doc vectors into English
BuildTranslatedTermDocVectors translatedTermVectorsTool = new BuildTranslatedTermDocVectors(conf);
translatedTermVectorsTool.run();
} else {
// get weighted term doc vectors
BuildWeightedTermDocVectors weightedTermVectorsTool = new BuildWeightedTermDocVectors(conf);
weightedTermVectorsTool.run();
}
LOG.info("Job finished in "+(System.currentTimeMillis()-startTime)/1000.0+" seconds");
// normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency
startTime = System.currentTimeMillis();
LOG.info("Building weighted integer doc vectors...");
conf.setBoolean("Ivory.Normalize", IsNormalized);
if(mode == MONO_LINGUAL){
new BuildIntDocVectors(conf).run();
new BuildWeightedIntDocVectors(conf).run();
LOG.info("Job BuildWeightedIntDocVectors finished in "+(System.currentTimeMillis()-startTime)/1000.0+" seconds");
} else {
BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool = new BuildTargetLangWeightedIntDocVectors(conf);
int finalNumDocs = weightedIntVectorsTool.run();
LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in "+(System.currentTimeMillis()-startTime)/1000.0+" seconds");
if (finalNumDocs > 0) {
LOG.info("Changed doc count from "+env.readCollectionDocumentCount()+" to "+finalNumDocs);
env.writeCollectionDocumentCount(finalNumDocs);
}
// Set Property.CollectionTermCount to the size of the target vocabulary, since all documents
// are translated into that vocabulary. This property is read by WriteRandomVectors via RunComputeSignatures.
Vocab engVocabH = null;
try {
engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
} catch (IOException e) {
LOG.error("Error loading final vocabulary from " + conf.get("Ivory.FinalVocab"), e);
throw e;
}
LOG.info("Changed term count from " + env.readCollectionTermCount() + " to " + engVocabH.size());
env.writeCollectionTermCount(engVocabH.size());
}
LOG.info("Preprocessing job finished in "+(System.currentTimeMillis()-preprocessStartTime)/1000.0+" seconds");
return 0;
}
/**
* Dispatches command-line arguments to the tool via the
* <code>ToolRunner</code>.
*/
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new PreprocessWikipedia(), args);
System.exit(res);
}
}