Package ivory.integration.clir

Source Code of ivory.integration.clir.VerifyNtcirChinesePositionalIndexIP

package ivory.integration.clir;

import static org.junit.Assert.assertTrue;
import ivory.core.eval.Qrels;
import ivory.core.tokenize.StanfordChineseTokenizer;
import ivory.integration.IntegrationUtils;
import ivory.sqe.retrieval.QueryEngine;
import ivory.sqe.retrieval.RunQueryEngine;
import java.util.List;
import junit.framework.JUnit4TestAdapter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Test;
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;

public class VerifyNtcirChinesePositionalIndexIP {

  private Path collectionPath = new Path("/shared/collections/clir/ntcir/gigaword-xin.2002-06.zh-cleaned.xml");
  private String index = this.getClass().getCanonicalName() + "-index";
  private static String PATH = "en-zh.ntcir8";
  private static String LANGUAGE = "zh";
  private static String MTMODEL = "cdec";
  private static int numTopics = 100;

  @Test
  public void runBuildIndex() throws Exception {
    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    assertTrue(fs.exists(collectionPath));
   
    fs.delete(new Path(index), true);
   
    List<String> jars = Lists.newArrayList();
    jars.add(IntegrationUtils.getJar("lib", "cloud9"));
    jars.add(IntegrationUtils.getJar("lib", "guava"));
    jars.add(IntegrationUtils.getJar("lib", "dsiutils"));
    jars.add(IntegrationUtils.getJar("lib", "fastutil"));
    jars.add(IntegrationUtils.getJar("lib", "jsap"));
    jars.add(IntegrationUtils.getJar("lib", "sux4j"));
    jars.add(IntegrationUtils.getJar("lib", "commons-collections"));
    jars.add(IntegrationUtils.getJar("lib", "lucene-analyzers"));
    jars.add(IntegrationUtils.getJar("lib", "lucene-core"));
    jars.add(IntegrationUtils.getJar("lib", "tools"));
    jars.add(IntegrationUtils.getJar("lib", "maxent"));
    jars.add(IntegrationUtils.getJar("lib", "commons-lang"));
    jars.add(IntegrationUtils.getJar("lib", "commons-cli"));
    jars.add(IntegrationUtils.getJar("lib", "bliki-core"));
    jars.add(IntegrationUtils.getJar("lib", "stanford-chinese-segmenter"));
    jars.add(IntegrationUtils.getJar("dist", "ivory"));

    String libjars = String.format("-libjars=%s", Joiner.on(",").join(jars));

    // Done with indexing, now do retrieval run.
    fs.copyFromLocalFile(false, true, new Path("data/vocab/vocab.en-" + LANGUAGE + ".en"),
        new Path(index + "/vocab.en-" + LANGUAGE + ".en"));
    fs.copyFromLocalFile(false, true, new Path("data/vocab/vocab.en-" + LANGUAGE + "." + LANGUAGE + ""),
        new Path(index + "/vocab.en-" + LANGUAGE + "." + LANGUAGE + ""));
    fs.copyFromLocalFile(false, true, new Path("data/vocab/ttable.en-" + LANGUAGE + ""),
        new Path(index + "/ttable.en-" + LANGUAGE + ""));
    fs.copyFromLocalFile(false, true, new Path("data/tokenizer/" + LANGUAGE + "-token.bin"),
        new Path(index + "/" + LANGUAGE + "-token.bin"));
    fs.copyFromLocalFile(false, true, new Path("data/tokenizer/en-token.bin"),
        new Path(index + "/en-token.bin"));
    fs.copyFromLocalFile(false, true, new Path("data/tokenizer/" + LANGUAGE + ".stop.stemmed"),
        new Path(index + "/" + LANGUAGE + ".stop.stemmed"));
    fs.copyFromLocalFile(false, true, new Path("data/tokenizer/en.stop.stemmed"),
        new Path(index + "/en.stop.stemmed"));
    for (int i = 0; i < numTopics; i++) {
      fs.copyFromLocalFile(false, true, new Path("data/" + PATH + "/" + MTMODEL + ".grammar/grammar." + i),
          new Path(index + "/grammar." + i));
    }
    fs.copyFromLocalFile(false, true, new Path("data/" + PATH + "/" + MTMODEL + "/title_en-"
        + LANGUAGE + "-trans10-filtered-integration.xml"),
        new Path(index + "/title_en-" + LANGUAGE + "-trans10-filtered.xml"));

    String[] args = new String[] { "hadoop jar", IntegrationUtils.getJar("dist", "ivory"),
        ivory.app.PreprocessTrecForeign.class.getCanonicalName(), libjars,
        "-input=" + collectionPath.toString(), "-index=" + index,
        "-lang=" + LANGUAGE , "-tokenizerclass=" + StanfordChineseTokenizer.class.getCanonicalName(),
        "-tokenizermodel=" + index + "/" + LANGUAGE + "-token.bin", "-name="
        + VerifyNtcirChinesePositionalIndexIP.class.getCanonicalName()};

    IntegrationUtils.exec(Joiner.on(" ").join(args));

    args = new String[] { "hadoop jar", IntegrationUtils.getJar("dist", "ivory"),
        ivory.app.BuildIndex.class.getCanonicalName(), libjars,
        "-index=" + index, "-indexPartitions=10", "-positionalIndexIP" };

    IntegrationUtils.exec(Joiner.on(" ").join(args));

    QueryEngine qr = new QueryEngine();

    for (int heuristic=0; heuristic<=2; heuristic++) {
      conf = RunQueryEngine.parseArgs(new String[] {
          "-index=" + index,
          "-queries_path=" + index + "/title_en-" + LANGUAGE + "-trans10-filtered.xml",
          "-run=en-" + LANGUAGE + ".interp",
          "-query_type=mtN",
          "-doc_lang=" + LANGUAGE + "",
          "-query_lang=en",
          "-doc_tokenizer=" + index + "/" + LANGUAGE + "-token.bin",
          "-query_tokenizer=" + index + "/en-token.bin",
          "-query_vocab=" + index + "/vocab.en-" + LANGUAGE + ".en",
          "-doc_vocab=" + index + "/vocab.en-" + LANGUAGE + "." + LANGUAGE + "",
          "-f2eProbs=" + index + "/ttable.en-" + LANGUAGE + "",
          "-LexProbThreshold=0.005",
          "-CumProbThreshold=0.95"
          "-mt_weight=0.3",
          "-grammar_weight=0.4",
          "-bitext_weight=0.3",
          "-token_weight=1",
          "-phrase_weight=0",
          "-kBest=10",
          "-doc_stemmed_stopwordlist=" + index + "/" + LANGUAGE + ".stop.stemmed",
          "-query_stemmed_stopwordlist=" + index + "/en.stop.stemmed",
          "--one2many=" + heuristic, "--is_stemming", "--is_doc_stemmed"
      }, fs, conf);

      long start = System.currentTimeMillis();
      qr.init(conf, fs);
      qr.runQueries(conf);
      long end = System.currentTimeMillis();

      System.err.println("Total query time for heuristic " + heuristic + ":" + (end - start) + "ms");
    }
    ivory.regression.sigir2013.cdec.EnZh_NTCIR8.initialize();
    ivory.regression.sigir2013.cdec.EnZh_NTCIR8.verifyAllResults(qr.getModels(), qr.getAllResults(), qr.getDocnoMapping(),
        new Qrels("data/" + PATH + "/qrels." + PATH+ ".txt"));

    System.err.println("Done!");
  }

  public static junit.framework.Test suite() {
    return new JUnit4TestAdapter(VerifyNtcirChinesePositionalIndexIP.class);
  }
}
TOP

Related Classes of ivory.integration.clir.VerifyNtcirChinesePositionalIndexIP

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.