Package edu.umd.cloud9.io.map

Examples of edu.umd.cloud9.io.map.HMapSIW


    String VOCABDIR = "/fs/clip-qa/ferhan/end2end-experiments/Ivory/data/vocab";    // /Users/ferhanture/Documents/workspace/ivory-github/Ivory/data/vocab
    String TOKENDIR = "/fs/clip-qa/ferhan/end2end-experiments/Ivory/data/tokenizer";   // "/Users/ferhanture/Documents/workspace/ivory-github/Ivory/data/tokenizer
    String DATADIR = "/fs/clip-qa/ferhan/cl-pwsim/pwsim-experiments-2013";    // /Users/ferhanture/edu/research_archive/data/de-en/eu-nc-wmt08
   
    BitextClassifierUtils dt = new BitextClassifierUtils();
    numSentencesPerDocE = new HMapSIW();
    numSentencesPerDocF = new HMapSIW();
    FileSystem localFs = FileSystem.getLocal(new Configuration());
    eVocabSrc = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.en-de.en"), localFs);
    eVocabTrg = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.de-en.en"), localFs);
    fVocabSrc = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.de-en.de"), localFs);
    fVocabTrg = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.en-de.de"), localFs);
View Full Code Here


    // Reads eFile and fFile in lockstep (one line from each per iteration), both decoded as UTF-8.
    // For every sentence pair it records: per-sentence token counts (eSent/fSent), document
    // frequencies (dfE/dfD), sentence lengths, and the trimmed sentence text itself.
    try {
      fis1 = new FileInputStream(eFile);
      fis2 = new FileInputStream(fFile);
      dis1 = new BufferedReader(new InputStreamReader(fis1, "UTF-8"));
      dis2 = new BufferedReader(new InputStreamReader(fis2, "UTF-8"));
      // Per-sentence token -> count maps; replaced with fresh instances at the end of each iteration.
      HMapSIW fSent = new HMapSIW();
      HMapSIW eSent = new HMapSIW();
      String eLine = null, fLine = null;
      int cntE = 0, cntF = 0, lastSentLenE = 0, lastSentLenF = 0;

      // The loop is driven by the E-side file only.
      while ((eLine = dis1.readLine()) != null) {
        // NOTE(review): readLine() returns null at end of stream, so if fFile has fewer lines
        // than eFile this .trim() call throws NullPointerException -- confirm inputs are
        // guaranteed to be line-aligned, or guard this read.
        fLine = dis2.readLine().trim();
        eLine = eLine.trim();

        String[] tokens;
        // Without an F-side tokenizer, fall back to splitting on single spaces.
        if (fTokenizer == null) {
          tokens = fLine.split(" ");
        } else {
          tokens = fTokenizer.processContent(fLine);
        }
        lastSentLenF = tokens.length;

        for (String token : tokens) {
          // First occurrence of this token within the current sentence: count the sentence
          // once toward the token's document frequency.
          if (!fSent.containsKey(token)) { // if this is first time we saw token in this sentence
            dfD.increment(token);
          }
          fSent.increment(token);

        }

        // NOTE(review): unlike fTokenizer above, eTokenizer is used without a null check --
        // presumably it is always set; verify against the caller.
        tokens = eTokenizer.processContent(eLine);
        lastSentLenE = tokens.length;

        for (String token : tokens) {
          // Same once-per-sentence document-frequency rule as on the F side.
          if (!eSent.containsKey(token)) {
            dfE.increment(token);
          }
          eSent.increment(token);
        }

        // Running totals of sentence lengths (in tokens) for each side.
        sumFLengs += lastSentLenF;
        sumELengs += lastSentLenE;

        enSentLengths.add(lastSentLenE);
        deSentLengths.add(lastSentLenF);

        eSentTfs.add(eSent);
        fSentTfs.add(fSent);

        eSents.add(eLine);
        fSents.add(fLine);

        // Both counters advance together on every iteration.
        cntE++;
        cntF++;
        // The maps just stored in eSentTfs/fSentTfs must not be reused, so start new ones.
        fSent = new HMapSIW();
        eSent = new HMapSIW();
      }

      // dispose all the resources after using them.
      // NOTE(review): only fis1/dis1 are closed in the lines visible here; fis2/dis2 are not,
      // and these calls are skipped if the loop throws -- check whether the remainder of this
      // method (not visible) closes them, e.g. in a finally block.
      fis1.close();
      dis1.close();
View Full Code Here

    String label;
    long time = System.currentTimeMillis();

    for (int i = 0; i < transVectors.size(); i++) {
      HMapSFW transVector = transVectors.get(i);
      HMapSIW fTfMap = fTfs.get(i);
      String fSent = fSents.get(i);
      for (int j = 0; j < eVectors.size(); j++) {
        HMapSFW eVector = eVectors.get(j);
        HMapSIW eTfMap = eTfs.get(j);
        String eSent = eSents.get(j);
        if (i == j) {
          label = "parallel";
        } else {
          label = "non_parallel";
View Full Code Here

    if ( cmdline == null ) {
      printUsage();
      return;
    }
    BitextClassifierUtils dt = new BitextClassifierUtils();
    numSentencesPerDocE = new HMapSIW();
    numSentencesPerDocF = new HMapSIW();
    dt.runPrepareSentenceExtractionData(cmdline.getOptionValue(FLANG_OPTION), cmdline.getOptionValue(ELANG_OPTION),
        cmdline.getOptionValue(FBITEXT_OPTION), cmdline.getOptionValue(EBITEXT_OPTION), cmdline.getOptionValue(FSTOP_OPTION),
        cmdline.getOptionValue(ESTOP_OPTION), cmdline.getOptionValue(FSRC_OPTION),
        cmdline.getOptionValue(ETRG_OPTION), cmdline.getOptionValue(ESRC_OPTION), cmdline.getOptionValue(FTRG_OPTION),
        cmdline.getOptionValue(F2E_OPTION), cmdline.getOptionValue(E2F_OPTION), cmdline.getOptionValue(FTOK_OPTION),
View Full Code Here

    dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), fs);
    dfTable = new DfTableArray(new Path(env.getDfByTermData()), fs);
  }

  public HMapSFW createFDocVector(String sentence) {
    return createFDocVector(sentence, new HMapSIW());
  }
View Full Code Here

      return weightedVector;
    }
  }

  public HMapSFW createEDocVector(String sentence) {
    return createEDocVector(sentence, new HMapSIW());
  }
View Full Code Here

      }
      eSent = sentences[1];
      fSent = sentences[0];
      eLen = eTok.getNumberTokens(eSent);
      fLen = fTok.getNumberTokens(fSent);
      HMapSIW eSrcTfs = new HMapSIW();
      eVector = helper.createEDocVector(eSent, eSrcTfs);
      HMapSIW fSrcTfs = new HMapSIW();
      fVector = helper.createFDocVector(fSent, fSrcTfs);
  
      if (eVector == null || fVector == null) {
        reporter.incrCounter(Sentences.ignored, 1)
        return;
View Full Code Here

    Map<String,HMapSFW> scfgDist = new HashMap<String,HMapSFW>();

    // phrase2count table is a set of (source_phrase --> X) maps, where X is a set of (phrase_trans --> count) maps
    HMapSFW phraseDist = new HMapSFW();

    HMapSIW srcTokenCnt = new HMapSIW();

    Set<String> bagOfTargetTokens = new HashSet<String>();

    try {
      FSDataInputStream fis = fs.open(new Path(grammarFile));
View Full Code Here

    Map<String,HMapSFW> token2tokenDist = new HashMap<String,HMapSFW>();
   
    // target phrase --> prob
    HMapSFW phraseDist = new HMapSFW();
   
    HMapSIW srcTokenCnt = new HMapSIW();

    Set<String> bagOfTargetTokens = new HashSet<String>();

    for (int k = 0; k < n; k++) {
      transProbs[k] = transProbs[k]/sumOfProbs;
View Full Code Here

    //    globalStatsMap = new PrefixEncodedGlobalStats(new Path(indexTermsFile), localFs);
    //    globalStatsMap.loadDFStats(new Path(dfTableFile));
  }

  public HMapSFW createFDocVector(String sentence) {
    return createFDocVector(sentence, new HMapSIW());
  }
View Full Code Here

TOP

Related Classes of edu.umd.cloud9.io.map.HMapSIW

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.