Examples of HMapSIW


Examples of edu.umd.cloud9.io.map.HMapSIW

    Map<String,HMapSFW> scfgDist = new HashMap<String,HMapSFW>();

    // scfgDist maps each source phrase to an HMapSFW of (phrase_trans --> weight) entries
    HMapSFW phraseDist = new HMapSFW();

    HMapSIW srcTokenCnt = new HMapSIW();

    Set<String> bagOfTargetTokens = new HashSet<String>();

    try {
      FSDataInputStream fis = fs.open(new Path(grammarFile));
View Full Code Here
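
HMapSIW and HMapSFW live in edu.umd.cloud9.io.map and are Hadoop Writables (String-to-int and String-to-float maps, respectively), which is what lets them be shipped between mappers and reducers and read from HDFS streams like the one opened above. A minimal round-trip sketch, assuming only the Writable contract (write/readFields) that lets these maps serve as Hadoop values, plus the increment method used throughout these examples:

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;

    import edu.umd.cloud9.io.map.HMapSIW;

    public class HMapSIWRoundTrip {
      public static void main(String[] args) throws Exception {
        HMapSIW counts = new HMapSIW();
        counts.increment("source");
        counts.increment("source");
        counts.increment("phrase");

        // Serialize through the Writable interface, as Hadoop does between map and reduce.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        counts.write(new DataOutputStream(bytes));

        // Deserialize into a fresh map and confirm the counts survived the trip.
        HMapSIW copy = new HMapSIW();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy);
      }
    }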

Examples of edu.umd.cloud9.io.map.HMapSIW

  private static class MyReducer extends Reducer<Text, HMapSIW, Text, HMapSIW> {
    @Override
    public void reduce(Text key, Iterable<HMapSIW> values, Context context)
        throws IOException, InterruptedException {
      Iterator<HMapSIW> iter = values.iterator();
      HMapSIW map = new HMapSIW();

      while (iter.hasNext()) {
        map.plus(iter.next());
      }

      context.write(key, map);
    }
View Full Code Here
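
For context on what this reducer consumes: a companion mapper would typically emit one HMapSIW of term counts per record, which MyReducer then folds together with plus. The sketch below is hypothetical (the class name, the choice of key, and whitespace tokenization are assumptions, not part of the code above):

    import java.io.IOException;
    import java.util.StringTokenizer;

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    import edu.umd.cloud9.io.map.HMapSIW;

    // Hypothetical companion to MyReducer: emits (id, per-record term counts)
    // so the reducer can merge the partial counts with HMapSIW.plus().
    public class MyMapper extends Mapper<LongWritable, Text, Text, HMapSIW> {
      private final Text id = new Text();

      @Override
      public void map(LongWritable key, Text value, Context context)
          throws IOException, InterruptedException {
        HMapSIW counts = new HMapSIW();
        StringTokenizer stk = new StringTokenizer(value.toString());
        while (stk.hasMoreTokens()) {
          counts.increment(stk.nextToken());
        }
        id.set(key.toString());
        context.write(id, counts);
      }
    }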

Examples of edu.umd.cloud9.io.map.HMapSIW

      int index = temp.indexOf(Settings.TAB);
      if (index < 0) {
        throw new IndexOutOfBoundsException("Missing title information: " + value.toString());
      }
      docTitle.set(temp.substring(0, index).trim());
      docContent = new HMapSIW();

      if (analyzer == null) {
        stk = new StringTokenizer(temp.substring(index + 1));
        while (stk.hasMoreElements()) {
          token = stk.nextToken();
View Full Code Here
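
The tokenize-and-increment idiom used here (and in most of the snippets on this page) needs nothing beyond HMapSIW itself. A self-contained sketch follows; increment and containsKey appear in the snippets above, while get returning the int count is an assumption:

    import java.util.StringTokenizer;

    import edu.umd.cloud9.io.map.HMapSIW;

    public class TermCountSketch {
      public static void main(String[] args) {
        HMapSIW termCounts = new HMapSIW();

        StringTokenizer stk = new StringTokenizer("a rose is a rose is a rose");
        while (stk.hasMoreTokens()) {
          termCounts.increment(stk.nextToken());  // per-document term frequency
        }

        System.out.println(termCounts.containsKey("rose"));  // true
        System.out.println(termCounts.get("rose"));          // 3
      }
    }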

Examples of edu.umd.cloud9.io.map.HMapSIW

      }
      eSent = sentences[1];
      fSent = sentences[0];
      eLen = eTok.getNumberTokens(eSent);
      fLen = fTok.getNumberTokens(fSent);
      HMapSIW eSrcTfs = new HMapSIW();
      eVector = helper.createEDocVector(eSent, eSrcTfs);
      HMapSIW fSrcTfs = new HMapSIW();
      fVector = helper.createFDocVector(fSent, fSrcTfs);

      if (eVector == null || fVector == null) {
        reporter.incrCounter(Sentences.ignored, 1);
        return;
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapSIW

    dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()),
        new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), fs);
    dfTable = new DfTableArray(new Path(env.getDfByTermData()), fs);
  }

  public HMapSFW createFDocVector(String sentence) {
    return createFDocVector(sentence, new HMapSIW());
  }
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapSIW

      return weightedVector;
    }
  }

  public HMapSFW createEDocVector(String sentence) {
    return createEDocVector(sentence, new HMapSIW());
  }
View Full Code Here
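
Taken together with the createFDocVector overload a couple of examples up, this shows a small delegation pattern: the one-argument form simply passes a throwaway HMapSIW when the caller has no use for the raw term frequencies. A standalone sketch of the same pattern (the class is hypothetical and the weighting is a placeholder; the real helper presumably weights terms using the dictionary and df table it loads above):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.StringTokenizer;

    import edu.umd.cloud9.io.map.HMapSFW;
    import edu.umd.cloud9.io.map.HMapSIW;

    // Hypothetical stand-in for the helper: builds a weighted doc vector and, as a
    // side effect, fills the caller-supplied HMapSIW with raw term counts.
    public class DocVectorSketch {
      public HMapSFW createDocVector(String sentence) {
        return createDocVector(sentence, new HMapSIW());  // caller does not need the tfs
      }

      public HMapSFW createDocVector(String sentence, HMapSIW termCounts) {
        List<String> tokens = new ArrayList<String>();
        StringTokenizer stk = new StringTokenizer(sentence);
        while (stk.hasMoreTokens()) {
          String token = stk.nextToken();
          tokens.add(token);
          termCounts.increment(token);
        }
        HMapSFW vector = new HMapSFW();
        for (String token : tokens) {
          vector.put(token, termCounts.get(token));  // placeholder weight: raw tf, not tf-idf
        }
        return vector;
      }
    }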

Examples of edu.umd.cloud9.io.map.HMapSIW

      List<HMapSIW> sentTfs, List<String> sents, HMapSIW dfTable) throws IOException {
    String line = null;
    boolean isNewDoc = true;
    int cnt = 0;
    float sumLengths = 0;
    HMapSIW sent = new HMapSIW();

    while ((line = reader.readLine()) != null) {
      line = line.trim();

      if (isNewDoc) {
        title2SentCnt.put(line, cnt);
        isNewDoc = false;
      } else if (line.equals("")){
        isNewDoc = true;      
      }else {
        String[] tokens = tokenizer.processContent(line);
        sentLengths.add(tokens.length);
        sumLengths += tokens.length;

        for (String token : tokens) {
          if (!sent.containsKey(token)) {
            dfTable.increment(token);
          }
          sent.increment(token);
        }
        sentTfs.add(sent);
        sents.add(line);
        cnt++;
        sent = new HMapSIW(); // start a fresh map; clearing the shared instance would also empty the map just added to sentTfs
      }
    }
    reader.close();

    return (sumLengths / cnt);   
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapSIW

    float sumFLengs = 0, sumELengs = 0;

    try {
      BufferedReader dis1 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(eReadFile)), "UTF-8"));
      BufferedReader dis2 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(fReadFile)), "UTF-8"));
      HMapSIW fDoc = new HMapSIW();
      HMapSIW eDoc = new HMapSIW();
      String eLine = null, fLine = null;
      int cntEDocs = 0, cntFDocs = 0, lastDocLenE = 0, lastDocLenF = 0, numSents = 0;

      while ((eLine = dis1.readLine()) != null) {
        fLine = dis2.readLine().trim();
        eLine = eLine.trim();

        String[] tokens = fTokenizer.processContent(fLine);     
        lastDocLenF += tokens.length;

        for (String token : tokens) {
          if (!fDoc.containsKey(token)) { // first occurrence of this token in the current document block
            dfD.increment(token);
          }
          fDoc.increment(token);
        }

        tokens = eTokenizer.processContent(eLine);
        lastDocLenE += tokens.length;

        for (String token : tokens) {
          if (!eDoc.containsKey(token)) {
            dfE.increment(token);
          }
          eDoc.increment(token);
        }
       
        numSents++;
       
        if (numSents == sentsPerDoc) {
          sumFLengs += lastDocLenF;
          sumELengs += lastDocLenE;

          enSentLengths.add(lastDocLenE);
          deSentLengths.add(lastDocLenF);

          eDocTfs.add(eDoc);
          fDocTfs.add(fDoc);
          cntEDocs++;
          cntFDocs++;
         
          // reset variables
          fDoc = new HMapSIW();
          eDoc = new HMapSIW();
          numSents = 0;
          lastDocLenE = 0;
          lastDocLenF = 0;
        }
        eSents.add(eLine);
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapSIW

    String label;
    long time = System.currentTimeMillis();

    for (int i = 0; i < transVectors.size(); i++) {
      HMapSFW transVector = transVectors.get(i);
      HMapSIW fTfMap = fTfs.get(i);
      String fSent = fSents.get(i);
      for (int j = 0; j < eVectors.size(); j++) {
        HMapSFW eVector = eVectors.get(j);
        HMapSIW eTfMap = eTfs.get(j);
        String eSent = eSents.get(j);
        if (parallelPairs.get(i) == j) {
          label = "parallel";
        } else {
          label = "non_parallel";
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapSIW

    String VOCABDIR = "/fs/clip-qa/ferhan/end2end-experiments/Ivory/data/vocab";    // /Users/ferhanture/Documents/workspace/ivory-github/Ivory/data/vocab
    String TOKENDIR = "/fs/clip-qa/ferhan/end2end-experiments/Ivory/data/tokenizer";   // /Users/ferhanture/Documents/workspace/ivory-github/Ivory/data/tokenizer
    String DATADIR = "/fs/clip-qa/ferhan/cl-pwsim/pwsim-experiments-2013";    // /Users/ferhanture/edu/research_archive/data/de-en/eu-nc-wmt08
   
    BitextClassifierUtils dt = new BitextClassifierUtils();
    numSentencesPerDocE = new HMapSIW();
    numSentencesPerDocF = new HMapSIW();
    FileSystem localFs = FileSystem.getLocal(new Configuration());
    eVocabSrc = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.en-de.en"), localFs);
    eVocabTrg = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.de-en.en"), localFs);
    fVocabSrc = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.de-en.de"), localFs);
    fVocabTrg = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.en-de.de"), localFs);
View Full Code Here