Examples of HMapSIW


Examples of edu.umd.cloud9.io.map.HMapSIW

    Map<String,HMapSFW> scfgDist = new HashMap<String,HMapSFW>();

    // scfgDist maps each source phrase to an HMapSFW of (phrase_trans --> weight) entries
    HMapSFW phraseDist = new HMapSFW();

    HMapSIW srcTokenCnt = new HMapSIW();

    Set<String> bagOfTargetTokens = new HashSet<String>();

    try {
      FSDataInputStream fis = fs.open(new Path(grammarFile));
View Full Code Here
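
HMapSIW and HMapSFW live in edu.umd.cloud9.io.map and are Hadoop Writables (String-to-int and String-to-float maps, respectively), which is what lets them be shipped between mappers and reducers and read from HDFS streams like the one opened above. A minimal round-trip sketch, assuming only the Writable contract (write/readFields) that lets these maps serve as Hadoop values, plus the increment method used throughout these examples:

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;

    import edu.umd.cloud9.io.map.HMapSIW;

    public class HMapSIWRoundTrip {
      public static void main(String[] args) throws Exception {
        HMapSIW counts = new HMapSIW();
        counts.increment("source");
        counts.increment("source");
        counts.increment("phrase");

        // Serialize through the Writable interface, as Hadoop does between map and reduce.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        counts.write(new DataOutputStream(bytes));

        // Deserialize into a fresh map and confirm the counts survived the trip.
        HMapSIW copy = new HMapSIW();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy);
      }
    }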

Examples of edu.umd.cloud9.io.map.HMapSIW

  private static class MyReducer extends Reducer<Text, HMapSIW, Text, HMapSIW> {
    @Override
    public void reduce(Text key, Iterable<HMapSIW> values, Context context)
        throws IOException, InterruptedException {
      Iterator<HMapSIW> iter = values.iterator();
      HMapSIW map = new HMapSIW();

      while (iter.hasNext()) {
        map.plus(iter.next());
      }

      context.write(key, map);
    }
View Full Code Here
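
For context on what this reducer consumes: a companion mapper would typically emit one HMapSIW of term counts per record, which MyReducer then folds together with plus. The sketch below is hypothetical (the class name, the choice of key, and whitespace tokenization are assumptions, not part of the code above):

    import java.io.IOException;
    import java.util.StringTokenizer;

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    import edu.umd.cloud9.io.map.HMapSIW;

    // Hypothetical companion to MyReducer: emits (id, per-record term counts)
    // so the reducer can merge the partial counts with HMapSIW.plus().
    public class MyMapper extends Mapper<LongWritable, Text, Text, HMapSIW> {
      private final Text id = new Text();

      @Override
      public void map(LongWritable key, Text value, Context context)
          throws IOException, InterruptedException {
        HMapSIW counts = new HMapSIW();
        StringTokenizer stk = new StringTokenizer(value.toString());
        while (stk.hasMoreTokens()) {
          counts.increment(stk.nextToken());
        }
        id.set(key.toString());
        context.write(id, counts);
      }
    }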

Examples of edu.umd.cloud9.io.map.HMapSIW

      int index = temp.indexOf(Settings.TAB);
      if (index < 0) {
        throw new IndexOutOfBoundsException("Missing title information: " + value.toString());
      }
      docTitle.set(temp.substring(0, index).trim());
      docContent = new HMapSIW();

      if (analyzer == null) {
        stk = new StringTokenizer(temp.substring(index + 1));
        while (stk.hasMoreElements()) {
          token = stk.nextToken();
View Full Code Here
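
The tokenize-and-increment idiom used here (and in most of the snippets on this page) needs nothing beyond HMapSIW itself. A self-contained sketch follows; increment and containsKey appear in the snippets above, while get returning the int count is an assumption:

    import java.util.StringTokenizer;

    import edu.umd.cloud9.io.map.HMapSIW;

    public class TermCountSketch {
      public static void main(String[] args) {
        HMapSIW termCounts = new HMapSIW();

        StringTokenizer stk = new StringTokenizer("a rose is a rose is a rose");
        while (stk.hasMoreTokens()) {
          termCounts.increment(stk.nextToken());  // per-document term frequency
        }

        System.out.println(termCounts.containsKey("rose"));  // true
        System.out.println(termCounts.get("rose"));          // 3
      }
    }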

Examples of edu.umd.cloud9.io.map.HMapSIW

      }
      eSent = sentences[1];
      fSent = sentences[0];
      eLen = eTok.getNumberTokens(eSent);
      fLen = fTok.getNumberTokens(fSent);
      HMapSIW eSrcTfs = new HMapSIW();
      eVector = helper.createEDocVector(eSent, eSrcTfs);
      HMapSIW fSrcTfs = new HMapSIW();
      fVector = helper.createFDocVector(fSent, fSrcTfs);

      if (eVector == null || fVector == null) {
        reporter.incrCounter(Sentences.ignored, 1);
        return;
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapSIW

    dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()),
        new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), fs);
    dfTable = new DfTableArray(new Path(env.getDfByTermData()), fs);
  }

  public HMapSFW createFDocVector(String sentence) {
    return createFDocVector(sentence, new HMapSIW());
  }
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapSIW

      return weightedVector;
    }
  }

  public HMapSFW createEDocVector(String sentence) {
    return createEDocVector(sentence, new HMapSIW());
  }
View Full Code Here
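
Taken together with the createFDocVector overload a couple of examples up, this shows a small delegation pattern: the one-argument form simply passes a throwaway HMapSIW when the caller has no use for the raw term frequencies. A standalone sketch of the same pattern (the class is hypothetical and the weighting is a placeholder; the real helper presumably weights terms using the dictionary and df table it loads above):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.StringTokenizer;

    import edu.umd.cloud9.io.map.HMapSFW;
    import edu.umd.cloud9.io.map.HMapSIW;

    // Hypothetical stand-in for the helper: builds a weighted doc vector and, as a
    // side effect, fills the caller-supplied HMapSIW with raw term counts.
    public class DocVectorSketch {
      public HMapSFW createDocVector(String sentence) {
        return createDocVector(sentence, new HMapSIW());  // caller does not need the tfs
      }

      public HMapSFW createDocVector(String sentence, HMapSIW termCounts) {
        List<String> tokens = new ArrayList<String>();
        StringTokenizer stk = new StringTokenizer(sentence);
        while (stk.hasMoreTokens()) {
          String token = stk.nextToken();
          tokens.add(token);
          termCounts.increment(token);
        }
        HMapSFW vector = new HMapSFW();
        for (String token : tokens) {
          vector.put(token, termCounts.get(token));  // placeholder weight: raw tf, not tf-idf
        }
        return vector;
      }
    }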

Examples of edu.umd.cloud9.io.map.HMapSIW

      List<HMapSIW> sentTfs, List<String> sents, HMapSIW dfTable) throws IOException {
    String line = null;
    boolean isNewDoc = true;
    int cnt = 0;
    float sumLengths = 0;
    HMapSIW sent = new HMapSIW();

    while ((line = reader.readLine()) != null) {
      line = line.trim();

      if (isNewDoc) {
        title2SentCnt.put(line, cnt);
        isNewDoc = false;
      } else if (line.equals("")){
        isNewDoc = true;      
      }else {
        String[] tokens = tokenizer.processContent(line);
        sentLengths.add(tokens.length);
        sumLengths += tokens.length;

        for (String token : tokens) {
          if (!sent.containsKey(token)) {
            dfTable.increment(token);
          }
          sent.increment(token);
        }
        sentTfs.add(sent);
        sents.add(line);
        cnt++;
        sent = new HMapSIW(); // start a fresh map; clearing the shared instance would also empty the map just added to sentTfs
      }
    }
    reader.close();

    return (sumLengths / cnt);   
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapSIW

    float sumFLengs = 0, sumELengs = 0;

    try {
      BufferedReader dis1 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(eReadFile)), "UTF-8"));
      BufferedReader dis2 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(fReadFile)), "UTF-8"));
      HMapSIW fDoc = new HMapSIW();
      HMapSIW eDoc = new HMapSIW();
      String eLine = null, fLine = null;
      int cntEDocs = 0, cntFDocs = 0, lastDocLenE = 0, lastDocLenF = 0, numSents = 0;

      while ((eLine = dis1.readLine()) != null) {
        fLine = dis2.readLine().trim();
        eLine = eLine.trim();

        String[] tokens = fTokenizer.processContent(fLine);     
        lastDocLenF += tokens.length;

        for (String token : tokens) {
          if (!fDoc.containsKey(token)) { // first occurrence of this token in the current document block
            dfD.increment(token);
          }
          fDoc.increment(token);
        }

        tokens = eTokenizer.processContent(eLine);
        lastDocLenE += tokens.length;

        for (String token : tokens) {
          if (!eDoc.containsKey(token)) {
            dfE.increment(token);
          }
          eDoc.increment(token);
        }
       
        numSents++;
       
        if (numSents == sentsPerDoc) {
          sumFLengs += lastDocLenF;
          sumELengs += lastDocLenE;

          enSentLengths.add(lastDocLenE);
          deSentLengths.add(lastDocLenF);

          eDocTfs.add(eDoc);
          fDocTfs.add(fDoc);
          cntEDocs++;
          cntFDocs++;
         
          // reset variables
          fDoc = new HMapSIW();
          eDoc = new HMapSIW();
          numSents = 0;
          lastDocLenE = 0;
          lastDocLenF = 0;
        }
        eSents.add(eLine);
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapSIW

    String label;
    long time = System.currentTimeMillis();

    for (int i = 0; i < transVectors.size(); i++) {
      HMapSFW transVector = transVectors.get(i);
      HMapSIW fTfMap = fTfs.get(i);
      String fSent = fSents.get(i);
      for (int j = 0; j < eVectors.size(); j++) {
        HMapSFW eVector = eVectors.get(j);
        HMapSIW eTfMap = eTfs.get(j);
        String eSent = eSents.get(j);
        if (parallelPairs.get(i) == j) {
          label = "parallel";
        } else {
          label = "non_parallel";
View Full Code Here

Examples of edu.umd.cloud9.io.map.HMapSIW

    String VOCABDIR = "/fs/clip-qa/ferhan/end2end-experiments/Ivory/data/vocab";    // /Users/ferhanture/Documents/workspace/ivory-github/Ivory/data/vocab
    String TOKENDIR = "/fs/clip-qa/ferhan/end2end-experiments/Ivory/data/tokenizer";   // /Users/ferhanture/Documents/workspace/ivory-github/Ivory/data/tokenizer
    String DATADIR = "/fs/clip-qa/ferhan/cl-pwsim/pwsim-experiments-2013";    // /Users/ferhanture/edu/research_archive/data/de-en/eu-nc-wmt08
   
    BitextClassifierUtils dt = new BitextClassifierUtils();
    numSentencesPerDocE = new HMapSIW();
    numSentencesPerDocF = new HMapSIW();
    FileSystem localFs = FileSystem.getLocal(new Configuration());
    eVocabSrc = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.en-de.en"), localFs);
    eVocabTrg = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.de-en.en"), localFs);
    fVocabSrc = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.de-en.de"), localFs);
    fVocabTrg = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.en-de.de"), localFs);
View Full Code Here