Package edu.umd.cloud9.io.map

Examples of edu.umd.cloud9.io.map.HMapSIW


    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapSFW value = new HMapSFW();

    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(opennlpIndex + "/test_wt-term-doc-vectors/part-00000")));

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    for (MapKF.Entry<String> entry : value.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyTermDocVector(opennlpTermDocVector1, value);

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    for (MapKF.Entry<String> entry : value.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyTermDocVector(opennlpTermDocVector2, value);
    reader.close();
  }
View Full Code Here


    Map<String,HMapSFW> scfgDist = new HashMap<String,HMapSFW>();

    // phrase2count table is a set of (source_phrase --> X) maps, where X is a set of (phrase_trans --> count) maps
    HMapSFW phraseDist = new HMapSFW();

    HMapSIW srcTokenCnt = new HMapSIW();

    Set<String> bagOfTargetTokens = new HashSet<String>();

    try {
      FSDataInputStream fis = fs.open(new Path(grammarFile));
View Full Code Here

  private static class MyReducer extends Reducer<Text, HMapSIW, Text, HMapSIW> {
    @Override
    public void reduce(Text key, Iterable<HMapSIW> values, Context context)
        throws IOException, InterruptedException {
      Iterator<HMapSIW> iter = values.iterator();
      HMapSIW map = new HMapSIW();

      while (iter.hasNext()) {
        map.plus(iter.next());
      }

      context.write(key, map);
    }
View Full Code Here

      int index = temp.indexOf(Settings.TAB);
      if (index < 0) {
        throw new IndexOutOfBoundsException("Missing title information: " + value.toString());
      }
      docTitle.set(temp.substring(0, index).trim());
      docContent = new HMapSIW();

      if (analyzer == null) {
        stk = new StringTokenizer(temp.substring(index + 1));
        while (stk.hasMoreElements()) {
          token = stk.nextToken();
View Full Code Here

      }
      eSent = sentences[1];
      fSent = sentences[0];
      eLen = eTok.getNumberTokens(eSent);
      fLen = fTok.getNumberTokens(fSent);
      HMapSIW eSrcTfs = new HMapSIW();
      eVector = helper.createEDocVector(eSent, eSrcTfs);
      HMapSIW fSrcTfs = new HMapSIW();
      fVector = helper.createFDocVector(fSent, fSrcTfs);

      if (eVector == null || fVector == null) {
        reporter.incrCounter(Sentences.ignored, 1)
        return;
View Full Code Here

    dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), fs);
    dfTable = new DfTableArray(new Path(env.getDfByTermData()), fs);
  }

  public HMapSFW createFDocVector(String sentence) {
    return createFDocVector(sentence, new HMapSIW());
  }
View Full Code Here

      return weightedVector;
    }
  }

  public HMapSFW createEDocVector(String sentence) {
    return createEDocVector(sentence, new HMapSIW());
  }
View Full Code Here

      List<HMapSIW> sentTfs, List<String> sents, HMapSIW dfTable) throws IOException {
    String line = null;
    boolean isNewDoc = true;
    int cnt = 0;
    float sumLengths = 0;
    HMapSIW sent = new HMapSIW();

    while ((line = reader.readLine()) != null) {
      line = line.trim();

      if (isNewDoc) {
        title2SentCnt.put(line, cnt);
        isNewDoc = false;
      } else if (line.equals("")){
        isNewDoc = true;      
      }else {
        String[] tokens = tokenizer.processContent(line);
        sentLengths.add(tokens.length);
        sumLengths += tokens.length;

        for (String token : tokens) {
          if (!sent.containsKey(token)) {
            dfTable.increment(token);
          }
          sent.increment(token);
        }
        sentTfs.add(sent);
        sents.add(line);
        cnt++;
        sent.clear();
      }
    }
    reader.close();

    return (sumLengths / cnt);   
View Full Code Here

    float sumFLengs = 0, sumELengs = 0;

    try {
      BufferedReader dis1 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(eReadFile)), "UTF-8"));
      BufferedReader dis2 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(fReadFile)), "UTF-8"));
      HMapSIW fDoc = new HMapSIW();
      HMapSIW eDoc = new HMapSIW();
      String eLine = null, fLine = null;
      int cntEDocs = 0, cntFDocs = 0, lastDocLenE = 0, lastDocLenF = 0, numSents = 0;

      while ((eLine = dis1.readLine()) != null) {
        fLine = dis2.readLine().trim();
        eLine = eLine.trim();

        String[] tokens = fTokenizer.processContent(fLine);     
        lastDocLenF += tokens.length;

        for (String token : tokens) {
          if (!fDoc.containsKey(token)) { // if this is first time we saw token in this sentence
            dfD.increment(token);
          }
          fDoc.increment(token);
        }

        tokens = eTokenizer.processContent(eLine);
        lastDocLenE += tokens.length;

        for (String token : tokens) {
          if (!eDoc.containsKey(token)) {
            dfE.increment(token);
          }
          eDoc.increment(token);
        }
       
        numSents++;
       
        if (numSents == sentsPerDoc) {
          sumFLengs += lastDocLenF;
          sumELengs += lastDocLenE;

          enSentLengths.add(lastDocLenE);
          deSentLengths.add(lastDocLenF);

          eDocTfs.add(eDoc);
          fDocTfs.add(fDoc);
          cntEDocs++;
          cntFDocs++;
         
          // reset variables
          fDoc = new HMapSIW();
          eDoc = new HMapSIW();
          numSents = 0;
          lastDocLenE = 0;
          lastDocLenF = 0;
        }
        eSents.add(eLine);
View Full Code Here

    String label;
    long time = System.currentTimeMillis();

    for (int i = 0; i < transVectors.size(); i++) {
      HMapSFW transVector = transVectors.get(i);
      HMapSIW fTfMap = fTfs.get(i);
      String fSent = fSents.get(i);
      for (int j = 0; j < eVectors.size(); j++) {
        HMapSFW eVector = eVectors.get(j);
        HMapSIW eTfMap = eTfs.get(j);
        String eSent = eSents.get(j);
        if (parallelPairs.get(i) == j) {
          label = "parallel";
        } else {
          label = "non_parallel";
View Full Code Here

TOP

Related Classes of edu.umd.cloud9.io.map.HMapSIW

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.