Package edu.umd.cloud9.io.pair

Examples of edu.umd.cloud9.io.pair.PairOfStrings$Comparator


    List<PairOfWritables<PairOfStrings, FloatWritable>> list1 = Lists.newArrayList();
    List<PairOfWritables<PairOfStrings, FloatWritable>> list2 = Lists.newArrayList();

    for (PairOfWritables<PairOfStrings, FloatWritable> p : pairs) {
      PairOfStrings bigram = p.getLeftElement();

      if (bigram.getLeftElement().equals("light")) {
        list1.add(p);
      }
      if (bigram.getLeftElement().equals("contain")) {
        list2.add(p);
      }
    }

    // Sort list1 in place. Primary key: the right element (the FloatWritable), taken in the
    // reverse of its compareTo order -- presumably highest value first; confirm against
    // FloatWritable.compareTo. Secondary key: the left element (the PairOfStrings) in its
    // own compareTo order, used only when the right elements compare equal.
    Collections.sort(list1, new Comparator<PairOfWritables<PairOfStrings, FloatWritable>>() {
      public int compare(PairOfWritables<PairOfStrings, FloatWritable> e1,
          PairOfWritables<PairOfStrings, FloatWritable> e2) {
        // Tie on the right element: fall back to comparing the left elements (e1 vs. e2).
        if (e1.getRightElement().compareTo(e2.getRightElement()) == 0) {
          return e1.getLeftElement().compareTo(e2.getLeftElement());
        }

        // Operands are deliberately swapped (e2 vs. e1) to invert the right element's order.
        return e2.getRightElement().compareTo(e1.getRightElement());
      }
    });

    Iterator<PairOfWritables<PairOfStrings, FloatWritable>> iter1 =
        Iterators.limit(list1.iterator(), 10);
    while (iter1.hasNext()) {
      PairOfWritables<PairOfStrings, FloatWritable> p = iter1.next();
      PairOfStrings bigram = p.getLeftElement();
      System.out.println(bigram + "\t" + p.getRightElement());
    }

    // Sort list2 in place with the same ordering applied to list1 above: right element
    // (the FloatWritable) in the reverse of its compareTo order -- presumably highest value
    // first; confirm against FloatWritable.compareTo -- then the left element (the
    // PairOfStrings) in its own compareTo order when the right elements compare equal.
    Collections.sort(list2, new Comparator<PairOfWritables<PairOfStrings, FloatWritable>>() {
      public int compare(PairOfWritables<PairOfStrings, FloatWritable> e1,
          PairOfWritables<PairOfStrings, FloatWritable> e2) {
        // Tie on the right element: fall back to comparing the left elements (e1 vs. e2).
        if (e1.getRightElement().compareTo(e2.getRightElement()) == 0) {
          return e1.getLeftElement().compareTo(e2.getLeftElement());
        }

        // Operands are deliberately swapped (e2 vs. e1) to invert the right element's order.
        return e2.getRightElement().compareTo(e1.getRightElement());
      }
    });

    Iterator<PairOfWritables<PairOfStrings, FloatWritable>> iter2 =
        Iterators.limit(list2.iterator(), 10);
    while (iter2.hasNext()) {
      PairOfWritables<PairOfStrings, FloatWritable> p = iter2.next();
      PairOfStrings bigram = p.getLeftElement();
      System.out.println(bigram + "\t" + p.getRightElement());
    }
  }
View Full Code Here


    IntegrationUtils.exec(Joiner.on(" ").join(args));

    SequenceFile.Reader reader = new SequenceFile.Reader(conf,
        SequenceFile.Reader.file(new Path(tmpPrefix + "-base/part-r-00000")));

    PairOfStrings pair = new PairOfStrings();
    FloatWritable f = new FloatWritable();

    reader.next(pair, f);
    assertEquals("&c", pair.getLeftElement());
    assertEquals("*", pair.getRightElement());
    assertEquals(17f, f.get(), 10e-6);

    for (int i = 0; i < 100; i++) {
      reader.next(pair, f);
    }

    assertEquals("'dear", pair.getLeftElement());
    assertEquals("*", pair.getRightElement());
    assertEquals(2f, f.get(), 10e-6);

    reader.next(pair, f);
    assertEquals("'dear", pair.getLeftElement());
    assertEquals("lord", pair.getRightElement());
    assertEquals(1f, f.get(), 10e-6);

    reader.close();
  }
View Full Code Here

      int e = entry.getRightElement();
      String eTerm = eVocab_f2e.get(e);

      //      LOG.info("Pr("+eTerm+"|"+token+")="+probEF);

      if (probEF > 0 && e > 0 && !docLangTokenizer.isStopWord(eTerm) && (pairsInSCFG == null || pairsInSCFG.contains(new PairOfStrings(token,eTerm)))) {     
        // assuming our bilingual dictionary is learned from normally segmented text, but we want to use bigram tokenizer for CLIR purposes
        // then we need to convert the translations of each source token into a sequence of bigrams
        // we can distribute the translation probability equally to the each bigram
        if (bigramSegment) {
          String[] eTokens = docLangTokenizer.processContent(eTerm);
View Full Code Here

        String[] parts = rule.split("\\|\\|\\|");
        String[] lhs = parts[1].trim().split(" ");
        String[] rhs = parts[2].trim().split(" ");;
        for (String l : lhs) {
          for (String r : rhs) {
            pairsInSCFG.add(new PairOfStrings(l, r));
          }
        }
      }
    } catch (UnsupportedEncodingException e) {
      e.printStackTrace();
View Full Code Here

        String[] parts = rule.split("\\|\\|\\|");
        String[] lhs = parts[0].trim().split(" ");
        String[] rhs = parts[1].trim().split(" ");;
        for (String l : lhs) {
          for (String r : rhs) {
            pairsInSCFG.add(new PairOfStrings(l, r));
          }
        }
      }
    } catch (UnsupportedEncodingException e) {
      e.printStackTrace();
View Full Code Here

      int e = entry.getRightElement();
      String eTerm = eVocab_f2e.get(e);

      //      LOG.info("Pr("+eTerm+"|"+token+")="+probEF);

      if (probEF > 0 && e > 0 && !docLangTokenizer.isStopWord(eTerm) && (pairsInSCFG == null || pairsInSCFG.contains(new PairOfStrings(token,eTerm)))) {     
        // assuming our bilingual dictionary is learned from normally segmented text, but we want to use bigram tokenizer for CLIR purposes
        // then we need to convert the translations of each source token into a sequence of bigrams
        // we can distribute the translation probability equally to the each bigram
        if (bigramSegment) {
          String[] eTokens = docLangTokenizer.processContent(eTerm);
View Full Code Here

      int e = entry.getRightElement();
      String eTerm = eVocab_f2e.get(e);

      //      LOG.info("Pr("+eTerm+"|"+token+")="+probEF);

      if (probEF > 0 && e > 0 && !docLangTokenizer.isStemmedStopWord(eTerm) && (pairsInSCFG == null || pairsInSCFG.contains(new PairOfStrings(token,eTerm)))) {     
        // assuming our bilingual dictionary is learned from normally segmented text, but we want to use bigram tokenizer for CLIR purposes
        // then we need to convert the translations of each source token into a sequence of bigrams
        // we can distribute the translation probability equally to the each bigram
        if (bigramSegment) {
          String[] eTokens = docLangTokenizer.processContent(eTerm);
View Full Code Here

        String[] parts = rule.split("\\|\\|\\|");
        String[] lhs = parts[1].trim().split(" ");
        String[] rhs = parts[2].trim().split(" ");;
        for (String l : lhs) {
          for (String r : rhs) {
            pairsInSCFG.add(new PairOfStrings(l, r));
            //            LOG.info("added "+l+"|||"+r);
          }
        }
      }
    } catch (UnsupportedEncodingException e) {
View Full Code Here

      int e = entry.getRightElement();
      String eTerm = eVocab_f2e.get(e);

      //      LOG.info("Pr("+eTerm+"|"+token+")="+probEF);

      if (probEF > 0 && e > 0 && !docLangTokenizer.isStemmedStopWord(eTerm) && (pairsInSCFG == null || pairsInSCFG.contains(new PairOfStrings(token,eTerm)))) {     
        // assuming our bilingual dictionary is learned from normally segmented text, but we want to use bigram tokenizer for CLIR purposes
        // then we need to convert the translations of each source token into a sequence of bigrams
        // we can distribute the translation probability equally to the each bigram
        if (bigramSegment) {
          String[] eTokens = docLangTokenizer.processContent(eTerm);
View Full Code Here

        String[] parts = rule.split("\\|\\|\\|");
        String[] lhs = parts[1].trim().split(" ");
        String[] rhs = parts[2].trim().split(" ");;
        for (String l : lhs) {
          for (String r : rhs) {
            pairsInSCFG.add(new PairOfStrings(l, r));
          }
        }
      }
    } catch (UnsupportedEncodingException e) {
      e.printStackTrace();
View Full Code Here

TOP

Related Classes of edu.umd.cloud9.io.pair.PairOfStrings$Comparator

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.