Package cc.mallet.types

Examples of cc.mallet.types.Token


    TokenSequence ts = (TokenSequence) carrier.getData();
    // xxx This doesn't seem so efficient.  Perhaps have TokenSequence
    // use a LinkedList, and remove Tokens from it? -?
    // But a LinkedList implementation of TokenSequence would be quite inefficient -AKM
    TokenSequence ret = new TokenSequence ();
    Token prevToken = null;
    for (int i = 0; i < ts.size(); i++) {
      Token t = ts.get(i);
      if (! stoplist.contains (caseSensitive ? t.getText() : t.getText().toLowerCase())) {
        // xxx Should we instead make and add a copy of the Token?
        ret.add (t);
        prevToken = t;
      } else if (markDeletions && prevToken != null)
        prevToken.setProperty (FeatureSequenceWithBigrams.deletionMark, t.getText());
    }
    carrier.setData(ret);
    return carrier;
  }
View Full Code Here


      sb.append (buf);
    } while (count == BUFSIZE);
    lexer.setCharSequence ((CharSequence)sb);
    TokenSequence ts = new TokenSequence ();
    while (lexer.hasNext())
      ts.add (new Token ((String) lexer.next()));
    return ts;
  }
View Full Code Here

    super (name);
  }
 
  public void testOne ()
  {
    Token t = new Token ("foo");

    t.setProperty ("color", "red");
    t.setProperty ("font", "TimesRoman");

    t.setFeatureValue ("length", 3);
    t.setFeatureValue ("containsVowel", 1);
    t.setFeatureValue ("in /usr/dict/words", 0);

    Alphabet dict = new Alphabet();
    FeatureVector fv = t.toFeatureVector (dict, false);
    assertTrue (fv.numLocations() == 2);
    assertTrue (fv.value (dict.lookupIndex("length")) == 3);
  }
View Full Code Here

      TokenSequence newTs = new TokenSequence();
      FeatureSequence labelSeq = new FeatureSequence(getTargetAlphabet());
      boolean lastWasSpace = true;
      StringBuffer sb = new StringBuffer();
      for (int i = 0; i < ts.size(); i++) {
        Token t = ts.get(i);
        if (t.getText().equals(" "))
          lastWasSpace = true;
        else {
          sb.append(t.getText());
          newTs.add(t);
          labelSeq.add(lastWasSpace ? "start" : "notstart");
          lastWasSpace = false;
        }
      }
View Full Code Here

 
  public Instance pipe (Instance carrier)
  {
    TokenSequence ts = (TokenSequence) carrier.getData();
    for (int i = 0; i < ts.size(); i++) {
      Token t = ts.get(i);
      t.setText(t.getText().toLowerCase());
    }
    return carrier;
  }
View Full Code Here

   
    TokenSequence originalSequence = (TokenSequence) carrier.getData();
    TokenSequence newSequence = new TokenSequence();
   
    for (int i = 0; i < originalSequence.size(); i++) {
      Token t = originalSequence.get(i);
     
      boolean passed = true;
      String text = t.getText();
      for (Pattern pattern : stopPatterns) {
        Matcher matcher = pattern.matcher(text);
        if (matcher.matches()) {
          passed = false;
          break;
View Full Code Here

        while (lexer.hasNext()) {
          lexer.next ();
          int tokStart = textStart + lexer.getStartOffset ();
          int tokEnd = textStart + lexer.getEndOffset ();
          dataTokens.add (new StringSpan (string, tokStart, tokEnd));
          targetTokens.add (new Token (tag));
        }
      }
      textStart = nextStart;
      tag = nextTag;
    }
View Full Code Here

            String newTerm = null;
            TokenSequence tmpTS = new TokenSequence();
            TokenSequence ts = (TokenSequence) carrier.getData();

            for (int i = 0; i < ts.size(); i++) {
                Token t = ts.get(i);
                for(int j = 0; j < gramSizes.length; j++) {
                    int len = gramSizes[j];
                    if (len <= 0 || len > (i+1)) continue;
                    if (len == 1) { tmpTS.add(t); continue; }
                    newTerm = new String(t.getText());
                    for(int k = 1; k < len; k++)
                        newTerm = ts.get(i-k).getText() + delim + newTerm;
                    tmpTS.add(newTerm);
                }
            }
View Full Code Here

TOP

Related Classes of cc.mallet.types.Token

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.