Package cc.mallet.types

Examples of cc.mallet.types.Token


        while (lexer.hasNext()) {
          lexer.next ();
          int tokStart = textStart + lexer.getStartOffset ();
          int tokEnd = textStart + lexer.getEndOffset ();
          dataTokens.add (new StringSpan (string, tokStart, tokEnd));
          targetTokens.add (new Token (tag));
        }
      }
      textStart = nextStart;
      tag = nextTag;
    }
View Full Code Here


      String header = ts.get(1).getText();
      if (header.equals("PRESS"))        // Don't bother with "PRESS DIGEST" headers
        return carrier;
      String featureName = "HEADER="+header;
      for (int i = 0; i < ts.size(); i++) {
        Token t = ts.get(i);
        // Only apply this feature to capitalized words, because if we apply it to everything
        // we easily get an immense number of possible feature conjunctions, (e.g. every word
        // with each of these HEADER= features.
        if (t.getText().matches("^[A-Z].*"))
          t.setFeatureValue (featureName, 1.0);
      }
    }
    return carrier;
  }
View Full Code Here

      StringBuffer buf = new StringBuffer ();
      TokenSequence ts = (TokenSequence) data;
      StringTokenization spans = new StringTokenization (buf)// I can use a StringBuffer as the doc! Awesome!

      for (int i = 0; i < ts.size(); i++) {
        Token token = ts.get(i);

        int start = buf.length ();
        buf.append (token.getText());
        int end = buf.length();

        StringSpan span = new StringSpan (buf, start, end);
        span.setFeatures (token.getFeatures ());
        span.setProperties (token.getProperties ());

        spans.add (span);
        buf.append (" ");
      }
View Full Code Here

    // Expand the carrier's token sequence into n-grams: for every position i
    // and every size listed in gramSizes, emit the gram that ends at token i.
    String newTerm = null;
    TokenSequence tmpTS = new TokenSequence();
    TokenSequence ts = (TokenSequence) carrier.getData();

    for (int i = 0; i < ts.size(); i++) {
      Token t = ts.get(i);
      for(int j = 0; j < gramSizes.length; j++) {
        int len = gramSizes[j];
        // Skip non-positive sizes and grams that would start before token 0.
        if (len <= 0 || len > (i+1)) continue;
        // A unigram is the token itself; the existing Token object is reused.
        if (len == 1) { tmpTS.add(t); continue; }
        // Build the gram right-to-left: start from the current token's text and
        // prepend each of the len-1 preceding tokens, joined with "_".
        newTerm = new String(t.getText());
        for(int k = 1; k < len; k++)
          newTerm = ts.get(i-k).getText() + "_" + newTerm;
        tmpTS.add(newTerm);
      }
    }
View Full Code Here

        textEnd = m.start();
      }
      if (textEnd - textStart > 0) {
        lexer.setCharSequence (string.subSequence (textStart, textEnd));
        while (lexer.hasNext()) {
          dataTokens.add (new Token ((String) lexer.next()));
          targetTokens.add (new Token (tag));
        }
      }
      textStart = nextStart;
      tag = nextTag;
    }
View Full Code Here

  {
    TokenSequence ts = (TokenSequence) carrier.getData();
    int tsSize = ts.size();
    PropertyList[] newFeatures = new PropertyList[tsSize];
    for (int i = 0; i < tsSize; i++) {
      Token t = ts.get (i);
      PropertyList pl = t.getFeatures();
      newFeatures[i] = pl;
      for (int position = i + leftBoundary; position < i + rightBoundary; position++) {
        if (position == i && !includeCurrentToken)
          continue;
        PropertyList pl2;
View Full Code Here

  public Instance pipe (Instance carrier)
  {
    TokenSequence ts = (TokenSequence) carrier.getData();
    int tsSize = ts.size();
    for (int i = tsSize-1; i >= 0; i--) {
      Token t = ts.get (i);
      String text = t.getText();
      if (featureRegex != null && !featureRegex.matcher(text).matches())
        continue;
      for (int j = 0; j < i; j++) {
        if (ts.get(j).getText().equals(text)) {
          PropertyList.Iterator iter = ts.get(j).getFeatures().iterator();
          while (iter.hasNext()) {
            iter.next();
            String key = iter.getKey();
            if (filterRegex == null || (filterRegex.matcher(key).matches() ^ !includeFiltered))
              t.setFeatureValue (namePrefix+key, iter.getNumericValue());
          }
          break;
        }
        if (firstMentionName != null)
          t.setFeatureValue (firstMentionName, 1.0);
      }
    }
    return carrier;
  }
View Full Code Here

 
  public Instance pipe (Instance carrier) {
    TokenSequence ts = (TokenSequence) carrier.getData ();
    for (int i=0; i < ts.size(); i++) {
      Token t = ts.get (i);
      String[] values = t.getText().split("\\s+");
      for (int j=0; j < values.length; j++) {
        if (specifyFeatureNames) {
          String[] nameAndValue = values[j].split(nameValueSeparator);           
          if (nameAndValue.length != 2) { // no feature name. use token as feature.
            t.setFeatureValue ("Token="+values[j], 1.0);
          }
          else {
            t.setFeatureValue (nameAndValue[0], Double.parseDouble (nameAndValue[1]));           
          }
        }
        else if (realValued) {
          t.setFeatureValue ("Feature#" + j, Double.parseDouble (values[j]));
        }
        else
          t.setFeatureValue (values[j], 1.0);         
      }
    }
    carrier.setData (ts);
    return carrier;
  }
View Full Code Here

  /**
   * Splits each token's text into a data part and a target part using
   * {@code regex}: capture group {@code targetGroup} is appended to a new
   * target token sequence and the token's own text is replaced by capture
   * group {@code dataGroup}.
   *
   * <p>A token whose whole text does not match {@code regex} is logged as a
   * warning and left unchanged; nothing is added to the target sequence for
   * it, so the target sequence can end up shorter than the data sequence.
   *
   * @param carrier instance whose data is a {@code TokenSequence}
   * @return the same carrier, with its target set to the new sequence
   */
  public Instance pipe (Instance carrier)
  {
    TokenSequence ts = (TokenSequence) carrier.getData();
    TokenSequence targetTokenSeq = new TokenSequence (ts.size());
    for (int i = 0; i < ts.size(); i++) {
      Token t = ts.get(i);
      Matcher matcher = regex.matcher (t.getText());
      if (matcher.matches()) {
        // Read the target group before setText overwrites the token's text.
        targetTokenSeq.add (matcher.group(targetGroup));
        t.setText (matcher.group (dataGroup));
      } else {
        // NOTE(review): the token stays in the data sequence but gets no
        // target entry -- confirm downstream code tolerates the misalignment.
        logger.warning ("Skipping token: No match of "+regex.pattern()
                        +" at token #"+i+" with text "+t.getText());
      }
    }
    carrier.setTarget(targetTokenSeq);
    carrier.setData(ts);
    return carrier;
View Full Code Here

    TokenSequence ts = (TokenSequence) carrier.getData();
    // xxx This doesn't seem so efficient.  Perhaps have TokenSequence
    // use a LinkedList, and remove Tokens from it? -?
    // But a LinkedList implementation of TokenSequence would be quite inefficient -AKM
    TokenSequence ret = new TokenSequence ();
    Token prevToken = null;
    for (int i = 0; i < ts.size(); i++) {
      Token t = ts.get(i);
      String s = t.getText();
      if (CharSequenceLexer.LEX_ALPHA.matcher(s).matches()) {
        ret.add (t);
        prevToken = t;
      else if (markDeletions && prevToken != null)
        prevToken.setProperty (FeatureSequenceWithBigrams.deletionMark, t.getText());
    }
    carrier.setData(ret);
    return carrier;
  }
View Full Code Here

TOP

Related Classes of cc.mallet.types.Token

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.