Package cc.mallet.types

Examples of cc.mallet.types.TokenSequence


    return (String[]) wordarray.toArray(new String[]{});
  }
 
  public Instance pipe (Instance carrier)
  {
    TokenSequence ts = (TokenSequence) carrier.getData();
    // xxx This doesn't seem so efficient.  Perhaps have TokenSequence
    // use a LinkedList, and remove Tokens from it? -?
    // But a LinkedList implementation of TokenSequence would be quite inefficient -AKM
    TokenSequence ret = new TokenSequence ();
    Token prevToken = null;
    for (int i = 0; i < ts.size(); i++) {
      Token t = ts.get(i);
      if (! stoplist.contains (caseSensitive ? t.getText() : t.getText().toLowerCase())) {
        // xxx Should we instead make and add a copy of the Token?
        ret.add (t);
        prevToken = t;
      } else if (markDeletions && prevToken != null)
        prevToken.setProperty (FeatureSequenceWithBigrams.deletionMark, t.getText());
    }
    carrier.setData(ret);
View Full Code Here


    do {
      count = br.read (buf, 0, BUFSIZE);
      sb.append (buf);
    } while (count == BUFSIZE);
    lexer.setCharSequence ((CharSequence)sb);
    TokenSequence ts = new TokenSequence ();
    while (lexer.hasNext())
      ts.add (new Token ((String) lexer.next()));
    return ts;
  }
View Full Code Here

    Transducer model = tt.getTransducer();
    int numCorrectTokens, totalTokens;
    int numTrueSegments, numPredictedSegments, numCorrectSegments;
    int numCorrectSegmentsInAlphabet, numCorrectSegmentsOOV;
    int numIncorrectSegmentsInAlphabet, numIncorrectSegmentsOOV;
    TokenSequence sourceTokenSequence = null;

    totalTokens = numCorrectTokens = 0;
    numTrueSegments = numPredictedSegments = numCorrectSegments = 0;
    numCorrectSegmentsInAlphabet = numCorrectSegmentsOOV = 0;
    numIncorrectSegmentsInAlphabet = numIncorrectSegmentsOOV = 0;
View Full Code Here

    super(new Alphabet(), null);
  }
 
  public Instance pipe (Instance carrier)
  {
    TokenSequence ts = (TokenSequence) carrier.getData();
    FeatureSequence ret =
      new FeatureSequence ((Alphabet)getDataAlphabet(), ts.size());
    for (int i = 0; i < ts.size(); i++) {
      ret.add (ts.get(i).getText());
    }
    carrier.setData(ret);
    return carrier;
  }
View Full Code Here

    for (int i = 0; i < instances.size(); i++) {
      if (viterbiOutputStream != null)
        viterbiOutputStream.println ("Viterbi path for "+description+" instance #"+i);
      Instance instance = instances.get(i);
      Sequence input = (Sequence) instance.getData();
      TokenSequence sourceTokenSequence = null;
      if (instance.getSource() instanceof TokenSequence)
        sourceTokenSequence = (TokenSequence) instance.getSource();

      Sequence trueOutput = (Sequence) instance.getTarget();
      assert (input.size() == trueOutput.size());
      Sequence predOutput = transducerTrainer.getTransducer().transduce (input);
      assert (predOutput.size() == trueOutput.size());
     
      for (int j = 0; j < trueOutput.size(); j++) {
        FeatureVector fv = (FeatureVector) input.get(j);
        //viterbiOutputStream.println (tokens.charAt(j)+" "+trueOutput.get(j).toString()+
        //'/'+predOutput.get(j).toString()+"  "+ fv.toString(true));
        if (sourceTokenSequence != null)
          viterbiOutputStream.print (sourceTokenSequence.get(j).getText()+": ");
        viterbiOutputStream.println (trueOutput.get(j).toString()+
            '/'+predOutput.get(j).toString()+"  "+ fv.toString(true));
      }
    }
  }
View Full Code Here

    /**
     * Creates the pipe, passing {@code null} and a freshly created
     * {@code Alphabet} to the superclass constructor.
     */
    public TestCRFTokenSequenceRemoveSpaces() {
      // NOTE(review): presumably the first argument is the data alphabet
      // (none here) and the second is the target alphabet -- confirm
      // against the superclass constructor signature.
      super(null, new Alphabet());
    }

    public Instance pipe(Instance carrier) {
      TokenSequence ts = (TokenSequence) carrier.getData();
      TokenSequence newTs = new TokenSequence();
      FeatureSequence labelSeq = new FeatureSequence(getTargetAlphabet());
      boolean lastWasSpace = true;
      StringBuffer sb = new StringBuffer();
      for (int i = 0; i < ts.size(); i++) {
        Token t = ts.get(i);
        if (t.getText().equals(" "))
          lastWasSpace = true;
        else {
          sb.append(t.getText());
          newTs.add(t);
          labelSeq.add(lastWasSpace ? "start" : "notstart");
          lastWasSpace = false;
        }
      }
      if (isTargetProcessing())
View Full Code Here

  public Instance pipe (Instance carrier)
  {
    CharSequence string = (CharSequence) carrier.getData();
    lexer.setCharSequence (string);
    TokenSequence ts = new StringTokenization (string);
    while (lexer.hasNext()) {
      lexer.next();
      ts.add (new StringSpan (string, lexer.getStartOffset (), lexer.getEndOffset ()));
    }
    carrier.setData(ts);
    return carrier;
  }
View Full Code Here

        Instance carrier = new Instance (new File(args[i]), null, null, null);
        SerialPipes p = new SerialPipes (new Pipe[] {
          new Input2CharSequence (),
          new CharSequence2TokenSequence(new CharSequenceLexer())});
        carrier = p.newIteratorFrom (new SingleInstanceIterator(carrier)).next();
        TokenSequence ts = (TokenSequence) carrier.getData();
        System.out.println ("===");
        System.out.println (args[i]);
        System.out.println (ts.toString());
      }
    } catch (Exception e) {
      System.out.println (e);
      e.printStackTrace();
    }
View Full Code Here

public class TokenSequenceLowercase extends Pipe implements Serializable
{
 
  public Instance pipe (Instance carrier)
  {
    TokenSequence ts = (TokenSequence) carrier.getData();
    for (int i = 0; i < ts.size(); i++) {
      Token t = ts.get(i);
      t.setText(t.getText().toLowerCase());
    }
    return carrier;
  }
View Full Code Here

    return this;
  }
 
  public Instance pipe (Instance carrier) {
   
    TokenSequence originalSequence = (TokenSequence) carrier.getData();
    TokenSequence newSequence = new TokenSequence();
   
    for (int i = 0; i < originalSequence.size(); i++) {
      Token t = originalSequence.get(i);
     
      boolean passed = true;
      String text = t.getText();
      for (Pattern pattern : stopPatterns) {
        Matcher matcher = pattern.matcher(text);
        if (matcher.matches()) {
          passed = false;
          break;
        }
      }
     
      if (passed) {
        newSequence.add (t);
      }
    }
   
    carrier.setData(newSequence);
    return carrier;
View Full Code Here

TOP

Related Classes of cc.mallet.types.TokenSequence

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact: coftware#gmail.com.