Package edu.stanford.nlp.process

Source Code of edu.stanford.nlp.process.LexerTokenizer

package edu.stanford.nlp.process;


import edu.stanford.nlp.io.Lexer;
import edu.stanford.nlp.io.RuntimeIOException;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;


/**
* An implementation of {@link Tokenizer} designed to work with
* {@link Lexer} implementing classes.  Throw in a {@link Lexer} on
* construction and you get a {@link Tokenizer}.
*
* @author Roger Levy
*/
public class LexerTokenizer extends AbstractTokenizer<String> {

  private Lexer lexer;

  /**
   * Internally fetches the next token.
   *
   * @return the next token in the token stream, or null if none exists.
   */
  @Override
  protected String getNext() {
    String token = null;
    try {
      int a = Lexer.IGNORE;
      while ((a = lexer.yylex()) == Lexer.IGNORE) {
        ; // skip tokens to be ignored
      }
      if (a == lexer.getYYEOF()) {
        token = null;
      } else {
        token = lexer.yytext();
      }
    } catch (IOException e) {
      // do nothing, return null
    }
    return token;
  }

  /* Constructs a tokenizer from a {@link Lexer}
   */
  public LexerTokenizer(Lexer l) {
    if (l == null) {
      throw new IllegalArgumentException("You can't make a Tokenizer out of a null Lexer!");
    } else {
      this.lexer = l;
    }
  }

  /* Constructs a tokenizer from a {@link Lexer} and makes a {@link
   * Reader} the active input stream for the tokenizer.
   */
  public LexerTokenizer(Lexer l, Reader r) {
    this(l);

    try {
      l.yyreset(r);
    } catch (IOException e) {
      throw new RuntimeIOException(e.getMessage());
    }

    getNext();
  }


  /**
   * for testing only
   */
  public static void main(String[] args) throws IOException {
    Tokenizer<String> t = new LexerTokenizer(new JFlexDummyLexer((Reader) null), new BufferedReader(new FileReader(args[0])));
    while (t.hasNext()) {
      System.out.println("token " + t.next());
    }
  }

}
TOP

Related Classes of edu.stanford.nlp.process.LexerTokenizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.