Source Code of edu.stanford.nlp.process.WhitespaceTokenizer$WhitespaceTokenizerFactory

package edu.stanford.nlp.process;


import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.objectbank.TokenizerFactory;

import java.io.*;
import java.util.Iterator;

/**
* A WhitespaceTokenizer is a tokenizer that splits on and discards only
* whitespace characters.
* This implementation returns Word objects. It has a parameter controlling
* whether EOL is made a token or EOL characters are treated as ordinary
* whitespace. If an EOL is a token, the class returns it as a Word with the
* String value "\n".
* <p/>
* <i>Implementation note:</i> This was rewritten in Apr 2006 to discard the
* old StreamTokenizer based implementation and to replace it with a
* Unicode compliant JFlex-based version.
*
* @author Joseph Smarr (jsmarr@stanford.edu)
* @author Teg Grenager (grenager@stanford.edu)
* @author Roger Levy
* @author Christopher Manning
*/
public class WhitespaceTokenizer extends AbstractTokenizer<Word> {

  /**
   * A factory which vends WhitespaceTokenizers.
   *
   * @author Christopher Manning
   */
  public static class WhitespaceTokenizerFactory implements TokenizerFactory<Word> {

    private final boolean eolIsSignificant;

    /**
     * Constructs a new TokenizerFactory that returns Word objects and
     * treats carriage returns as normal whitespace.
     * THIS METHOD IS INVOKED BY REFLECTION BY SOME OF THE JAVANLP
     * CODE TO LOAD A TOKENIZER FACTORY.  IT SHOULD BE PRESENT IN A
     * TokenizerFactory.
     *
     * @return A TokenizerFactory that returns Word objects
     */
    public static TokenizerFactory<Word> newTokenizerFactory() {
      return new WhitespaceTokenizerFactory();
    }
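
    // A hedged sketch (not in the original source) of how a factory method like
    // this might be loaded reflectively; the class and method names are real,
    // but the loading code below is illustrative only:
    //
    //   Class<?> c = Class.forName(
    //       "edu.stanford.nlp.process.WhitespaceTokenizer$WhitespaceTokenizerFactory");
    //   TokenizerFactory<Word> tf =
    //       (TokenizerFactory<Word>) c.getMethod("newTokenizerFactory").invoke(null);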

    public WhitespaceTokenizerFactory() {
      this(false);
    }

    public WhitespaceTokenizerFactory(boolean eolIsSignificant) {
      this.eolIsSignificant = eolIsSignificant;
    }

    public Iterator<Word> getIterator(Reader r) {
      return getTokenizer(r);
    }

    public Tokenizer<Word> getTokenizer(Reader r) {
      return new WhitespaceTokenizer(r, eolIsSignificant);
    }

  } // end class WhitespaceTokenizerFactory


  private WhitespaceLexer lexer;
  private final boolean eolIsSignificant;

  /**
   * Internally fetches the next token.
   *
   * @return the next token in the token stream, or null if none exists.
   */
  @Override
  protected Word getNext() {
    Word token = null;
    if (lexer == null) {
      return token;
    }
    try {
      token = lexer.next();
      while (token == WhitespaceLexer.crValue) {
        if (eolIsSignificant) {
          return token;
        } else {
          token = lexer.next();
        }
      }
    } catch (IOException e) {
      // do nothing, return null
    }
    return token;
  }

  /**
   * Constructs a new WhitespaceTokenizer.
   *
   * @param r The Reader that is its source.
   */
  public WhitespaceTokenizer(Reader r) {
    this(r, false);
  }


  /**
   * Constructs a new WhitespaceTokenizer.
   *
   * @param r The Reader that is its source.
   * @param eolIsSignificant Whether EOL tokens should be returned.
   */
  public WhitespaceTokenizer(Reader r, boolean eolIsSignificant) {
    this.eolIsSignificant = eolIsSignificant;
    // The null check below is perhaps currently needed because LexicalizedParser
    // passes in a null Reader while doing type-checking for sentence escaping,
    // and the lexer cannot handle that.  But maybe it shouldn't be here.
    if (r != null) {
      lexer = new WhitespaceLexer(r);
    }
  }

  /* ----
   * Sets the source of this Tokenizer to be the Reader r.

  private void setSource(Reader r) {
    lexer = new WhitespaceLexer(r);
  }
  ---- */

  public static TokenizerFactory<Word> factory() {
    return new WhitespaceTokenizerFactory(false);
  }

  public static TokenizerFactory<Word> factory(boolean eolIsSignificant) {
    return new WhitespaceTokenizerFactory(eolIsSignificant);
  }
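
  // A minimal usage sketch (not in the original source), assuming an in-memory
  // java.io.StringReader as the input source:
  //
  //   TokenizerFactory<Word> factory = WhitespaceTokenizer.factory(true);
  //   Tokenizer<Word> tok = factory.getTokenizer(new StringReader("one two\nthree"));
  //   while (tok.hasNext()) {
  //     Word w = tok.next();  // with eolIsSignificant, "\n" comes back as a Word token
  //     System.out.println(w);
  //   }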

  /**
   * Reads a file given as an argument and prints its tokens one per line.
   * This is mainly a testing aid, but it can also be quite useful
   * standalone to turn a corpus into a one-token-per-line file of tokens.
   * <p/>
   * Usage: <code>java edu.stanford.nlp.process.WhitespaceTokenizer [-cr] filename
   * </code>
   *
   * @param args Command line arguments
   * @throws IOException If the file cannot be opened or read
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 1) {
      System.err.println("usage: java edu.stanford.nlp.process.WhitespaceTokenizer [-cr] filename");
      return;
    }
    Reader in = new InputStreamReader(new FileInputStream(args[args.length - 1]), "UTF-8");
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(in, args[0].equals("-cr"));
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"), true);
    while (tokenizer.hasNext()) {
      Word w = tokenizer.next();
      if (w == WhitespaceLexer.crValue) {
        pw.println("***CR***");
      } else {
        pw.println(w);
      }
    }
  }
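
  // Example invocation (a sketch; the file name is hypothetical):
  //   java edu.stanford.nlp.process.WhitespaceTokenizer -cr corpus.txt
  // With -cr, end-of-line tokens are printed as the marker ***CR***; without it,
  // line breaks are treated as ordinary whitespace and discarded.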

} // end class WhitespaceTokenizer