Package loop.lisp

Source Code of loop.lisp.SexprTokenizer

package loop.lisp;

import loop.Token;

import java.util.ArrayList;
import java.util.List;

/**
* @author Dhanji R. Prasanna
*/
public class SexprTokenizer {
  private final String input;

  public SexprTokenizer(String input) {
    this.input = input;
  }

  private static final int NON = 0; // MUST be zero
  private static final int SINGLE_TOKEN = 1;

  private static final int[] DELIMITERS = new int[256];
  private static final boolean[] STRING_TERMINATORS = new boolean[256];

  static {
    // SINGLE token delimiters are one char in length in any context
    DELIMITERS['.'] = SINGLE_TOKEN;
    DELIMITERS[','] = SINGLE_TOKEN;
    DELIMITERS[';'] = SINGLE_TOKEN;
    DELIMITERS['('] = SINGLE_TOKEN;
    DELIMITERS[')'] = SINGLE_TOKEN;
    DELIMITERS['['] = SINGLE_TOKEN;
    DELIMITERS[']'] = SINGLE_TOKEN;
    DELIMITERS['{'] = SINGLE_TOKEN;
    DELIMITERS['}'] = SINGLE_TOKEN;

    STRING_TERMINATORS['"'] = true;
    STRING_TERMINATORS['\''] = true;
    STRING_TERMINATORS['`'] = true;
  }

  public List<Token> tokenize() {
    List<Token> tokens = new ArrayList<Token>();
    char[] input = this.input.toCharArray();

    int line = 0, column = 0;

    int i = 0, start = 0;
    boolean inWhitespace = false, inDelimiter = false, inComment = false;
    char inStringSequence = 0;
    for (; i < input.length; i++) {
      char c = input[i];
      column++;

      if (c == '\n') {
        line++;
        column = 0;
      }

      // strings and sequences
      if (STRING_TERMINATORS[c] && !inComment) {

        if (inStringSequence > 0) {

          // end of the current string sequence. bake.
          if (inStringSequence == c) {
            // +1 to include the terminating token.
            bakeToken(tokens, input, i + 1, start, line, column);
            start = i + 1;

            inStringSequence = 0; // reset to normal language
            continue;
          }
          // it's a string terminator but it's ok, it's part of the string, ignore...

        } else {
          // Also bake if there is any leading tokenage.
          if (i > start) {
            bakeToken(tokens, input, i, start, line, column);
            start = i;
          }

          inStringSequence = c; // start string
        }
      }

      // skip everything if we're in a string...
      if (inStringSequence > 0)
        continue;

      // Comments beginning with #
      if (c == ';' || c == '#') {
        inComment = true;
      }

      // We run the comment until the end of the line
      if (inComment) {
        if (c == '\n')
          inComment = false;

        start = i;
        continue;
      }

      // whitespace is ignored unless it is leading...
      if (Character.isWhitespace(c)) {
        inDelimiter = false;

        if (!inWhitespace) {
          //bake token
          bakeToken(tokens, input, i, start, line, column);
          inWhitespace = true;
        }

        // skip whitespace
        start = i + 1;
        continue;
      }

      // any non-whitespace character encountered
      inWhitespace = false;

      // For delimiters that are 1-char long in all contexts,
      // break early.
      if (isSingleTokenDelimiter(c)) {

        bakeToken(tokens, input, i, start, line, column);
        start = i;

        // Also add the delimiter.
        bakeToken(tokens, input, i + 1, start, line, column);
        start = i + 1;
        continue;
      }

      // is delimiter
      if (isDelimiter(c)) {

        if (!inDelimiter) {
          bakeToken(tokens, input, i, start, line, column);
          inDelimiter = true;
          start = i;
        }

        continue;
      }

      // if coming out of a delimiter, we still need to bake
      if (inDelimiter) {
        bakeToken(tokens, input, i, start, line, column);
        start = i;
        inDelimiter = false;
      }
    }

    // collect residual token
    if (i > start && !inComment) {
      // we don't want trailing whitespace
      bakeToken(tokens, input, i, start, line, column);
    }

    return tokens;
  }


  static boolean isSingleTokenDelimiter(char c) {
    return DELIMITERS[c] == SINGLE_TOKEN;
  }

  private static boolean isDelimiter(char c) {
    return DELIMITERS[c] != NON;
  }

  private static void bakeToken(List<Token> tokens,
                                char[] input,
                                int i,
                                int start,
                                int line,
                                int column) {
    if (i > start) {
      String value = new String(input, start, i - start);

      // Hack to elide whitespace that sneaks in due to the leading whitespace detector.
      if (value.trim().isEmpty())
        return;

      // remove this disgusting hack when you can fix the lexer.
      tokens.add(new Token(value, Token.Kind.determine(value), line, column));
    }
  }
}
TOP

Related Classes of loop.lisp.SexprTokenizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.