Source Code of hampi.grammars.lexer.Lexer

/*******************************************************************************
 * The MIT License
 * 
 * Copyright (c) 2008 Adam Kiezun
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 ******************************************************************************/
package hampi.grammars.lexer;


import hampi.grammars.apps.GramgenException;
import hampi.utils.Files;


import java.io.IOException;
import java.util.*;


/**
 * Reads the input file and returns a stream of tokens.
 */
public final class Lexer{


  private char[]            m_data;
  private int               m_count;
  private int               m_size;
  private int               m_currentPtr;
  private int               m_line;
  private final List<Token> m_tokens;


  // Create the lexer from the file given by fname
  public Lexer(String fname) throws IOException{
    m_currentPtr = 0;
    m_count = 0;
    m_line = 1;
    m_tokens = new ArrayList<Token>();


    readFile(fname);
  }


  // How many tokens have been read so far.
  public int getReadTokenCount(){
    return m_count;
  }


  // Prints token names .
  public String tokenString(){
    StringBuilder sb = new StringBuilder();
    Token t = next();
    while (t.m_kind != TokenKind.TEOF){
      sb.append(t.toString());
      sb.append(" ");
      t = next();
    }
    sb.append(t.toString());
    sb.append("\n");
    return sb.toString();
  }


  // Read in the file in one big chunk.
  public void readFile(String fname) throws IOException{
    String fileContents = Files.getFileContents(fname);
    if (fileContents == null){
      System.out.printf("ERROR: unable to open file %s\n", fname);
      throw new GramgenException(1);
    }


    m_size = fileContents.length();
    m_data = fileContents.toCharArray();
  }


  private Token createSimpleToken(TokenKind k){
    Token token = new SimpleToken(k, m_line);
    addToken(token);
    return token;
  }


  private Token createNonterminalToken(int offset, int size){
    String str = new String(m_data, offset, size);// name of the nonterminal
    Token token = new NonterminalToken(str, m_line);
    addToken(token);
    return token;
  }


  private Token createTerminalToken(int offset, int size){
    String str = new String(m_data, offset, size);// name of the terminal
    Token token = new TerminalToken(str, m_line);
    addToken(token);
    return token;
  }


  private Token createSpecialToken(int offset, int size){
    String str = new String(m_data, offset, size);// name of the special token
    Token token = new SpecialToken(str, m_line);
    addToken(token);
    return token;
  }


  private void addToken(Token token){
    m_tokens.add(token);
  }


  // consumes white space characters and update the line number counter.
  private void eatWS(){
    while (m_currentPtr < m_size && (m_data[m_currentPtr] == '\n' || m_data[m_currentPtr] == '\r' || m_data[m_currentPtr] == '\t' || m_data[m_currentPtr] == ' ')){
      if (m_currentPtr < m_size - 1 && m_data[m_currentPtr] == '\r' && m_data[m_currentPtr + 1] == '\n'){
        m_line++; // NOTE works only for LF+CR newlines
      }


      m_currentPtr++;
    }
  }


  // Creates and returns the terminal token that starts at the current
  // position.
  private Token readTerminal(){
    // scan until unescaped double quote
    int i = m_currentPtr + 1;


    for (; i < m_size; i++){
      if (m_data[i] == '"' && i == 0){
        break;
      }
      if (m_data[i] == '"' && m_data[i - 1] != '\\'){
        break;
      }
    }


    // now i points one past the quote (or exactly one past the end in case
    // of EOF)
    int size = i - m_currentPtr + 1;
    int offset = m_currentPtr;
    m_currentPtr += size;


    return createTerminalToken(offset, size);
  }


  // Creates and returns the special token that starts at the current
  // position.
  private Token readSpecial(){
    // scan until closing angle bracket
    int i = m_currentPtr;


    for (; i < m_size; i++){
      if (m_data[i] == '>'){
        break;
      }
    }


    // now i points one past the '>' (or exactly one past the end in case of
    // EOF)
    int size = i - m_currentPtr + 1;
    int offset = m_currentPtr;
    m_currentPtr += size;


    return createSpecialToken(offset, size);
  }


  // Creates and returns the nonterminal token that starts at the current
  // position.
  private Token readNonterminal(){
    // scan until not alpha
    int i = m_currentPtr;


    for (; i < m_size; i++){
      boolean isFirstCharacter = i == m_currentPtr;
      if (isFirstCharacter){
        if (!isAlphaOrUnderscore(m_data[i])){
          break;
        }
      }else{
        if (!isAlphaOrUnderscore(m_data[i]) && !isDigit(m_data[i])){
          break;
        }
      }
    }


    // now i points one past the name (or exactly one past the end in case
    // of EOF)
    int size = i - m_currentPtr;
    int offset = m_currentPtr;
    m_currentPtr += size;


    return createNonterminalToken(offset, size);
  }


  // Returns the kind of the next token, without advancing the current-token
  // pointer.
  public TokenKind lookahead(){
    // save state, read token, restore state


    int size = m_size;
    int currentPtr = m_currentPtr;
    int line = m_line;


    Token t = next();


    m_size = size;
    m_currentPtr = currentPtr;
    m_line = line;
    if (!m_tokens.isEmpty()){
      m_tokens.remove(m_tokens.size() - 1);
      m_count--;
    }


    TokenKind result = t.m_kind;
    return result;
  }


  // Returns the next token in the stream. If you read past the end, it will
  // keep returning TEOF.
  public Token next(){
    eatWS();
    m_count++;


    if (m_currentPtr >= m_size)
      return createSimpleToken(TokenKind.TEOF);


    char ch = m_data[m_currentPtr];
    switch (ch){
    case '(': {
      m_currentPtr++;
      return createSimpleToken(TokenKind.TOPENP);
    }
    case ')': {
      m_currentPtr++;
      return createSimpleToken(TokenKind.TCLOSEP);
    }
    case '|': {
      m_currentPtr++;
      return createSimpleToken(TokenKind.TBAR);
    }
    case '+': {
      m_currentPtr++;
      return createSimpleToken(TokenKind.TPLUS);
    }
    case '?': {
      m_currentPtr++;
      return createSimpleToken(TokenKind.TOPTION);
    }
    case '*': {
      m_currentPtr++;
      return createSimpleToken(TokenKind.TSTAR);
    }
    case '=': {
      m_currentPtr++;
      return createSimpleToken(TokenKind.TEQUALS);
    }
    case ';': {
      m_currentPtr++;
      return createSimpleToken(TokenKind.TSEMICOLON);
    }
    case '"': {
      return readTerminal();// counter increased inside based on size
    }
    case '<': {
      return readSpecial();// counter increased inside based on size
    }
    default: {
      if (isAlphaOrUnderscore(ch))
        return readNonterminal();// counter increased inside based on
      // size
      else{
        System.out.printf("Next: unexpected character '%c' at position %d line %d\n", ch, m_currentPtr, m_line);
        throw new GramgenException(1);
      }
    }
    }
  }


  private boolean isAlphaOrUnderscore(char ch){
    return ch == '_' || ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z';
  }


  private boolean isDigit(char ch){
    return ch >= '0' && ch <= '9';
  }
}
Source Code of hampi.grammars.lexer.Lexer

Related Classes of hampi.grammars.lexer.Lexer