// Source code of org.stringtemplate.v4.compiler.STLexer (including nested class STLexer$STToken)

/*
 * [The "BSD license"]
 *  Copyright (c) 2011 Terence Parr
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. The name of the author may not be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.stringtemplate.v4.compiler;


import org.antlr.runtime.CharStream;
import org.antlr.runtime.CommonToken;
import org.antlr.runtime.MismatchedTokenException;
import org.antlr.runtime.NoViableAltException;
import org.antlr.runtime.RecognitionException;
import org.antlr.runtime.Token;
import org.antlr.runtime.TokenSource;
import org.stringtemplate.v4.STGroup;
import org.stringtemplate.v4.misc.ErrorManager;
import org.stringtemplate.v4.misc.Misc;


import java.util.ArrayList;
import java.util.List;


/**
 * This class represents the tokenizer for templates. It operates in two modes:
 * inside and outside of expressions. It implements the {@link TokenSource}
 * interface so it can be used with ANTLR parsers. Outside of expressions, we
 * can return these token types: {@link #TEXT}, {@link #INDENT}, {@link #LDELIM}
 * (start of expression), {@link #RCURLY} (end of subtemplate), and
 * {@link #NEWLINE}. Inside of an expression, this lexer returns all of the
 * tokens needed by {@link STParser}. From the parser's point of view, it can
 * treat a template as a simple stream of elements.
 * <p>
 * This class defines the token types and communicates these values to
 * {@code STParser.g} via {@code STLexer.tokens} file (which must remain
 * consistent).</p>
 */
public class STLexer implements TokenSource {
    /**
     * End-of-input sentinel for the current-character field {@code c}:
     * the value -1 narrowed to {@code char}. The lexer stores each lookahead
     * as {@code (char)input.LA(1)} and compares it against this constant.
     */
    public static final char EOF = (char)-1;            // EOF char
    /** Token type given to the end-of-file token; taken from {@link CharStream#EOF}. */
    public static final int EOF_TYPE = CharStream.EOF;  // EOF token type


    /** We build {@code STToken} tokens instead of relying on {@link CommonToken}
   *  so we can override {@link #toString()}. It just converts token types to
     *  token names like 23 to {@code "LDELIM"}.
     */
    public static class STToken extends CommonToken {
        public STToken(CharStream input, int type, int start, int stop) {
            super(input, type, DEFAULT_CHANNEL, start, stop);
        }
        public STToken(int type, String text) { super(type, text); }


    @Override
        public String toString() {
            String channelStr = "";
            if ( channel>0 ) {
                channelStr=",channel="+channel;
            }
            String txt = getText();
            if ( txt!=null ) txt = Misc.replaceEscapes(txt);
            else txt = "<no text>";
      String tokenName = null;
      if ( type==EOF_TYPE ) tokenName = "EOF";
      else tokenName = STParser.tokenNames[type];
      return "[@"+getTokenIndex()+","+start+":"+stop+"='"+txt+"',<"+ tokenName +">"+channelStr+","+line+":"+getCharPositionInLine()+"]";
        }
    }


    /**
     * Sentinel returned by lexer routines that consumed input without
     * producing a real token. {@link #_nextToken()} compares against it by
     * identity and keeps scanning when it sees it.
     */
    public static final Token SKIP = new STToken(-1, "<skip>");


    // Token type numbers. These must stay in step with the STLexer.tokens
    // file that STParser.g loads, so never renumber one side without the
    // other. Note that no constant with the value 27 is declared in this list.
    public static final int RBRACK=17;
    public static final int LBRACK=16;
    public static final int ELSE=5;
    public static final int ELLIPSIS=11;
    public static final int LCURLY=20;
    public static final int BANG=10;
    public static final int EQUALS=12;
    public static final int TEXT=22;
    public static final int ID=25;
    public static final int SEMI=9;
    public static final int LPAREN=14;
    public static final int IF=4;
    public static final int ELSEIF=6;
    public static final int COLON=13;
    public static final int RPAREN=15;
    public static final int COMMA=18;
    public static final int RCURLY=21;
    public static final int ENDIF=7;
    public static final int RDELIM=24;
    public static final int SUPER=8;
    public static final int DOT=19;
    public static final int LDELIM=23;
    public static final int STRING=26;
  public static final int PIPE=28;
  public static final int OR=29;
  public static final int AND=30;
  public static final int INDENT=31;
    public static final int NEWLINE=32;
    public static final int AT=33;
    public static final int REGION_END=34;
  public static final int TRUE=35;
  public static final int FALSE=36;
  public static final int COMMENT=37;




    /** The char which delimits the start of an expression. */
    char delimiterStartChar = '<';
    /** The char which delimits the end of an expression. */
    char delimiterStopChar = '>';


  /**
   * This keeps track of the current mode of the lexer. Are we inside or
   * outside an ST expression?
   */
    boolean scanningInsideExpr = false;


    /** To be able to properly track the inside/outside mode, we need to
     *  track how deeply nested we are in subtemplates. The counter is
     *  incremented when a <code>'{'</code> opens a subtemplate inside an
     *  expression and decremented when a <code>'}'</code> is seen outside of
     *  an expression while the counter is positive; that <code>'}'</code>
     *  also switches the lexer back to inside mode. A <code>'}'</code> seen
     *  while the counter is zero is scanned as ordinary text.
     */
  public int subtemplateDepth = 0; // start out *not* in a {...} subtemplate


  /** Receives the lexer error reports issued by this class. */
  ErrorManager errMgr;


  /** template embedded in a group file? this is the template */
  Token templateToken;


    /** The character stream being tokenized. */
    CharStream input;
  /** current character, i.e. {@code (char)input.LA(1)} */
    char c;


    /** When we started token, track initial coordinates so we can properly
     *  build token objects.
     */
    int startCharIndex;
    int startLine;
    int startCharPositionInLine;


    /** Our lexer routines might have to emit more than a single token. We
     *  buffer everything through this list; {@link #nextToken()} drains it
     *  from the front before scanning further input.
     */
    List<Token> tokens = new ArrayList<Token>();


  public STLexer(CharStream input) { this(STGroup.DEFAULT_ERR_MGR, input, null, '<', '>'); }


    public STLexer(ErrorManager errMgr, CharStream input, Token templateToken) {
    this(errMgr, input, templateToken, '<', '>');
  }


  public STLexer(ErrorManager errMgr,
           CharStream input,
           Token templateToken,
           char delimiterStartChar,
           char delimiterStopChar)
  {
    this.errMgr = errMgr;
    this.input = input;
    c = (char)input.LA(1); // prime lookahead
    this.templateToken = templateToken;
    this.delimiterStartChar = delimiterStartChar;
    this.delimiterStopChar = delimiterStopChar;
  }


  @Override
  public Token nextToken() {
    Token t;
    if ( tokens.size()>0 ) { t = tokens.remove(0); }
    else t = _nextToken();
//    System.out.println(t);
    return t;
  }


    /** Consume if {@code x} is next character on the input stream.
   */
    public void match(char x) {
        if ( c != x ) {
      NoViableAltException e = new NoViableAltException("",0,0,input);
      errMgr.lexerError(input.getSourceName(), "expecting '"+x+"', found '"+str(c)+"'", templateToken, e);
    }
    consume();
    }


    protected void consume() {
        input.consume();
        c = (char)input.LA(1);
    }


    public void emit(Token token) { tokens.add(token); }


    public Token _nextToken() {
    //System.out.println("nextToken: c="+(char)c+"@"+input.index());
        while ( true ) { // lets us avoid recursion when skipping stuff
            startCharIndex = input.index();
            startLine = input.getLine();
            startCharPositionInLine = input.getCharPositionInLine();


            if ( c==EOF ) return newToken(EOF_TYPE);
            Token t;
            if ( scanningInsideExpr ) t = inside();
            else t = outside();
            if ( t!=SKIP ) return t;
        }
    }


    protected Token outside() {
        if ( input.getCharPositionInLine()==0 && (c==' '||c=='\t') ) {
            while ( c==' ' || c=='\t' ) consume(); // scarf indent
            if ( c!=EOF ) return newToken(INDENT);
            return newToken(TEXT);
        }
        if ( c==delimiterStartChar ) {
            consume();
            if ( c=='!' ) return COMMENT();
            if ( c=='\\' ) return ESCAPE(); // <\\> <\uFFFF> <\n> etc...
            scanningInsideExpr = true;
            return newToken(LDELIM);
        }
        if ( c=='\r' ) { consume(); consume(); return newToken(NEWLINE); } // \r\n -> \n
        if ( c=='\n') {  consume(); return newToken(NEWLINE); }
        if ( c=='}' && subtemplateDepth>0 ) {
            scanningInsideExpr = true;
            subtemplateDepth--;
            consume();
            return newTokenFromPreviousChar(RCURLY);
        }
        return mTEXT();
    }


    protected Token inside() {
        while ( true ) {
            switch ( c ) {
                case ' ': case '\t': case '\n': case '\r':
          consume();
          return SKIP;
                case '.' :
          consume();
          if ( input.LA(1)=='.' && input.LA(2)=='.' ) {
            consume();
            match('.');
            return newToken(ELLIPSIS);
          }
          return newToken(DOT);
                case ',' : consume(); return newToken(COMMA);
        case ':' : consume(); return newToken(COLON);
        case ';' : consume(); return newToken(SEMI);
                case '(' : consume(); return newToken(LPAREN);
                case ')' : consume(); return newToken(RPAREN);
                case '[' : consume(); return newToken(LBRACK);
                case ']' : consume(); return newToken(RBRACK);
        case '=' : consume(); return newToken(EQUALS);
                case '!' : consume(); return newToken(BANG);
                case '@' :
                    consume();
                    if ( c=='e' && input.LA(2)=='n' && input.LA(3)=='d' ) {
                        consume(); consume(); consume();
                        return newToken(REGION_END);
                    }
                    return newToken(AT);
                case '"' : return mSTRING();
                case '&' : consume(); match('&'); return newToken(AND); // &&
                case '|' : consume(); match('|'); return newToken(OR); // ||
        case '{' : return subTemplate();
        default:
          if ( c==delimiterStopChar ) {
            consume();
            scanningInsideExpr =false;
            return newToken(RDELIM);
          }
                    if ( isIDStartLetter(c) ) {
            Token id = mID();
            String name = id.getText();
            if ( name.equals("if") ) return newToken(IF);
            else if ( name.equals("endif") ) return newToken(ENDIF);
            else if ( name.equals("else") ) return newToken(ELSE);
            else if ( name.equals("elseif") ) return newToken(ELSEIF);
            else if ( name.equals("super") ) return newToken(SUPER);
            else if ( name.equals("true") ) return newToken(TRUE);
            else if ( name.equals("false") ) return newToken(FALSE);
            return id;
          }
          RecognitionException re =
            new NoViableAltException("",0,0,input);
                    re.line = startLine;
                    re.charPositionInLine = startCharPositionInLine;
          errMgr.lexerError(input.getSourceName(), "invalid character '"+str(c)+"'", templateToken, re);
          if (c==EOF) {
            return newToken(EOF_TYPE);
          }
          consume();
            }
        }
    }


    /**
     * Scans a <code>'{'</code> found inside an expression, i.e. the start of
     * a subtemplate. It speculatively looks for an argument list of the form
     * {@code ID (',' ID)* '|'} right after the brace. If one is found, the
     * argument tokens ({@code ID}, {@code COMMA} and the closing {@code PIPE})
     * are buffered via {@link #emit(Token)} so that they follow the returned
     * {@code LCURLY}. If not, the input is rewound to just after the brace
     * and only {@code LCURLY} is returned. In both cases the subtemplate
     * depth is incremented and the lexer switches to outside mode.
     *
     * @return the {@code LCURLY} token for the opening brace
     */
    Token subTemplate() {
        // look for "{ args ID (',' ID)* '|' ..."
    subtemplateDepth++;
        // mark is taken while '{' is still the current char (not yet consumed)
        int m = input.mark();
        // mID() overwrites the start coordinates, so remember them for restoring later
        int curlyStartChar = startCharIndex;
        int curlyLine = startLine;
        int curlyPos = startCharPositionInLine;
        List<Token> argTokens = new ArrayList<Token>();
        consume();
    Token curly = newTokenFromPreviousChar(LCURLY);
        WS();
        // mID() consumes at least one char unconditionally; a wrong guess is undone by the rewind below
        argTokens.add( mID() );
        WS();
        while ( c==',' ) {
      consume();
            argTokens.add( newTokenFromPreviousChar(COMMA) );
            WS();
            argTokens.add( mID() );
            WS();
        }
        WS();
        if ( c=='|' ) {
      // speculation succeeded: keep the consumed input and publish the arg tokens
      consume();
            argTokens.add( newTokenFromPreviousChar(PIPE) );
            if ( isWS(c) ) consume(); // ignore a single whitespace after |
            //System.out.println("matched args: "+argTokens);
            for (Token t : argTokens) emit(t);
      input.release(m);
      scanningInsideExpr = false;
      startCharIndex = curlyStartChar; // reset state
      startLine = curlyLine;
      startCharPositionInLine = curlyPos;
      return curly;
    }
    // no arg list: go back to the mark (positioned on '{') ...
    input.rewind(m);
    startCharIndex = curlyStartChar; // reset state
    startLine = curlyLine;
        startCharPositionInLine = curlyPos;
    // ... and step past '{' again; this also reloads the current char
    consume();
    scanningInsideExpr = false;
        return curly;
    }


    Token ESCAPE() {
    startCharIndex = input.index();
    startCharPositionInLine = input.getCharPositionInLine();
    consume(); // kill \\
    if ( c=='u') return UNICODE();
    String text = null;
        switch ( c ) {
            case '\\' : LINEBREAK(); return SKIP;
      case 'n'  : text = "\n"; break;
      case 't'  : text = "\t"; break;
      case ' '  : text = " "; break;
            default :
                NoViableAltException e = new NoViableAltException("",0,0,input);
                errMgr.lexerError(input.getSourceName(), "invalid escaped char: '"+str(c)+"'", templateToken, e);
        consume();
        match(delimiterStopChar);
        return SKIP;
        }
        consume();
    Token t = newToken(TEXT, text, input.getCharPositionInLine()-2);
        match(delimiterStopChar);
        return t;
    }


    Token UNICODE() {
        consume();
        char[] chars = new char[4];
        if ( !isUnicodeLetter(c) ) {
            NoViableAltException e = new NoViableAltException("",0,0,input);
            errMgr.lexerError(input.getSourceName(), "invalid unicode char: '"+str(c)+"'", templateToken, e);
        }
        chars[0] = c;
        consume();
        if ( !isUnicodeLetter(c) ) {
            NoViableAltException e = new NoViableAltException("",0,0,input);
      errMgr.lexerError(input.getSourceName(), "invalid unicode char: '"+str(c)+"'", templateToken, e);
        }
        chars[1] = c;
        consume();
        if ( !isUnicodeLetter(c) ) {
            NoViableAltException e = new NoViableAltException("",0,0,input);
      errMgr.lexerError(input.getSourceName(), "invalid unicode char: '"+str(c)+"'", templateToken, e);
        }
        chars[2] = c;
        consume();
        if ( !isUnicodeLetter(c) ) {
            NoViableAltException e = new NoViableAltException("",0,0,input);
      errMgr.lexerError(input.getSourceName(), "invalid unicode char: '"+str(c)+"'", templateToken, e);
        }
        chars[3] = c;
        // ESCAPE kills >
        char uc = (char)Integer.parseInt(new String(chars), 16);
        Token t = newToken(TEXT, String.valueOf(uc), input.getCharPositionInLine()-6);
    consume();
    match(delimiterStopChar);
    return t;
    }


    Token mTEXT() {
    boolean modifiedText = false;
        StringBuilder buf = new StringBuilder();
        while ( c != EOF && c != delimiterStartChar ) {
      if ( c=='\r' || c=='\n') break;
      if ( c=='}' && subtemplateDepth>0 ) break;
            if ( c=='\\' ) {
                if ( input.LA(2)=='\\' ) { // convert \\ to \
                    consume(); consume(); buf.append('\\');
                    modifiedText = true;
                    continue;
                }
                if ( input.LA(2)==delimiterStartChar ||
           input.LA(2)=='}' )
        {
                    modifiedText = true;
                    consume(); // toss out \ char
                    buf.append(c); consume();
                }
                else {
                    buf.append(c);
                    consume();
                }
                continue;
            }
            buf.append(c);
            consume();
        }
        if ( modifiedText )  return newToken(TEXT, buf.toString());
        else return newToken(TEXT);
    }


    /** <pre>
   *  ID  : ('a'..'z'|'A'..'Z'|'_'|'/')
   *        ('a'..'z'|'A'..'Z'|'0'..'9'|'_'|'/')*
   *      ;
   *  </pre>
   */
    Token mID() {
        // called from subTemplate; so keep resetting position during speculation
        startCharIndex = input.index();
        startLine = input.getLine();
        startCharPositionInLine = input.getCharPositionInLine();
        consume();
        while ( isIDLetter(c) ) {
            consume();
        }
        return newToken(ID);
    }


    /** <pre>
   *  STRING : '"'
   *           (   '\\' '"'
   *           |   '\\' ~'"'
   *           |   ~('\\'|'"')
   *           )*
   *           '"'
   *         ;
   * </pre>
   */
    Token mSTRING() {
      //{setText(getText().substring(1, getText().length()-1));}
        boolean sawEscape = false;
        StringBuilder buf = new StringBuilder();
        buf.append(c); consume();
        while ( c != '"' ) {
            if ( c=='\\' ) {
                sawEscape = true;
                consume();
        switch ( c ) {
          case 'n' : buf.append('\n'); break;
          case 'r' : buf.append('\r'); break;
          case 't' : buf.append('\t'); break;
                  default : buf.append(c); break;
        }
        consume();
                continue;
            }
            buf.append(c);
            consume();
      if ( c==EOF ) {
        RecognitionException re =
          new MismatchedTokenException((int)'"', input);
        re.line = input.getLine();
        re.charPositionInLine = input.getCharPositionInLine();
        errMgr.lexerError(input.getSourceName(), "EOF in string", templateToken, re);
        break;
      }
        }
        buf.append(c);
        consume();
        if ( sawEscape ) return newToken(STRING, buf.toString());
        else return newToken(STRING);
    }


    void WS() {
        while ( c==' ' || c=='\t' || c=='\n' || c=='\r' ) consume();
    }


    Token COMMENT() {
        match('!');
        while ( !(c=='!' && input.LA(2)==delimiterStopChar) ) {
      if (c==EOF) {
        RecognitionException re =
          new MismatchedTokenException((int)'!', input);
        re.line = input.getLine();
        re.charPositionInLine = input.getCharPositionInLine();
        errMgr.lexerError(input.getSourceName(), "Nonterminated comment starting at " +
          startLine+":"+startCharPositionInLine+": '!"+
          delimiterStopChar+"' missing", templateToken, re);
        break;
      }
      consume();
    }
        consume(); consume(); // grab !>
    return newToken(COMMENT);
    }


    void LINEBREAK() {
        match('\\'); // only kill 2nd \ as ESCAPE() kills first one
        match(delimiterStopChar);
        while ( c==' ' || c=='\t' ) consume(); // scarf WS after <\\>
    if ( c==EOF ) {
      RecognitionException re = new RecognitionException(input);
      re.line = input.getLine();
      re.charPositionInLine = input.getCharPositionInLine();
      errMgr.lexerError(input.getSourceName(), "Missing newline after newline escape <\\\\>",
                      templateToken, re);
      return;
    }
    if ( c=='\r' ) consume();
        match('\n');
        while ( c==' ' || c=='\t' ) consume(); // scarf any indent
    }


    public static boolean isIDStartLetter(char c) { return isIDLetter(c); }
  public static boolean isIDLetter(char c) { return c>='a'&&c<='z' || c>='A'&&c<='Z' || c>='0'&&c<='9' || c=='_' || c=='/'; }
    public static boolean isWS(char c) { return c==' ' || c=='\t' || c=='\n' || c=='\r'; }
    public static boolean isUnicodeLetter(char c) { return c>='a'&&c<='f' || c>='A'&&c<='F' || c>='0'&&c<='9'; }


    public Token newToken(int ttype) {
        STToken t = new STToken(input, ttype, startCharIndex, input.index()-1);
        t.setLine(startLine);
        t.setCharPositionInLine(startCharPositionInLine);
    return t;
  }


    public Token newTokenFromPreviousChar(int ttype) {
        STToken t = new STToken(input, ttype, input.index()-1, input.index()-1);
        t.setLine(input.getLine());
        t.setCharPositionInLine(input.getCharPositionInLine()-1);
        return t;
    }


    public Token newToken(int ttype, String text, int pos) {
        STToken t = new STToken(ttype, text);
    t.setStartIndex(startCharIndex);
    t.setStopIndex(input.index()-1);
        t.setLine(input.getLine());
        t.setCharPositionInLine(pos);
        return t;
    }


  public Token newToken(int ttype, String text) {
    STToken t = new STToken(ttype, text);
        t.setStartIndex(startCharIndex);
        t.setStopIndex(input.index()-1);
    t.setLine(startLine);
    t.setCharPositionInLine(startCharPositionInLine);
    return t;
  }


//    public String getErrorHeader() {
//        return startLine+":"+startCharPositionInLine;
//    }
//
  @Override
    public String getSourceName() {
        return "no idea";
    }


  public static String str(int c) {
    if ( c==EOF ) return "<EOF>";
    return String.valueOf((char)c);
  }
}
// End of source code of org.stringtemplate.v4.compiler.STLexer (including nested class STLexer$STToken)

// Related classes of org.stringtemplate.v4.compiler.STLexer$STToken