Source Code of org.antlr.v4.automata.LexerATNFactory

/*
 * [The "BSD license"]
 *  Copyright (c) 2012 Terence Parr
 *  Copyright (c) 2012 Sam Harwell
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. The name of the author may not be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


package org.antlr.v4.automata;


import org.antlr.runtime.CommonToken;
import org.antlr.runtime.Token;
import org.antlr.v4.codegen.CodeGenerator;
import org.antlr.v4.misc.CharSupport;
import org.antlr.v4.parse.ANTLRParser;
import org.antlr.v4.runtime.IntStream;
import org.antlr.v4.runtime.Lexer;
import org.antlr.v4.runtime.atn.ATN;
import org.antlr.v4.runtime.atn.ATNState;
import org.antlr.v4.runtime.atn.ActionTransition;
import org.antlr.v4.runtime.atn.AtomTransition;
import org.antlr.v4.runtime.atn.LexerAction;
import org.antlr.v4.runtime.atn.LexerChannelAction;
import org.antlr.v4.runtime.atn.LexerCustomAction;
import org.antlr.v4.runtime.atn.LexerModeAction;
import org.antlr.v4.runtime.atn.LexerMoreAction;
import org.antlr.v4.runtime.atn.LexerPopModeAction;
import org.antlr.v4.runtime.atn.LexerPushModeAction;
import org.antlr.v4.runtime.atn.LexerSkipAction;
import org.antlr.v4.runtime.atn.LexerTypeAction;
import org.antlr.v4.runtime.atn.NotSetTransition;
import org.antlr.v4.runtime.atn.RangeTransition;
import org.antlr.v4.runtime.atn.RuleStartState;
import org.antlr.v4.runtime.atn.SetTransition;
import org.antlr.v4.runtime.atn.TokensStartState;
import org.antlr.v4.runtime.atn.Transition;
import org.antlr.v4.runtime.misc.Interval;
import org.antlr.v4.runtime.misc.IntervalSet;
import org.antlr.v4.runtime.misc.NotNull;
import org.antlr.v4.runtime.misc.Nullable;
import org.antlr.v4.tool.ErrorType;
import org.antlr.v4.tool.LexerGrammar;
import org.antlr.v4.tool.Rule;
import org.antlr.v4.tool.ast.ActionAST;
import org.antlr.v4.tool.ast.GrammarAST;
import org.antlr.v4.tool.ast.TerminalAST;
import org.stringtemplate.v4.ST;
import org.stringtemplate.v4.STGroup;


import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;


public class LexerATNFactory extends ParserATNFactory {
  public STGroup codegenTemplates;


  /**
   * Provides a map of names of predefined constants which are likely to
   * appear as the argument for lexer commands. These names would be resolved
   * by the Java compiler for lexer commands that are translated to embedded
   * actions, but are required during code generation for creating
   * {@link LexerAction} instances that are usable by a lexer interpreter.
   */
  protected static final Map<String, Integer> COMMON_CONSTANTS = new HashMap<String, Integer>();
  static {
    COMMON_CONSTANTS.put("HIDDEN", Lexer.HIDDEN);
    COMMON_CONSTANTS.put("DEFAULT_TOKEN_CHANNEL", Lexer.DEFAULT_TOKEN_CHANNEL);
    COMMON_CONSTANTS.put("DEFAULT_MODE", Lexer.DEFAULT_MODE);
    COMMON_CONSTANTS.put("SKIP", Lexer.SKIP);
    COMMON_CONSTANTS.put("MORE", Lexer.MORE);
    COMMON_CONSTANTS.put("EOF", Lexer.EOF);
    COMMON_CONSTANTS.put("MAX_CHAR_VALUE", Lexer.MAX_CHAR_VALUE);
    COMMON_CONSTANTS.put("MIN_CHAR_VALUE", Lexer.MIN_CHAR_VALUE);
  }


  /**
   * Maps from an action index to a {@link LexerAction} object.
   */
  protected Map<Integer, LexerAction> indexToActionMap = new HashMap<Integer, LexerAction>();
  /**
   * Maps from a {@link LexerAction} object to the action index.
   */
  protected Map<LexerAction, Integer> actionToIndexMap = new HashMap<LexerAction, Integer>();


  public LexerATNFactory(LexerGrammar g) {
    super(g);
    // use codegen to get correct language templates for lexer commands
    String language = g.getOptionString("language");
    CodeGenerator gen = new CodeGenerator(g.tool, null, language);
    codegenTemplates = gen.getTemplates();
  }


  @Override
  public ATN createATN() {
    // BUILD ALL START STATES (ONE PER MODE)
    Set<String> modes = ((LexerGrammar) g).modes.keySet();
    for (String modeName : modes) {
      // create s0, start state; implied Tokens rule node
      TokensStartState startState =
        newState(TokensStartState.class, null);
      atn.modeNameToStartState.put(modeName, startState);
      atn.modeToStartState.add(startState);
      atn.defineDecisionState(startState);
    }


    // INIT ACTION, RULE->TOKEN_TYPE MAP
    atn.ruleToTokenType = new int[g.rules.size()];
    for (Rule r : g.rules.values()) {
      atn.ruleToTokenType[r.index] = g.getTokenType(r.name);
    }


    // CREATE ATN FOR EACH RULE
    _createATN(g.rules.values());


    atn.lexerActions = new LexerAction[indexToActionMap.size()];
    for (Map.Entry<Integer, LexerAction> entry : indexToActionMap.entrySet()) {
      atn.lexerActions[entry.getKey()] = entry.getValue();
    }


    // LINK MODE START STATE TO EACH TOKEN RULE
    for (String modeName : modes) {
      List<Rule> rules = ((LexerGrammar)g).modes.get(modeName);
      TokensStartState startState = atn.modeNameToStartState.get(modeName);
      for (Rule r : rules) {
        if ( !r.isFragment() ) {
          RuleStartState s = atn.ruleToStartState[r.index];
          epsilon(startState, s);
        }
      }
    }


    ATNOptimizer.optimize(g, atn);
    return atn;
  }


  @Override
  public Handle action(ActionAST action) {
    int ruleIndex = currentRule.index;
    int actionIndex = g.lexerActions.get(action);
    LexerCustomAction lexerAction = new LexerCustomAction(ruleIndex, actionIndex);
    return action(action, lexerAction);
  }


  protected int getLexerActionIndex(LexerAction lexerAction) {
    Integer lexerActionIndex = actionToIndexMap.get(lexerAction);
    if (lexerActionIndex == null) {
      lexerActionIndex = actionToIndexMap.size();
      actionToIndexMap.put(lexerAction, lexerActionIndex);
      indexToActionMap.put(lexerActionIndex, lexerAction);
    }


    return lexerActionIndex;
  }


  @Override
  public Handle action(String action) {
    if (action.trim().isEmpty()) {
      ATNState left = newState(null);
      ATNState right = newState(null);
      epsilon(left, right);
      return new Handle(left, right);
    }


    // define action AST for this rule as if we had found in grammar
        ActionAST ast =  new ActionAST(new CommonToken(ANTLRParser.ACTION, action));
    currentRule.defineActionInAlt(currentOuterAlt, ast);
    return action(ast);
  }


  protected Handle action(GrammarAST node, LexerAction lexerAction) {
    ATNState left = newState(node);
    ATNState right = newState(node);
    boolean isCtxDependent = false;
    int lexerActionIndex = getLexerActionIndex(lexerAction);
    ActionTransition a =
      new ActionTransition(right, currentRule.index, lexerActionIndex, isCtxDependent);
    left.addTransition(a);
    node.atnState = left;
    Handle h = new Handle(left, right);
    return h;
  }


  @Override
  public Handle lexerAltCommands(Handle alt, Handle cmds) {
    Handle h = new Handle(alt.left, cmds.right);
    epsilon(alt.right, cmds.left);
    return h;
  }


  @Override
  public Handle lexerCallCommand(GrammarAST ID, GrammarAST arg) {
    LexerAction lexerAction = createLexerAction(ID, arg);
    if (lexerAction != null) {
      return action(ID, lexerAction);
    }


    // fall back to standard action generation for the command
    ST cmdST = codegenTemplates.getInstanceOf("Lexer" +
                          CharSupport.capitalize(ID.getText())+
                          "Command");
    if (cmdST == null) {
      g.tool.errMgr.grammarError(ErrorType.INVALID_LEXER_COMMAND, g.fileName, ID.token, ID.getText());
      return epsilon(ID);
    }


    if (cmdST.impl.formalArguments == null || !cmdST.impl.formalArguments.containsKey("arg")) {
      g.tool.errMgr.grammarError(ErrorType.UNWANTED_LEXER_COMMAND_ARGUMENT, g.fileName, ID.token, ID.getText());
      return epsilon(ID);
    }


    cmdST.add("arg", arg.getText());
    return action(cmdST.render());
  }


  @Override
  public Handle lexerCommand(GrammarAST ID) {
    LexerAction lexerAction = createLexerAction(ID, null);
    if (lexerAction != null) {
      return action(ID, lexerAction);
    }


    // fall back to standard action generation for the command
    ST cmdST = codegenTemplates.getInstanceOf("Lexer" +
                          CharSupport.capitalize(ID.getText())+
                          "Command");
    if (cmdST == null) {
      g.tool.errMgr.grammarError(ErrorType.INVALID_LEXER_COMMAND, g.fileName, ID.token, ID.getText());
      return epsilon(ID);
    }


    if (cmdST.impl.formalArguments != null && cmdST.impl.formalArguments.containsKey("arg")) {
      g.tool.errMgr.grammarError(ErrorType.MISSING_LEXER_COMMAND_ARGUMENT, g.fileName, ID.token, ID.getText());
      return epsilon(ID);
    }


    return action(cmdST.render());
  }


  @Override
  public Handle range(GrammarAST a, GrammarAST b) {
    ATNState left = newState(a);
    ATNState right = newState(b);
    int t1 = CharSupport.getCharValueFromGrammarCharLiteral(a.getText());
    int t2 = CharSupport.getCharValueFromGrammarCharLiteral(b.getText());
    left.addTransition(new  RangeTransition(right, t1, t2));
    a.atnState = left;
    b.atnState = left;
    return new Handle(left, right);
  }


  @Override
  public Handle set(GrammarAST associatedAST, List<GrammarAST> alts, boolean invert) {
    ATNState left = newState(associatedAST);
    ATNState right = newState(associatedAST);
    IntervalSet set = new IntervalSet();
    for (GrammarAST t : alts) {
      if ( t.getType()==ANTLRParser.RANGE ) {
        int a = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(0).getText());
        int b = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(1).getText());
        set.add(a, b);
      }
      else if ( t.getType()==ANTLRParser.LEXER_CHAR_SET ) {
        set.addAll(getSetFromCharSetLiteral(t));
      }
      else if ( t.getType()==ANTLRParser.STRING_LITERAL ) {
        int c = CharSupport.getCharValueFromGrammarCharLiteral(t.getText());
        if ( c != -1 ) {
          set.add(c);
        }
        else {
          g.tool.errMgr.grammarError(ErrorType.INVALID_LITERAL_IN_LEXER_SET,
                         g.fileName, t.getToken(), t.getText());


        }
      }
      else if ( t.getType()==ANTLRParser.TOKEN_REF ) {
        g.tool.errMgr.grammarError(ErrorType.UNSUPPORTED_REFERENCE_IN_LEXER_SET,
                       g.fileName, t.getToken(), t.getText());
      }
    }
    if ( invert ) {
      left.addTransition(new NotSetTransition(right, set));
    }
    else {
      Transition transition;
      if (set.getIntervals().size() == 1) {
        Interval interval = set.getIntervals().get(0);
        transition = new RangeTransition(right, interval.a, interval.b);
      } else {
        transition = new SetTransition(right, set);
      }


      left.addTransition(transition);
    }
    associatedAST.atnState = left;
    return new Handle(left, right);
  }


  /** For a lexer, a string is a sequence of char to match.  That is,
   *  "fog" is treated as 'f' 'o' 'g' not as a single transition in
   *  the DFA.  Machine== o-'f'->o-'o'->o-'g'->o and has n+1 states
   *  for n characters.
   */
  @Override
  public Handle stringLiteral(TerminalAST stringLiteralAST) {
    String chars = stringLiteralAST.getText();
    chars = CharSupport.getStringFromGrammarStringLiteral(chars);
    int n = chars.length();
    ATNState left = newState(stringLiteralAST);
    ATNState prev = left;
    ATNState right = null;
    for (int i=0; i<n; i++) {
      right = newState(stringLiteralAST);
      prev.addTransition(new AtomTransition(right, chars.charAt(i)));
      prev = right;
    }
    stringLiteralAST.atnState = left;
    return new Handle(left, right);
  }


  /** [Aa\t \u1234a-z\]\-] char sets */
  @Override
  public Handle charSetLiteral(GrammarAST charSetAST) {
    ATNState left = newState(charSetAST);
    ATNState right = newState(charSetAST);
    IntervalSet set = getSetFromCharSetLiteral(charSetAST);
    left.addTransition(new SetTransition(right, set));
    charSetAST.atnState = left;
    return new Handle(left, right);
  }


  public IntervalSet getSetFromCharSetLiteral(GrammarAST charSetAST) {
    String chars = charSetAST.getText();
    chars = chars.substring(1, chars.length()-1);
    String cset = '"'+ chars +'"';
    IntervalSet set = new IntervalSet();


    // unescape all valid escape char like \n, leaving escaped dashes as '\-'
    // so we can avoid seeing them as '-' range ops.
    chars = CharSupport.getStringFromGrammarStringLiteral(cset);
    // now make x-y become set of char
    int n = chars.length();
    for (int i=0; i< n; i++) {
      int c = chars.charAt(i);
      if ( c=='\\' && (i+1)<n && chars.charAt(i+1)=='-' ) { // \-
        set.add('-');
        i++;
      }
      else if ( (i+2)<n && chars.charAt(i+1)=='-' ) { // range x-y
        int x = c;
        int y = chars.charAt(i+2);
        if ( x<=y ) set.add(x,y);
        i+=2;
      }
      else {
        set.add(c);
      }
    }
    return set;
  }


  @Override
  public Handle tokenRef(TerminalAST node) {
    // Ref to EOF in lexer yields char transition on -1
    if ( node.getText().equals("EOF") ) {
      ATNState left = newState(node);
      ATNState right = newState(node);
      left.addTransition(new AtomTransition(right, IntStream.EOF));
      return new Handle(left, right);
    }
    return _ruleRef(node);
  }


  @Nullable
  protected LexerAction createLexerAction(@NotNull GrammarAST ID, @Nullable GrammarAST arg) {
    String command = ID.getText();
    if ("skip".equals(command) && arg == null) {
      return LexerSkipAction.INSTANCE;
    }
    else if ("more".equals(command) && arg == null) {
      return LexerMoreAction.INSTANCE;
    }
    else if ("popMode".equals(command) && arg == null) {
      return LexerPopModeAction.INSTANCE;
    }
    else if ("mode".equals(command) && arg != null) {
      String modeName = arg.getText();
      Integer mode = getConstantValue(modeName, arg.getToken());
      if (mode == null) {
        return null;
      }


      return new LexerModeAction(mode);
    }
    else if ("pushMode".equals(command) && arg != null) {
      String modeName = arg.getText();
      Integer mode = getConstantValue(modeName, arg.getToken());
      if (mode == null) {
        return null;
      }


      return new LexerPushModeAction(mode);
    }
    else if ("type".equals(command) && arg != null) {
      String typeName = arg.getText();
      Integer type = getConstantValue(typeName, arg.getToken());
      if (type == null) {
        return null;
      }


      return new LexerTypeAction(type);
    }
    else if ("channel".equals(command) && arg != null) {
      String channelName = arg.getText();
      Integer channel = getConstantValue(channelName, arg.getToken());
      if (channel == null) {
        return null;
      }


      return new LexerChannelAction(channel);
    }
    else {
      return null;
    }
  }


  @Nullable
  protected Integer getConstantValue(@Nullable String name, @Nullable Token token) {
    if (name == null) {
      return null;
    }


    Integer commonConstant = COMMON_CONSTANTS.get(name);
    if (commonConstant != null) {
      return commonConstant;
    }


    int tokenType = g.getTokenType(name);
    if (tokenType != org.antlr.v4.runtime.Token.INVALID_TYPE) {
      return tokenType;
    }


    int channelValue = g.getChannelValue(name);
    if (channelValue >= org.antlr.v4.runtime.Token.MIN_USER_CHANNEL_VALUE) {
      return channelValue;
    }


    List<String> modeNames = new ArrayList<String>(((LexerGrammar)g).modes.keySet());
    int mode = modeNames.indexOf(name);
    if (mode >= 0) {
      return mode;
    }


    try {
      return Integer.parseInt(name);
    } catch (NumberFormatException ex) {
      g.tool.errMgr.grammarError(ErrorType.UNKNOWN_LEXER_CONSTANT, g.fileName, token, currentRule.name, token != null ? token.getText() : null);
      return null;
    }
  }
}
Source Code of org.antlr.v4.automata.LexerATNFactory

Related Classes of org.antlr.v4.automata.LexerATNFactory