// Source code of fri.patterns.interpreter.parsergenerator.lexer.LexerBuilder

package fri.patterns.interpreter.parsergenerator.lexer;


import java.util.*;
import java.io.IOException;
import fri.patterns.interpreter.parsergenerator.Lexer;
import fri.patterns.interpreter.parsergenerator.Token;
import fri.patterns.interpreter.parsergenerator.syntax.*;
import fri.patterns.interpreter.parsergenerator.syntax.builder.SyntaxSeparation;


/**
  Generates a Lexer from a Syntax. The Syntax can contain also parser rules.
  These will be retrievable (without the removed lexer rules) after build by
  calling <i>lexer.getParserSyntax()</i>.
  <p>
  The syntax rules may not contain '(', ')', '*', '+' or '?', but they may
  contain character set symbols like ".." (set definitions) and "-" (subtractions, "but not").
  <p>
  The syntax may contain identifiers enclosed within `backquotes`.
  These mark predefined lexer rules defined in <i>StandardLexerRules</i>.
  That class contains default rules for numbers, identifiers, string definitions,
  character definitions and many others (e.g. for XML), which can be used to build
  lexers.<br>
  <b>CAUTION:</b> Lexer and parser rules have the same namespace, you can not define
  <pre>
    identifier ::= `identifier`;  // wrong!
  </pre>
  Nevertheless you need not care about the names silently imported from
  <i>StandardLexerRules</i>: they do not occupy the parser syntax namespace,
  only the top level rules do.
  <p>
  The syntax may contain (case-sensitive) these nonterminals:
  <ul>
    <li>token</li>
    <li>ignored</li>
  </ul>
  These are lexer-reserved identifiers and can be used to mark top level lexer
  rules (tokens). When <b>token</b> is used, the builder does not try to recognize any
  rule as a lexer rule, so the token rules must be modeled carefully. Be careful: comments
  can be skipped only by using <b>ignored</b>. You can, however, define <b>ignored</b> without
  <b>token</b>; in that case the builder still tries to recognize lexer rules.<br>
  When the <b>token</b> marker is not used, the builder tries to separate lexer from
  parser rules.
  <p>
  Example:
  <pre>
    token ::= `identifier` ;  // using StandardLexerRules
    ignored ::= `spaces` ;
    ignored ::= `newline` ;
    ignored ::= comment ;
    comment ::= "//" char_minus_newline_list_opt ;
    char_minus_newline ::= chars - newline;
    char_minus_newline_list ::= char_minus_newline_list char_minus_newline;
    char_minus_newline_list ::= char_minus_newline ;
    char_minus_newline_list_opt ::= char_minus_newline_list;
    char_minus_newline_list_opt ::= ;  // nothing
  </pre>
  Mind that the builder input can not be a text file, it must be wrapped into <i>Syntax</i>.
  Use syntax builder to convert a text into a <i>Syntax</i> object.
  <p>
  Java code fragment:
  <pre>
    SyntaxSeparation separation = new SyntaxSeparation(new Syntax(myRules));
    LexerBuilder builder = new LexerBuilder(separation.getLexerSyntax(), separation.getIgnoredSymbols());
    Lexer lexer = builder.getLexer();
    // when using the lexer standalone (without Parser), you must put the token terminal symbols into it now:
    lexer.setTerminals(separation.getTokenSymbols());
  </pre>
  
  @see fri.patterns.interpreter.parsergenerator.syntax.builder.SyntaxSeparation
  @see fri.patterns.interpreter.parsergenerator.lexer.StandardLexerRules
  @author (c) 2002, Fritz Ritzberger
*/


public class LexerBuilder
{
  protected Map charConsumers;
  protected List ignoredSymbols;
  public static boolean DEBUG;  // defaults to false


  /**
    Creates a LexerBuilder (from lexer rules) that provides a Lexer.
    @param lexerSyntax lexer rule (without token and ignored, use SyntaxSeparation for that)
    @param ignoredSymbols list of ignored symbols, NOT enclosed in backquotes!
  */
  public LexerBuilder(Syntax lexerSyntax, List ignoredSymbols)
    throws LexerException, SyntaxException
  {
    this.ignoredSymbols = ignoredSymbols;
    build(lexerSyntax);
  }




  /** Returns the built Lexer. */
  public Lexer getLexer()  {
    return new LexerImpl(ignoredSymbols, charConsumers);
  }
    
  /** Returns the built Lexer, loaded with passed input (file, stream, string, ...). */
  public Lexer getLexer(Object input)
    throws IOException
  {
    Lexer lexer = getLexer();
    lexer.setInput(input);
    return lexer;
  }




  private void build(Syntax lexerSyntax)
    throws LexerException, SyntaxException
  {
    SyntaxSeparation.IntArray deleteIndexes = new SyntaxSeparation.IntArray(lexerSyntax.size());
    if (DEBUG)
      System.err.println("Processing lexer rules: \n"+lexerSyntax);


    // resolve scanner rules to Consumers and put it into a hashtable
    this.charConsumers = new Hashtable(lexerSyntax.size());
    for (int i = 0; i < lexerSyntax.size(); i++)
      translateLexerRule(lexerSyntax.getRule(i), i, deleteIndexes);
    deleteIndexes.removeIndexesFrom(lexerSyntax);
    
    // check for unresolved repeatable and nullable rules and delete them from lexer syntax
    for (int i = 0; i < lexerSyntax.size(); i++)  {
      Rule rule = lexerSyntax.getRule(i);
      String nonterm = rule.getNonterminal();
      if (checkNullableRule(nonterm, rule, i, deleteIndexes) == false)
        if (checkRepeatableRule(nonterm, rule, i, deleteIndexes) == false)
          throw new LexerException("Found no character consumer for nullable or repeatable rule "+rule);
    }
    deleteIndexes.removeIndexesFrom(lexerSyntax);
    
    if (lexerSyntax.size() > 0)  {  // not all rules have been resolved to character consumers
      throw new LexerException("Could not process rules in lexer syntax: "+lexerSyntax);
    }


    // resolve all symbolic consumer references after all consumers have been created
    Map done = new Hashtable();  // beware of recursion
    for (Iterator it = charConsumers.entrySet().iterator(); it.hasNext(); )  {
      Consumer cc = (Consumer) ((Map.Entry)it.next()).getValue();
      cc.resolveConsumerReferences(charConsumers, done);
    }
  }




  private void translateLexerRule(Rule rule, int index, SyntaxSeparation.IntArray deleteIndexes)
    throws LexerException
  {
    String nonterm = rule.getNonterminal();
    if (rule.rightSize() <= 0 || rule.getRightSymbol(0).equals(nonterm))  // nullable rules and left recursive rules will be resolved later
      return;
    
    //System.err.println("translating lexer rule: "+rule);


    // ExtendedGrammar should have resolved all parenthesis expressions and wildcards.
    // We take away rules that are:
    // - single character position definitions like
    //     nonterm ::= '0' .. '9'
    //     nonterm ::= 'a' .. 'z' - 'm' .. 'n'
    //     nonterm ::= something - "string"  // "something" must be among scanner rules
    // - single string terminal definitions like
    //     nonterm ::= "string"
    //     nonterm ::= 'c' 'd' 'e' "fgh"  // do concatenation
    // - scanner nonterminals concatenations like
    //     nonterm ::= something1 something2  // "somethingN" must be among scanner rules
    // - scanner nonterminals concatenations like
    //     nonterm ::= "string"


    int CONCATENATION = 0, SET = 1, SUBTRACTION = 2;
    int state = CONCATENATION;
    boolean intersectionHappened = false;
    Consumer consumer = new Consumer(rule);  // master consumer
    Consumer currentConsumer = new Consumer();
    Consumer setConsumer = currentConsumer;  // will host set definitions
    consumer.append(currentConsumer);  // will be resolved when trivial
    
    for (int i = 0; i < rule.rightSize(); i++)  {  // loop all symbols on right side
      String sym = rule.getRightSymbol(i);
      
      if (sym.equals(Token.BUTNOT))  {
        if (i == 0 || state != CONCATENATION)
          throw new LexerException("Missing symbol to subtract from: "+rule);
        state = SUBTRACTION;
      }
      else
      if (sym.equals(Token.UPTO))  {
        if (i == 0 || state != CONCATENATION)
          throw new LexerException("Missing lower limit of set: "+rule);
        state = SET;
      }
      else  {
        String convertedSym = convertSymbol(sym);  // remove quotes or convert number to char
        boolean isNonterm = convertedSym.equals(sym);
        if (isNonterm && state == SET)
          throw new LexerException("Can not append nonterminal to set: "+rule);


        boolean setWillHappen = rule.rightSize() > i + 1 && rule.getRightSymbol(i + 1).equals(Token.UPTO);  // next symbol will be ".."


        if (state == SET)  {
          setConsumer.appendSet(convertedSym);
          setConsumer = currentConsumer;  // reset if intersection happened
        }
        else
        if (state == SUBTRACTION)  {
          intersectionHappened = true;
          if (isNonterm)
            if (setWillHappen)
              throw new LexerException("Nonterminal can not open set after subtraction: "+rule);
            else
              currentConsumer.subtract(new Consumer.Reference(sym));
          else
            if (setWillHappen)
              currentConsumer.subtract(setConsumer = new Consumer(convertedSym));
            else
              currentConsumer.subtract(new Consumer(convertedSym));
        }
        else
        if (state == CONCATENATION)  {
          if (intersectionHappened)  {  // start new consumer
            intersectionHappened = false;
            currentConsumer = new Consumer();
            consumer.append(currentConsumer);
          }
          
          if (isNonterm)
            if (setWillHappen)
              throw new LexerException("Nonterminal can not open set in concatenation: "+rule);
            else
              currentConsumer.append(new Consumer.Reference(sym));
          else
            currentConsumer.append(convertedSym);  // a following set will be recognized by consumer
        }


        state = CONCATENATION;  // reset to normal state
        
      }  // end switch current symbol
    }  // end for right side of rule


    putCharConsumer(nonterm, consumer.optimize());
    deleteIndexes.add(index);
  }




  private void putCharConsumer(String key, Consumer consumer)  {
    //System.err.println("putting character consumer for "+key);
    Object o = charConsumers.get(key);  // test if existing
    
    if (o == null)  {  // not in list
      charConsumers.put(key, consumer);
    }
    else  {
      ConsumerAlternatives ca;


      if (o instanceof ConsumerAlternatives == false)  {
        ca = new ConsumerAlternatives((Consumer)o);
        charConsumers.put(key, ca);  // replace consumer
      }
      else  {
        ca = (ConsumerAlternatives)o;
      }
      
      ca.addAlternate(consumer);  // add a new alternative
    }
  }






  private boolean checkNullableRule(String nonterm, Rule rule, int index, SyntaxSeparation.IntArray deleteIndexes)  {
    // We take away rules that are optional nonterminals like
    //     nonterm ::= something  // "nonterm" is already among scanner rules
    //     nonterm ::= /*nothing*/  // this is the rule to remove now


    if (rule.rightSize() <= 0)  {
      Object o = charConsumers.get(nonterm);
      ((Consumer)o).setNullable();
      deleteIndexes.add(index);
      return true;  // do not explore empty rule, return "found nullable"
    }
    return false;
  }




  private boolean checkRepeatableRule(String nonterm, Rule rule, int index, SyntaxSeparation.IntArray deleteIndexes)  {
    // We take away rules that are left recursive like
    //     nonterm ::= nonterm something    // this is the rule to remove now
    //     nonterm ::= something  // "nonterm" must be already among scanner rules


    // check for nonterm in hashtable, set it repeatable if found
    if (rule.rightSize() >= 2 && rule.getRightSymbol(0).equals(nonterm))  {  // left recursive
      Consumer cc = (Consumer) charConsumers.get(nonterm);
      if (cc.matchesRepeatableRule(rule))  {  // check if rest on the right is same
        cc.setRepeatable();
        deleteIndexes.add(index);
        return true;
      }
    }
    return false;
  }


  
  /**
    Converts a character or string definition to its processable form.
    This implementation must be according to "bnf_chardef" in <i>StandardLexerRules</i>.
    <ul>
      <li>0xFFFF   hexadecimal, convert to character</li>
      <li>0777     octal, convert to character</li>
      <li>12345    decimal, convert to character</li>
      <li>'c'      character, remove quotes</li>
      <li>'\n', "\n"      escaped character, decode</li>
      <li>"string"      string, remove quotes</li>
      <li>nonterminal - will stay unchanged</li>
    </ul>
  */
  private String convertSymbol(String sym)  {
    if (sym.charAt(0) == '\'' || sym.charAt(0) == '"')  {
      String s = sym.substring(1, sym.length() - 1);
      if (s.length() <= 0)
        throw new IllegalArgumentException("Empty character or string definition: "+sym);


      StringBuffer sb = new StringBuffer(s.length());  // convert escape sequences to their real meaning
      for (int i = 0; i < s.length(); i++)  {
        char c = s.charAt(i);
        if (c == '\\')  {
          char c1 = s.length() > i + 1 ? s.charAt(i + 1) : 0;
          switch (c1)  {
            case 'n': sb.append('\n'); i++; break;
            case 'r': sb.append('\r'); i++; break;
            case 't': sb.append('\t'); i++; break;
            case 'f': sb.append('\f'); i++; break;
            case 'b': sb.append('\b'); i++; break;
            case '\'': sb.append('\''); i++; break;
            case '"': sb.append('"'); i++; break;
            case '\\': sb.append('\\'); i++; break;
            default: sb.append(c); break;
          }
        }
        else  {
          sb.append(c);
        }
      }
      return sb.toString();
    }
    else  {  // must be starting with digit or be a nonterminal
      char c;
      if (sym.startsWith("0x") || sym.startsWith("0X"))  // hexadecimal number
        c = (char) Integer.valueOf(sym.substring(2), 16).intValue();
      else
      if (sym.startsWith("0"))  // octal number
        c = (char) Integer.valueOf(sym.substring(1), 8).intValue();
      else
      if (Character.isDigit(sym.charAt(0)))
        c = (char) Integer.valueOf(sym).intValue();  // will throw NumberFormatException when not number
      else
        return sym;  // is a nonterminal
      
      return new String(new char [] { c });
    }
  }
  
}
// Source code of fri.patterns.interpreter.parsergenerator.lexer.LexerBuilder

// Related classes of fri.patterns.interpreter.parsergenerator.lexer.LexerBuilder