package lexer.abstractLexer;
import java.util.ArrayList;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lexer.Type;
import lexer.errors.EmptyInputException;
import lexer.errors.LexerException;
import lexer.errors.UnbalancedDescenderException;
import lexer.errors.UnrecognizedCharacterException;
import lipstone.joshua.customStructures.lists.PairedList;
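/**
* Base class for lexers that tokenize a <tt>String</tt> using named rules (regex-based), descenders (paired
* open/close symbols), and ignore patterns.
* <p>
* A minimal usage sketch; the subclass, rule, descender, action, and token names below are hypothetical and only
* illustrate the intended call sequence:
*
* <pre>
* {@code
* MyLexer lexer = new MyLexer(); //a hypothetical concrete subclass of AbstractLexer
* lexer.addRule("integer", new MyRule(Pattern.compile("\\G[0-9]+"), new MyIntegerAction()));
* lexer.addRule("operator", new MyRule(Pattern.compile("\\G[-+*]"), new MyOperatorAction()));
* lexer.addDescender("parentheses", new MyDescender("(", ")", new MyGroupAction()));
* lexer.ignore("//[^\n]*"); //skip line comments
* MyToken tokens = lexer.lex("(1 + 2) * 3");
* }
* </pre>
*/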
public abstract class AbstractLexer<T extends AbstractToken<? extends Type<?>, T>, U extends Type<?>, V extends AbstractRule<T, ? extends Type<?>, ? extends AbstractAction<T, ? extends Type<?>, ?, X>, ?, X>, W extends AbstractDescender<T, ? extends Type<T>, ? extends AbstractAction<T, ? extends Type<?>, ?, X>, X>, X extends AbstractLexer<T, ? extends Type<?>, V, W, X>> {
protected final PairedList<String, V> rules; //the rules used to identify tokens, keyed by name
protected final PairedList<String, W> descenders; //the descenders used to identify delimited (possibly nested) regions, keyed by name
protected final ArrayList<U> types; //the token types that this lexer can produce
protected final ArrayList<Pattern> ignores; //patterns that this lexer skips over before reading a token
protected final Stack<DescentSet<T>> descentStack; //saved lexer state, so that lex can be called recursively
protected boolean ignoreSpace; //true while no added rule or descender pattern can start with a space
protected String input; //the String currently being tokenized
protected int head; //the index of the read-head within input
protected T current, output; //current is the last token generated so far; output is the first token in the chain being built
private T previous; //the most recently generated token
/**
* Basic constructor for an <tt>AbstractLexer</tt>
*/
public AbstractLexer() {
rules = new PairedList<>();
descenders = new PairedList<>();
types = new ArrayList<>();
ignores = new ArrayList<>();
descentStack = new Stack<>();
ignoreSpace = true;
input = "";
head = 0;
current = makeNewToken();
output = current;
previous = current;
}
/**
* Tokenizes a <tt>String</tt>
*
* @param input
* the <tt>String</tt> to tokenize
* @return the <tt>Token</tt>s in the <tt>String</tt>
* @throws LexerException
* if an error occurs while tokenizing the input
*/
public T lex(String input) throws LexerException {
return lex(input, 0);
}
/**
* Tokenizes a <tt>String</tt>
*
* @param input
* the <tt>String</tt> to tokenize
* @param head
* the location at which to start lexing the input
* @return the <tt>Token</tt>s in the <tt>String</tt>
* @throws LexerException
* if an error occurs while tokenizing the input
*/
public T lex(String input, int head) throws LexerException {
descentStack.push(new DescentSet<T>(this.input, this.head, output, previous));
this.input = input;
current = makeNewToken();
output = previous = current;
this.head = head;
try {
while (this.head < input.length())
if (hasNext()) {
previous = (T) current.append(getNextToken(true));
current = previous;
}
else
break;
}
catch (LexerException e) {
descentStack.clear();
throw e;
}
T result = output;
this.input = descentStack.peek().getInput();
this.head = descentStack.peek().getHead();
previous = descentStack.peek().getPrevious();
output = descentStack.pop().getOutput();
current = (T) output.getLastToken();
return result;
}
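/*
* Note: lex(String, int) saves and restores this lexer's state via descentStack, so a rule or descender action can
* safely lex a sub-region of the input recursively. A hypothetical sketch of such an action (the MyToken and MyLexer
* names and the apply signature are assumptions based on the descend/hit hooks below, not part of this class):
*
* public MyToken apply(Matcher m, MyLexer lexer) throws LexerException {
* return lexer.lex(m.group()); //tokenize the descended region with the same rules
* }
*/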
/**
* Gets the next token in the input without stepping this <tt>AbstractLexer</tt> forward.
*
* @return the next token in this <tt>AbstractLexer</tt>'s input
* @throws LexerException
* if no token was found
*/
public final T getNextToken() throws LexerException {
return getNextToken(false);
}
/**
* Finds the next token in this <tt>AbstractLexer</tt>
*
* @param step
* if true, this <tt>AbstractLexer</tt>'s read-head is stepped forward past the returned token
* @return the next token in this <tt>AbstractLexer</tt>'s input
* @throws LexerException
* if no token was found
*/
public T getNextToken(boolean step) throws LexerException {
skipIgnores();
if (head >= input.length())
throw new EmptyInputException();
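//Find the descender whose open symbol is longest among those that start at the read-head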
W d = null;
for (W descender : descenders.getValues())
if (input.length() - head >= descender.open.length() && input.startsWith(descender.open, head) && (d == null || descender.open.length() > d.open.length()))
d = descender;
if (d != null) {
int close = getEndIndex(input, head, d.open, d.close);
Matcher m = Pattern.compile("\\Q" + input.substring(head + d.open.length(), close) + "\\E").matcher(input);
m.find(head);
int oldHead = head;
head = close + d.close.length();
T result = descend(d, m);
if (!step)
head = oldHead;
else
previous = result;
return result;
}
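//No descender matched; find the rule whose pattern yields the longest match when searching from the read-head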
if (rules.size() > 0) {
V hit = null;
Matcher match = null, m;
for (V rule : rules.getValues()) {
m = rule.getPattern().matcher(input);
if (m.find(head) && m.group().length() != 0 && (match == null || match.group().length() < m.group().length())) {
match = m;
hit = rule;
}
}
if (hit != null) {
head += match.group().length();
T result = hit(hit, match);
if (!step)
head -= match.group().length();
else
previous = result;
return result;
}
}
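//Nothing matched; skip a single space and retry, otherwise the character is unrecognized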
if (input.charAt(head) == ' ') {
head++;
return getNextToken(step);
}
throw new UnrecognizedCharacterException(input, head);
}
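/**
* Advances the read-head past spaces (while ignoreSpace is true) and past the longest match of any ignore pattern,
* repeating until nothing more can be skipped.
*/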
private final void skipIgnores() {
while (true) {
if (ignoreSpace) //ignoreSpace is true only while none of the added rule or descender patterns can start with a space
while (head < input.length() && input.charAt(head) == ' ')
head++;
Matcher m = null, check;
//Get the longest match from the read-head in the ignore patterns
for (Pattern p : ignores)
if ((check = p.matcher(input)).find(head) && (m == null || check.end() > m.end()))
m = check;
//If nothing matched starting at the read-head, break
if (m == null)
break;
//Otherwise, skip it
head = m.end();
}
}
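/**
* Determines whether the given regex can match a space as its first character by checking for a leading space, an
* escaped space, a character class containing a space, or an alternation group with an option that starts with a
* space.
*/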
private final boolean startsWithSpace(String regex) {
if (regex.startsWith("\\G"))
regex = regex.substring(2);
if (regex.charAt(0) == ' ' || (regex.charAt(0) == '\\' && regex.length() > 1 && regex.charAt(1) == ' '))
return true;
if (regex.charAt(0) == '[') {
try {
return regex.substring(0, getEndIndex(regex, 0, "[", "]")).contains(" ");
}
catch (UnbalancedDescenderException e) {/*Cannot occur because the pattern is valid*/}
}
if (regex.charAt(0) == '(') {
try {
for (String option : regex.substring(0, getEndIndex(regex, 0, "(", ")")).split("(?<!\\\\)\\|"))
if (startsWithSpace(option))
return true;
}
catch (UnbalancedDescenderException e) {/*Cannot occur because the pattern is valid*/}
return false;
}
return false;
}
/**
* @return the most recently generated token
*/
public final T getPreviousToken() {
return previous;
}
/**
* Removes the last generated token and returns it.
*
* @return the removed token
*/
public final T popPreviousToken() {
T temp = previous;
previous = current == previous ? (current = previous.getPreviousToken()) : previous.getPreviousToken();
temp.remove();
return temp;
}
/**
* @return true if there is still untokenized input, otherwise false
*/
public final boolean hasNext() {
skipIgnores();
return head < input.length();
}
/**
* Adds a new rule
*
* @param name
* the name of the rule
* @param rule
* the rule
*/
public final void addRule(String name, V rule) {
if (ignoreSpace)
ignoreSpace = !startsWithSpace(rule.getPattern().pattern());
rules.add(name, rule);
}
/**
* Adds a new descender
*
* @param name
* the name of the descender
* @param descender
* the descender
*/
public final void addDescender(String name, W descender) {
if (ignoreSpace)
ignoreSpace = !(descender.open.charAt(0) == ' ' || descender.close.charAt(0) == ' ');
descenders.add(name, descender);
}
/**
* Tells the lexer to skip over the <tt>Pattern</tt> in the given regex <tt>String</tt>.
*
* @param ignore
* the <tt>Pattern</tt> to ignore as a regex <tt>String</tt>
*/
public final void ignore(String ignore) {
ignores.add(Pattern.compile(ignore.startsWith("\\G") ? ignore : "\\G" + ignore));
}
/**
* Tells the lexer to skip over the <tt>Pattern</tt> in the given regex <tt>String</tt>.
*
* @param ignore
* the <tt>Pattern</tt> to ignore as a regex <tt>String</tt>
* @param flags
* the regex flags defined in {@link java.util.regex.Pattern Pattern}
*/
public final void ignore(String ignore, int flags) {
ignores.add(Pattern.compile((ignore.startsWith("\\G") ? ignore : "\\G" + ignore), flags));
}
/**
* @return the types that this lexer can find.
*/
public ArrayList<U> getTypes() {
return types;
}
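/**
* Finds the index in <tt>input</tt> of the <tt>endSymbol</tt> that balances the <tt>startSymbol</tt> at
* <tt>start</tt>, accounting for nesting and backslash-escaped symbols.
*
* @param input
* the <tt>String</tt> to scan
* @param start
* the index of the opening symbol
* @param startSymbol
* the opening symbol
* @param endSymbol
* the closing symbol
* @return the index of the balancing <tt>endSymbol</tt>
* @throws UnbalancedDescenderException
* if the <tt>startSymbol</tt> and <tt>endSymbol</tt> are not balanced
*/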
protected int getEndIndex(String input, int start, String startSymbol, String endSymbol) throws UnbalancedDescenderException {
int index = 0, parenthesis = 0;
for (int i = start; i < input.length() - startSymbol.length() + 1 && i < input.length() - endSymbol.length() + 1; i++) {
if (input.substring(i, i + startSymbol.length()).equals(startSymbol))
parenthesis++;
if (input.substring(i, i + endSymbol.length()).equals(endSymbol))
parenthesis--;
if (parenthesis == 0) {
index = i;
break;
}
if (input.charAt(i) == '\\') {
i++;
continue;
}
}
if (parenthesis != 0)
throw new UnbalancedDescenderException(input, start);
return index;
}
/**
* Due to how type erasure works, this method must be implemented in subclasses with the following code:<br>
* <code>return new {@literal <}class extending <tt>Token</tt>{@literal >}();</code>
*/
public abstract T makeNewToken();
/**
* Due to how type erasure works, this method must be implemented in subclasses with the following code:<br>
* <code>return d.apply(m, this);</code>
*/
protected abstract T descend(W d, Matcher m) throws LexerException;
/**
* Due to how type erasure works, this method must be implemented in subclasses with the following code:<br>
* <code>return r.apply(m, this);</code>
*/
protected abstract T hit(V r, Matcher m) throws LexerException;
}