Source Code of net.percederberg.grammatica.parser.re.RegExp

/*
 * RegExp.java
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 3
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307, USA.
 *
 * Copyright (c) 2003-2009 Per Cederberg. All rights reserved.
 */


package net.percederberg.grammatica.parser.re;


import java.io.PrintWriter;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;


import net.percederberg.grammatica.parser.ReaderBuffer;


/**
 * A regular expression. This class creates and holds an internal
 * data structure representing a regular expression. It also allows
 * creating matchers. This class is thread-safe. Multiple matchers may
 * operate simultanously on the same regular expression.
 *
 * @author   Per Cederberg, <per at percederberg dot net>
 * @version  1.5
 */
public class RegExp {


    /**
     * The base regular expression element.
     */
    private Element element;


    /**
     * The regular expression pattern.
     */
    private String pattern;


    /**
     * The character case ignore flag.
     */
    private boolean ignoreCase;


    /**
     * The current position in the pattern. This variable is used by
     * the parsing methods.
     */
    private int pos;


    /**
     * Creates a new case-sensitive regular expression.
     *
     * @param pattern        the regular expression pattern
     *
     * @throws RegExpException if the regular expression couldn't be
     *             parsed correctly
     */
    public RegExp(String pattern) throws RegExpException {
        this(pattern, false);
    }


    /**
     * Creates a new regular expression. The regular expression can be
     * either case-sensitive or case-insensitive.
     *
     * @param pattern        the regular expression pattern
     * @param ignoreCase     the character case ignore flag
     *
     * @throws RegExpException if the regular expression couldn't be
     *             parsed correctly
     *
     * @since 1.5
     */
    public RegExp(String pattern, boolean ignoreCase)
        throws RegExpException {


        this.pattern = pattern;
        this.ignoreCase = ignoreCase;
        this.pos = 0;
        this.element = parseExpr();
        if (pos < pattern.length()) {
            throw new RegExpException(
                RegExpException.UNEXPECTED_CHARACTER,
                pos,
                pattern);
        }
    }


    /**
     * Creates a new matcher for the specified string.
     *
     * @param str            the string to work with
     *
     * @return the regular expresion matcher
     *
     * @deprecated The CharBuffer class has been deprecated in favor
     * of ReaderBuffer as of version 1.5. Create a ReaderBuffer
     * and use the matcher(ReaderBuffer) method instead of this one.
     */
    public Matcher matcher(CharBuffer str) {
        return matcher(str.toString());
    }


    /**
     * Creates a new matcher for the specified string.
     *
     * @param str            the string to work with
     *
     * @return the regular expression matcher
     */
    public Matcher matcher(String str) {
        return matcher(new ReaderBuffer(new StringReader(str)));
    }


    /**
     * Creates a new matcher for the specified look-ahead character
     * input stream.
     *
     * @param buffer         the character input buffer
     *
     * @return the regular expression matcher
     *
     * @since 1.5
     */
    public Matcher matcher(ReaderBuffer buffer) {
        return new Matcher((Element) element.clone(), buffer, ignoreCase);
    }


    /**
     * Returns a string representation of the regular expression.
     *
     * @return a string representation of the regular expression
     */
    public String toString() {
        StringWriter  str;


        str = new StringWriter();
        str.write("Regular Expression\n");
        str.write("  Pattern: " + pattern + "\n");
        str.write("  Flags:");
        if (ignoreCase) {
            str.write(" caseignore");
        }
        str.write("\n");
        str.write("  Compiled:\n");
        element.printTo(new PrintWriter(str), "    ");
        return str.toString();
    }


    /**
     * Parses a regular expression. This method handles the Expr
     * production in the grammar (see regexp.grammar).
     *
     * @return the element representing this expression
     *
     * @throws RegExpException if an error was encountered in the
     *             pattern string
     */
    private Element parseExpr() throws RegExpException {
        Element  first;
        Element  second;


        first = parseTerm();
        if (peekChar(0) != '|') {
            return first;
        } else {
            readChar('|');
            second = parseExpr();
            return new AlternativeElement(first, second);
        }
    }


    /**
     * Parses a regular expression term. This method handles the
     * Term production in the grammar (see regexp.grammar).
     *
     * @return the element representing this term
     *
     * @throws RegExpException if an error was encountered in the
     *             pattern string
     */
    private Element parseTerm() throws RegExpException {
        ArrayList  list = new ArrayList();


        list.add(parseFact());
        while (true) {
            switch (peekChar(0)) {
            case -1:
            case ')':
            case ']':
            case '{':
            case '}':
            case '?':
            case '+':
            case '|':
                return combineElements(list);
            default:
                list.add(parseFact());
            }
        }
    }


    /**
     * Parses a regular expression factor. This method handles the
     * Fact production in the grammar (see regexp.grammar).
     *
     * @return the element representing this factor
     *
     * @throws RegExpException if an error was encountered in the
     *             pattern string
     */
    private Element parseFact() throws RegExpException {
        Element  elem;


        elem = parseAtom();
        switch (peekChar(0)) {
        case '?':
        case '*':
        case '+':
        case '{':
            return parseAtomModifier(elem);
        default:
            return elem;
        }
    }


    /**
     * Parses a regular expression atom. This method handles the
     * Atom production in the grammar (see regexp.grammar).
     *
     * @return the element representing this atom
     *
     * @throws RegExpException if an error was encountered in the
     *             pattern string
     */
    private Element parseAtom() throws RegExpException {
        Element  elem;


        switch (peekChar(0)) {
        case '.':
            readChar('.');
            return CharacterSetElement.DOT;
        case '(':
            readChar('(');
            elem = parseExpr();
            readChar(')');
            return elem;
        case '[':
            readChar('[');
            elem = parseCharSet();
            readChar(']');
            return elem;
        case -1:
        case ')':
        case ']':
        case '{':
        case '}':
        case '?':
        case '*':
        case '+':
        case '|':
            throw new RegExpException(
                RegExpException.UNEXPECTED_CHARACTER,
                pos,
                pattern);
        default:
            return parseChar();
        }
    }


    /**
     * Parses a regular expression atom modifier. This method handles
     * the AtomModifier production in the grammar (see regexp.grammar).
     *
     * @param elem           the element to modify
     *
     * @return the modified element
     *
     * @throws RegExpException if an error was encountered in the
     *             pattern string
     */
    private Element parseAtomModifier(Element elem) throws RegExpException {
        int  min = 0;
        int  max = -1;
        int  type = RepeatElement.GREEDY;
        int  firstPos;


        // Read min and max
        switch (readChar()) {
        case '?':
            min = 0;
            max = 1;
            break;
        case '*':
            min = 0;
            max = -1;
            break;
        case '+':
            min = 1;
            max = -1;
            break;
        case '{':
            firstPos = pos - 1;
            min = readNumber();
            max = min;
            if (peekChar(0) == ',') {
                readChar(',');
                max = -1;
                if (peekChar(0) != '}') {
                    max = readNumber();
                }
            }
            readChar('}');
            if (max == 0 || (max > 0 && min > max)) {
                throw new RegExpException(
                    RegExpException.INVALID_REPEAT_COUNT,
                    firstPos,
                    pattern);
            }
            break;
        default:
            throw new RegExpException(
                RegExpException.UNEXPECTED_CHARACTER,
                pos - 1,
                pattern);
        }


        // Read operator mode
        if (peekChar(0) == '?') {
            readChar('?');
            type = RepeatElement.RELUCTANT;
        } else if (peekChar(0) == '+') {
            readChar('+');
            type = RepeatElement.POSSESSIVE;
        }


        return new RepeatElement(elem, min, max, type);
    }


    /**
     * Parses a regular expression character set. This method handles
     * the contents of the '[...]' construct in a regular expression.
     *
     * @return the element representing this character set
     *
     * @throws RegExpException if an error was encountered in the
     *             pattern string
     */
    private Element parseCharSet() throws RegExpException {
        CharacterSetElement  charset;
        Element              elem;
        boolean              repeat = true;
        char                 start;
        char                 end;


        if (peekChar(0) == '^') {
            readChar('^');
            charset = new CharacterSetElement(true);
        } else {
            charset = new CharacterSetElement(false);
        }


        while (peekChar(0) > 0 && repeat) {
            start = (char) peekChar(0);
            switch (start) {
            case ']':
                repeat = false;
                break;
            case '\\':
                elem = parseEscapeChar();
                if (elem instanceof StringElement) {
                    charset.addCharacters((StringElement) elem);
                } else {
                    charset.addCharacterSet((CharacterSetElement) elem);
                }
                break;
            default:
                readChar(start);
                if (peekChar(0) == '-'
                 && peekChar(1) > 0
                 && peekChar(1) != ']') {


                    readChar('-');
                    end = readChar();
                    charset.addRange(fixChar(start), fixChar(end));
                } else {
                    charset.addCharacter(fixChar(start));
                }
            }
        }


        return charset;
    }


    /**
     * Parses a regular expression character. This method handles
     * a single normal character in a regular expression.
     *
     * @return the element representing this character
     *
     * @throws RegExpException if an error was encountered in the
     *             pattern string
     */
    private Element parseChar() throws RegExpException {
        switch (peekChar(0)) {
        case '\\':
            return parseEscapeChar();
        case '^':
        case '$':
            throw new RegExpException(
                RegExpException.UNSUPPORTED_SPECIAL_CHARACTER,
                pos,
                pattern);
        default:
            return new StringElement(fixChar(readChar()));
        }
    }


    /**
     * Parses a regular expression character escape. This method
     * handles a single character escape in a regular expression.
     *
     * @return the element representing this character escape
     *
     * @throws RegExpException if an error was encountered in the
     *             pattern string
     */
    private Element parseEscapeChar() throws RegExpException {
        char    c;
        String  str;


        readChar('\\');
        c = readChar();
        switch (c) {
        case '0':
            c = readChar();
            if (c < '0' || c > '3') {
                throw new RegExpException(
                    RegExpException.UNSUPPORTED_ESCAPE_CHARACTER,
                    pos - 3,
                    pattern);
            }
            str = String.valueOf(c);
            c = (char) peekChar(0);
            if ('0' <= c && c <= '7') {
                str += String.valueOf(readChar());
                c = (char) peekChar(0);
                if ('0' <= c && c <= '7') {
                    str += String.valueOf(readChar());
                }
            }
            try {
                c = (char) Integer.parseInt(str, 8);
                return new StringElement(fixChar(c));
            } catch (NumberFormatException e) {
                throw new RegExpException(
                    RegExpException.UNSUPPORTED_ESCAPE_CHARACTER,
                    pos - str.length() - 2,
                    pattern);
            }
        case 'x':
            str = String.valueOf(readChar()) +
                  String.valueOf(readChar());
            try {
                c = (char) Integer.parseInt(str, 16);
                return new StringElement(fixChar(c));
            } catch (NumberFormatException e) {
                throw new RegExpException(
                    RegExpException.UNSUPPORTED_ESCAPE_CHARACTER,
                    pos - str.length() - 2,
                    pattern);
            }
        case 'u':
            str = String.valueOf(readChar()) +
                  String.valueOf(readChar()) +
                  String.valueOf(readChar()) +
                  String.valueOf(readChar());
            try {
                c = (char) Integer.parseInt(str, 16);
                return new StringElement(fixChar(c));
            } catch (NumberFormatException e) {
                throw new RegExpException(
                    RegExpException.UNSUPPORTED_ESCAPE_CHARACTER,
                    pos - str.length() - 2,
                    pattern);
            }
        case 't':
            return new StringElement('\t');
        case 'n':
            return new StringElement('\n');
        case 'r':
            return new StringElement('\r');
        case 'f':
            return new StringElement('\f');
        case 'a':
            return new StringElement('\u0007');
        case 'e':
            return new StringElement('\u001B');
        case 'd':
            return CharacterSetElement.DIGIT;
        case 'D':
            return CharacterSetElement.NON_DIGIT;
        case 's':
            return CharacterSetElement.WHITESPACE;
        case 'S':
            return CharacterSetElement.NON_WHITESPACE;
        case 'w':
            return CharacterSetElement.WORD;
        case 'W':
            return CharacterSetElement.NON_WORD;
        default:
            if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z')) {
                throw new RegExpException(
                    RegExpException.UNSUPPORTED_ESCAPE_CHARACTER,
                    pos - 2,
                    pattern);
            }
            return new StringElement(fixChar(c));
        }
    }


    /**
     * Adjusts a character for inclusion in a string or character set
     * element. For case-insensitive regular expressions, this
     * transforms the character to lower-case.
     *
     * @param c               the input character
     *
     * @return the adjusted character
     */
    private char fixChar(char c) {
        return ignoreCase ? Character.toLowerCase(c) : c;
    }


    /**
     * Reads a number from the pattern. If the next character isn't a
     * numeric character, an exception is thrown. This method reads
     * several consecutive numeric characters.
     *
     * @return the numeric value read
     *
     * @throws RegExpException if an error was encountered in the
     *             pattern string
     */
    private int readNumber() throws RegExpException {
        StringBuffer  buf = new StringBuffer();
        int           c;


        c = peekChar(0);
        while ('0' <= c && c <= '9') {
            buf.append(readChar());
            c = peekChar(0);
        }
        if (buf.length() <= 0) {
            throw new RegExpException(
                RegExpException.UNEXPECTED_CHARACTER,
                pos,
                pattern);
        }
        return Integer.parseInt(buf.toString());
    }


    /**
     * Reads the next character in the pattern. If no next character
     * exists, an exception is thrown.
     *
     * @return the character read
     *
     * @throws RegExpException if no next character was available in
     *             the pattern string
     */
    private char readChar() throws RegExpException {
        int  c = peekChar(0);


        if (c < 0) {
            throw new RegExpException(
                RegExpException.UNTERMINATED_PATTERN,
                pos,
                pattern);
        } else {
            pos++;
            return (char) c;
        }
    }


    /**
     * Reads the next character in the pattern. If the character
     * wasn't the specified one, an exception is thrown.
     *
     * @param c              the character to read
     *
     * @return the character read
     *
     * @throws RegExpException if the character read didn't match the
     *             specified one, or if no next character was
     *             available in the pattern string
     */
    private char readChar(char c) throws RegExpException {
        if (c != readChar()) {
            throw new RegExpException(
                RegExpException.UNEXPECTED_CHARACTER,
                pos - 1,
                pattern);
        }
        return c;
    }


    /**
     * Returns a character that has not yet been read from the
     * pattern. If the requested position is beyond the end of the
     * pattern string, -1 is returned.
     *
     * @param count          the preview position, from zero (0)
     *
     * @return the character found, or
     *         -1 if beyond the end of the pattern string
     */
    private int peekChar(int count) {
        if (pos + count < pattern.length()) {
            return pattern.charAt(pos + count);
        } else {
            return -1;
        }
    }


    /**
     * Combines a list of elements. This method takes care to always
     * concatenate adjacent string elements into a single string
     * element.
     *
     * @param list           the list with elements
     *
     * @return the combined element
     */
    private Element combineElements(ArrayList list) {
        Element  prev;
        Element  elem;
        String   str;
        int      i;


        // Concatenate string elements
        prev = (Element) list.get(0);
        for (i = 1; i < list.size(); i++) {
            elem = (Element) list.get(i);
            if (prev instanceof StringElement
             && elem instanceof StringElement) {


                str = ((StringElement) prev).getString() +
                      ((StringElement) elem).getString();
                elem = new StringElement(str);
                list.remove(i);
                list.set(i - 1, elem);
                i--;
            }
            prev = elem;
        }


        // Combine all remaining elements
        elem = (Element) list.get(list.size() - 1);
        for (i = list.size() - 2; i >= 0; i--) {
            prev = (Element) list.get(i);
            elem = new CombineElement(prev, elem);
        }
        return elem;
    }
}
Source Code of net.percederberg.grammatica.parser.re.RegExp

Related Classes of net.percederberg.grammatica.parser.re.RegExp