package client.net.sf.saxon.ce.regex;
import client.net.sf.saxon.ce.expr.z.*;
import client.net.sf.saxon.ce.tree.util.FastStringBuffer;
import client.net.sf.saxon.ce.value.Whitespace;
import com.google.gwt.logging.client.LogConfiguration;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
* A regular expression compiler class. This class compiles a pattern string into a
* regular expression program interpretable by the RE evaluator class. The 'recompile'
* command line tool uses this compiler to pre-compile regular expressions for use
* with RE. For a description of the syntax accepted by RECompiler and what you can
* do with regular expressions, see the documentation for the RE matcher class.
*
* @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
* @author <a href="mailto:gholam@xtra.co.nz">Michael McCallum</a>
* @version $Id: RECompiler.java 518156 2007-03-14 14:31:26Z vgritsenko $
* @see REMatcher
*/
/*
* Changes made for Saxon:
*
* - handle full Unicode repertoire (esp non-BMP characters) using UnicodeString class for
* both the source string and the regular expression
* - added support for subtraction in a character class
* - in a character range, changed the condition start < end to start <= end
* - removed support for [:POSIX:] construct
* - added support for \p{} and \P{} classes
* - removed support for unsupported escapes: f, x, u, b, octal characters; added i and c
* - changed the handling of hyphens within square brackets, and ^ appearing other than at the start
* - changed the data structure used for the executable so that terms that match a character class
* now reference an IntPredicate that tests for membership of the character in a set
* - added support for reluctant {n,m}? quantifiers
* - allow a quantifier on a nullable expression [syntax permitted; semantics need more work]
* - allow a quantifier on '$' or '^'
* - some constructs (back-references, non-capturing groups, etc) are conditional on which XPath/XSD version
* is in use
* - regular expression flags are now fixed at the time the RE is compiled, this can no longer be deferred
* until the RE is evaluated
* - split() function includes a zero-length string at the end of the returned sequence if the last
* separator is at the end of the string
* - added support for the 'q' and 'x' flags; improved support for the 'i' flag
* - added a method to determine whether there is an anchored match (for XSD use)
* - tests for newline (e.g in multiline mode) now match \n only, as required by the XPath specification
* - reorganised the executable program to use Operation objects rather than integer opcodes
* - introduced optimization for non-backtracking + and * operators (with simple operands)
*/
public class RECompiler {
// The compiled program: a list of operations, linked to each other by relative 'next' offsets
ArrayList<Operation> instructions = new ArrayList<Operation>(20);
// Input state for compiling regular expression
UnicodeString pattern; // Input string
int len; // Length of the pattern string
int idx; // Current input index into pattern
int parens; // Number of capturing groups opened so far (incremented when a capturing '(' is compiled)
// Node flags
static final int NODE_NORMAL = 0; // No flags (nothing special)
static final int NODE_NULLABLE = 1; // True if node is potentially null
static final int NODE_TOPLEVEL = 2; // True if top level expr
// {m,n} stacks
static final int bracketUnbounded = -1; // Unbounded value
int bracketMin; // Minimum number of matches
int bracketOpt; // Additional optional matches
// Language-level switches; both are overwritten from the REFlags object in setFlags()
boolean isXPath = true;
boolean isXPath30 = true;
// Numbers of the capturing groups that have been closed; consulted when validating back-references
IntHashSet captures = new IntHashSet();
// The flags in force for this compilation (set by setFlags())
REFlags reFlags;
// Warnings collected during compilation; stays null until the first warning is issued
List<String> warnings;
/**
* Constructor. Creates (initially empty) storage for a regular expression program.
*/
public RECompiler() {
// Nothing to do here: every field either has an initializer or keeps its default value
}
/**
* Set the regular expression flags to be used
* @param flags the regular expression flags
*/
public void setFlags(REFlags flags) {
    // Remember the flags object itself; later parsing queries it for case-independence etc.
    this.reFlags = flags;
    // Cache the two language-level switches that gate XPath-only syntax
    this.isXPath = flags.isAllowsXPath20Extensions();
    this.isXPath30 = flags.isAllowsXPath30Extensions();
}
// Inserts an operation into the program at the given index. Operations currently at or after
// that index move up by one position (ArrayList.add(int, E) semantics); no 'next' offsets are
// adjusted here.
private void insertNode(Operation node, int insertAt) {
instructions.add(insertAt, node);
}
// Records a warning message. The list is created lazily, on the first warning.
private void warning(String s) {
    List<String> sink = warnings;
    if (sink == null) {
        sink = new ArrayList<String>(4);
        warnings = sink;
    }
    sink.add(s);
}
/**
* On completion of compilation, get any warnings that were generated
* @return the list of warning messages
*/
public List<String> getWarnings() {
    // No warning was ever recorded: hand back the shared immutable empty list
    return warnings == null ? Collections.<String>emptyList() : warnings;
}
/**
* Appends a node to the end of a node chain
*
* @param node Start of node chain to traverse
* @param pointTo Node to have the tail of the chain point to
*/
void setNextOfEnd(int node, int pointTo) {
    // Walks the chain of relative 'next' offsets starting at 'node' and makes the last
    // operation of that chain point at 'pointTo'. If the walk leaves the program (index
    // >= instructions.size()) nothing is linked.
    //
    // Fix: the bounds check used to sit in the loop condition, i.e. AFTER the read of
    // instructions.get(node) for the new position, so stepping one past the end raised
    // IndexOutOfBoundsException instead of reaching the "don't set pointTo" case. The
    // check now happens before the read.
    final int programSize = instructions.size();
    int next = instructions.get(node).next;
    // Traverse the chain until the next offset is 0
    while (next != 0) {
        // if the node we are supposed to point to is in the chain then
        // point to the end of the program instead.
        // Michael McCallum <gholam@xtra.co.nz>
        // FIXME: This is a _hack_ to stop infinite programs.
        // I believe that the implementation of the reluctant matches is wrong but
        // have not worked out a better way yet.
        if (node == pointTo) {
            pointTo = programSize;
        }
        node += next;
        if (node >= programSize) {
            // We have reached the end of the program: don't set the pointTo
            return;
        }
        next = instructions.get(node).next;
    }
    // Point the last node in the chain to pointTo (stored as a relative offset)
    instructions.get(node).next = pointTo - node;
}
// /**
// * Adds a new node
// *
// * @param opcode Opcode for node
// * @param opdata Opdata for node
// * @return Index of new node in program
// */
// int node(int opcode, int opdata) {
// // Make room for a new node
// ensure(RE.nodeSize);
//
// // Add new node at end
// instruction[lenInstruction /* + RE.offsetOpcode */] = opcode;
// instruction[lenInstruction + RE.offsetOpdata] = opdata;
// instruction[lenInstruction + RE.offsetNext] = 0;
// lenInstruction += RE.nodeSize;
//
// // Return index of new node
// return lenInstruction - RE.nodeSize;
// }
/**
* Throws a new internal error exception
*
* @throws Error Thrown in the event of an internal error.
*/
void internalError() throws Error {
// Always throws; callers use this for conditions that indicate a bug in the compiler itself
// rather than a fault in the regular expression being compiled
throw new Error("Internal error!");
}
/**
* Throws a new syntax error exception
* @param s the error message
* @throws RESyntaxException Thrown if the regular expression has invalid syntax.
*/
void syntaxError(String s) throws RESyntaxException {
    // The descriptive text is only attached when GWT logging is enabled; otherwise the
    // exception carries an empty message. The current input position is reported either way.
    // This method never returns normally.
    final String message = LogConfiguration.loggingIsEnabled() ? s : "";
    throw new RESyntaxException(message, idx);
}
/**
* Match bracket {m,n} expression put results in bracket member variables
*
* @throws RESyntaxException Thrown if the regular expression has invalid syntax.
*/
void bracket() throws RESyntaxException {
    // Parses a {m}, {m,} or {m,n} quantifier starting at pattern[idx]. On return idx is just
    // past the closing '}', bracketMin holds m, and bracketOpt holds 0, bracketUnbounded or n-m.

    // The caller must have positioned us on '{'
    boolean onOpenBrace = idx < len && pattern.charAt(idx++) == '{';
    if (!onOpenBrace) {
        internalError();
    }

    // The minimum must start with a digit
    if (!(idx < len && isAsciiDigit(pattern.charAt(idx)))) {
        syntaxError("Expected digit");
    }

    // Collect the digits of the minimum ('m' of {m,n})
    StringBuffer digits = new StringBuffer();
    for (; idx < len && isAsciiDigit(pattern.charAt(idx)); idx++) {
        digits.append((char) pattern.charAt(idx));
    }
    try {
        bracketMin = Integer.parseInt(digits.toString());
    } catch (NumberFormatException e) {
        syntaxError("Expected valid number");
    }

    // Something must follow the minimum
    if (idx >= len) {
        syntaxError("Expected comma or right bracket");
    }

    // {m}: no optional repetitions
    int afterMin = pattern.charAt(idx);
    if (afterMin == '}') {
        idx++;
        bracketOpt = 0;
        return;
    }

    // Otherwise it must be {m,} or {m,n}: step over the character, which has to be a comma
    idx++;
    if (afterMin != ',') {
        syntaxError("Expected comma");
    }

    // Something must follow the comma
    if (idx >= len) {
        syntaxError("Expected comma or right bracket");
    }

    // {m,}: unlimited maximum
    if (pattern.charAt(idx) == '}') {
        idx++;
        bracketOpt = bracketUnbounded;
        return;
    }

    // {m,n}: the maximum must start with a digit
    if (!isAsciiDigit(pattern.charAt(idx))) {
        syntaxError("Expected digit");
    }

    // Collect the digits of the maximum
    digits.setLength(0);
    for (; idx < len && isAsciiDigit(pattern.charAt(idx)); idx++) {
        digits.append((char) pattern.charAt(idx));
    }
    try {
        bracketOpt = Integer.parseInt(digits.toString()) - bracketMin;
    } catch (NumberFormatException e) {
        syntaxError("Expected valid number");
    }

    // The maximum may not be smaller than the minimum
    if (bracketOpt < 0) {
        syntaxError("Bad range");
    }

    // Finally the closing brace
    boolean closed = idx < len && pattern.charAt(idx++) == '}';
    if (!closed) {
        syntaxError("Missing close brace");
    }
}
/**
* Test whether a character is an ASCII decimal digit
* @param ch the character to be matched
* @return true if the character is an ASCII digit (0-9)
*/
private static boolean isAsciiDigit(int ch) {
    // Range test on the code point; only '0'..'9' qualify, other Unicode digits do not
    return '0' <= ch && ch <= '9';
}
/**
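* Match an escape sequence. Handles single-character escapes (such as \n or \.),
* multi-character escapes (such as \d or \p{...}) and, outside square brackets in
* XPath mode, back-references. Octal escapes are not supported: \0 is rejected with
* a syntax error. On a normal return the input position has been advanced past the
* whole escape sequence; for a single-character escape the escape character itself
* is then at pattern[idx - 1]. A back-reference is returned as a BackReference
* object holding the group number.
*
* @param inSquareBrackets true if the escape occurs within a character class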
*
* @return an IntPredicate that matches the character or characters represented
* by this escape sequence. For a single-character escape this must be an IntValuePredicate
* @throws RESyntaxException Thrown if the regular expression has invalid syntax.
*/
IntPredicate escape(boolean inSquareBrackets) throws RESyntaxException {
// Parses the escape sequence that starts with the backslash at pattern[idx].
// Note that several cases below end in a call to syntaxError() with no 'break' or 'return':
// this is safe only because syntaxError() always throws.
// "Shouldn't" happen
if (pattern.charAt(idx) != '\\') {
internalError();
}
// Escape shouldn't occur as last character in string!
if (idx + 1 == len) {
syntaxError("Escape terminates string");
}
// Switch on character after backslash
// (idx is first moved past both the backslash and the escape character)
idx += 2;
int escapeChar = pattern.charAt(idx - 1);
switch (escapeChar) {
// Single-character escapes: the result matches exactly one character
case 'n':
return new IntValuePredicate('\n');
case 'r':
return new IntValuePredicate('\r');
case 't':
return new IntValuePredicate('\t');
// Escaped metacharacters stand for themselves
case '\\':
case '|':
case '.':
case '-':
case '^':
case '?':
case '*':
case '+':
case '{':
case '}':
case '(':
case ')':
case '[':
case ']':
return new IntValuePredicate(escapeChar);
case '$':
// "\$" is accepted only when XPath extensions are enabled
if (isXPath) {
return new IntValuePredicate(escapeChar);
} else {
syntaxError("In XSD, '$' must not be escaped");
}
// Multi-character escapes: each maps to a predefined predicate
case 's':
return MultiCharEscape.ESCAPE_s;
case 'S':
return MultiCharEscape.ESCAPE_S;
case 'i':
return MultiCharEscape.ESCAPE_i;
case 'I':
return MultiCharEscape.ESCAPE_I;
case 'c':
return MultiCharEscape.ESCAPE_c;
case 'C':
return MultiCharEscape.ESCAPE_C;
case 'd':
return MultiCharEscape.ESCAPE_d;
case 'D':
return MultiCharEscape.ESCAPE_D;
case 'w':
return MultiCharEscape.ESCAPE_w;
case 'W':
return MultiCharEscape.ESCAPE_W;
// \p{...} and \P{...}: category, subcategory or block escapes; \P yields the complement
case 'p':
case 'P':
if (idx == len) {
syntaxError("Expected '{' after \\" + escapeChar);
}
if (pattern.charAt(idx) != '{') {
syntaxError("Expected '{' after \\" + escapeChar);
}
// The search for '}' starts at the '{'; idx is then stepped past the '{' so that
// [idx, close) is the text between the braces
int close = pattern.indexOf('}', idx++);
if (close == -1) {
syntaxError("No closing '}' after \\" + escapeChar);
}
UnicodeString block = pattern.substring(idx, close);
if (block.length() == 1 && block.charAt(0) < 256) {
// One-character name: looked up via getCategoryCharClass
IntPredicate primary = null;
try {
primary = MultiCharEscape.getCategoryCharClass((char)block.charAt(0));
} catch (IllegalArgumentException err) {
syntaxError(err.getMessage());
}
// Continue parsing after the closing '}'
idx = close+1;
if (escapeChar == 'p') {
return primary;
} else {
return makeComplement(primary);
}
} else if (block.length() == 2) {
// Two-character name: looked up via getSubCategoryCharClass
IntPredicate primary = null;
try {
primary = new IntSetPredicate(MultiCharEscape.getSubCategoryCharClass(block.toString()));
} catch (IllegalArgumentException err) {
syntaxError(err.getMessage());
}
idx = close+1;
if (escapeChar == 'p') {
return primary;
} else {
return makeComplement(primary);
}
} else if (block.toString().startsWith("Is")) {
// "IsXxx": a Unicode block name, looked up without the "Is" prefix
String blockName = block.toString().substring(2);
IntSet uniBlock = UnicodeBlocks.getBlock(blockName);
if (uniBlock == null) {
// XSD 1.1 says this is not an error
// A warning is recorded and the escape matches every character; this is the
// result for \p and for \P alike (no complement is taken on this path)
warning("Unknown Unicode block: " + blockName);
idx = close+1;
return new IntSetPredicate(IntUniversalSet.getInstance());
}
idx = close+1;
IntPredicate primary = new IntSetPredicate(uniBlock);
if (escapeChar == 'p') {
return primary;
} else {
return makeComplement(primary);
}
} else {
syntaxError("Unknown block: " + block);
}
case '0':
syntaxError("Octal escapes not allowed");
// A non-zero digit starts a back-reference
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
if (inSquareBrackets) {
syntaxError("Backreference not allowed within character class");
} else if (isXPath) {
int backRef = (escapeChar - '0');
// Absorb further digits for as long as the resulting number does not exceed the
// number of groups opened so far; a digit that would exceed it is left in the input
while (idx < len) {
int c1 = "0123456789".indexOf(pattern.charAt(idx));
if (c1 < 0) {
break;
} else {
int backRef2 = backRef * 10 + c1;
if (backRef2 > parens) {
break;
} else {
backRef = backRef2;
idx++;
}
}
}
// The referenced group must be recorded in 'captures', i.e. already closed
if (!captures.contains(backRef)) {
String explanation = (backRef > parens ? "(no such group)" : "(group not yet closed)");
syntaxError("invalid backreference \\" + backRef + " " + explanation);
}
return new BackReference(backRef);
} else {
syntaxError("digit not allowed after \\");
}
default:
// Other characters not allowed in XSD regexes
syntaxError("Escape character '" + (char)escapeChar + "' not allowed");
}
// Not reached in practice, because syntaxError() always throws; needed to satisfy the compiler
return null;
}
/**
* For convenience a back-reference is treated as an IntPredicate, although this is a fiction
*/
class BackReference extends IntValuePredicate {
// The group number is stored in the slot that IntValuePredicate otherwise uses for its
// target value; callers read it back with getTarget()
public BackReference(int number) {
super(number);
}
}
/**
* Compile a character class (in square brackets)
*
* @return an IntPredicate that tests whether a character matches this character class
* @throws RESyntaxException Thrown if the regular expression has invalid syntax.
*/
IntPredicate parseCharacterClass() throws RESyntaxException {
    // Parses a character class starting at the '[' at pattern[idx] and returns a predicate
    // for it. On return idx is just past the matching ']'. The class is built up from:
    //   range      - individual characters and character ranges
    //   addend     - the union of any multi-character escapes (\d, \p{..}, ...)
    //   positive   - false if the class starts with '^'
    //   subtrahend - the class following a subtraction operator "-[", if any

    // Check for bad calling: we must be positioned on the opening '['
    if (pattern.charAt(idx) != '[') {
        internalError();
    }
    // Check for unterminated or empty class (this also steps past the '[')
    if ((idx + 1) >= len || pattern.charAt(++idx) == ']') {
        syntaxError("Missing ']'");
    }

    // Parse class declaration
    int simpleChar;
    boolean positive = true;
    boolean definingRange = false;
    int rangeStart = -1;
    int rangeEnd;
    IntRangeSet range = new IntRangeSet();
    IntPredicate addend = null;
    IntPredicate subtrahend = null;

    if (thereFollows("^")) {
        if (thereFollows("^-[")) {
            syntaxError("Nothing before subtraction operator");
        } else if (thereFollows("^]")) {
            syntaxError("Empty negative character group");
        } else {
            positive = false;
            idx++;
        }
    } else if (thereFollows("-[")) {
        syntaxError("Nothing before subtraction operator");
    }

    while (idx < len && pattern.charAt(idx) != ']') {
        int ch = pattern.charAt(idx);
        simpleChar = -1;
        switch (ch) {
            case '[':
                syntaxError("Unescaped '[' within square brackets");
                break;
            case '\\': {
                // Escape always advances the stream
                IntPredicate cc = escape(true);
                if (cc instanceof IntValuePredicate) {
                    // Single-character escape: treat like an ordinary character
                    simpleChar = ((IntValuePredicate) cc).getTarget();
                    break;
                } else {
                    if (definingRange) {
                        syntaxError("Multi-character escape cannot follow '-'");
                    } else if (addend == null) {
                        addend = cc;
                    } else {
                        addend = makeUnion(addend, cc);
                    }
                    continue;
                }
            }
            case '-':
                if (thereFollows("-[")) {
                    // Subtraction: parse the nested class, which must be followed by our own ']'
                    idx++;
                    subtrahend = parseCharacterClass();
                    if (!thereFollows("]")) {
                        syntaxError("Expected closing ']' after subtraction");
                    }
                    // Fix: no simple character was read here, so skip the simple-character
                    // handling below. Previously the sentinel value -1 was added to the set.
                    // The loop now terminates because the next character is ']'.
                    continue;
                } else if (thereFollows("-]")) {
                    // Hyphen at the very end of the class is a literal hyphen
                    simpleChar = '-';
                    idx++;
                } else if (rangeStart >= 0) {
                    // Hyphen after a pending range start: the next character ends the range
                    definingRange = true;
                    idx++;
                    continue;
                } else if (definingRange) {
                    syntaxError("Bad range");
                } else if (thereFollows("--") && !thereFollows("--[")) {
                    syntaxError("Unescaped hyphen as start of range");
                } else {
                    simpleChar = '-';
                    idx++;
                }
                break;
            default:
                simpleChar = ch;
                idx++;
                break;
        }

        // Handle simple character simpleChar
        if (definingRange) {
            // if we are defining a range make it now
            rangeEnd = simpleChar;
            // Actually create a range if the range is ok
            if (rangeStart > rangeEnd) {
                syntaxError("Bad character range: start > end");
                // TODO: not an error in XSD, merely a no-op?
            }
            range.addRange(rangeStart, rangeEnd);
            if (reFlags.isCaseIndependent()) {
                // Special-case A-Z and a-z
                if (rangeStart == 'a' && rangeEnd == 'z') {
                    range.addRange('A', 'Z');
                    for (int v = 0; v < CaseVariants.ROMAN_VARIANTS.length; v++) {
                        range.add(CaseVariants.ROMAN_VARIANTS[v]);
                    }
                } else if (rangeStart == 'A' && rangeEnd == 'Z') {
                    range.addRange('a', 'z');
                    for (int v = 0; v < CaseVariants.ROMAN_VARIANTS.length; v++) {
                        range.add(CaseVariants.ROMAN_VARIANTS[v]);
                    }
                } else {
                    for (int k = rangeStart; k <= rangeEnd; k++) {
                        int[] variants = CaseVariants.getCaseVariants(k);
                        for (int variant : variants) {
                            range.add(variant);
                        }
                    }
                }
            }
            // We are done defining the range
            definingRange = false;
            rangeStart = -1;
        } else {
            // If simple character and not start of range, include it (see XSD 1.1 rules)
            if (thereFollows("-")) {
                if (thereFollows("-[") || thereFollows("-]") || thereFollows("--[")) {
                    // The hyphen that follows is a subtraction operator or a literal hyphen,
                    // so this character stands alone.
                    // Fix: it now receives its case variants like any other single character
                    // (previously "[a-]" with the case-independent flag did not include 'A').
                    addSingleChar(range, simpleChar);
                } else if (thereFollows("--")) {
                    syntaxError("Unescaped hyphen cannot act as end of range");
                } else {
                    // Start of a range; the end is supplied by a later iteration
                    rangeStart = simpleChar;
                }
            } else {
                addSingleChar(range, simpleChar);
            }
        }
    }

    // Shouldn't be out of input
    if (idx == len) {
        syntaxError("Unterminated character class");
    }
    // Absorb the ']' end of class marker
    idx++;

    // Combine: (range union addend), complemented if negative, minus the subtrahend
    IntPredicate result = new IntSetPredicate(range);
    if (addend != null) {
        result = makeUnion(result, addend);
    }
    if (!positive) {
        result = makeComplement(result);
    }
    if (subtrahend != null) {
        result = makeDifference(result, subtrahend);
    }
    return result;
}

// Adds one character to a character-class set, together with its case variants when the
// regular expression is being compiled case-independently.
private void addSingleChar(IntRangeSet range, int ch) {
    range.add(ch);
    if (reFlags.isCaseIndependent()) {
        int[] variants = CaseVariants.getCaseVariants(ch);
        for (int variant : variants) {
            range.add(variant);
        }
    }
}
/**
* Test whether the string starting at the current position is equal to some specified string
* @param s the string being tested
* @return true if the specified string is present
*/
private boolean thereFollows(String s) {
    // Not enough input left for s to fit
    final int end = idx + s.length();
    if (end > len) {
        return false;
    }
    // Compare the slice of the pattern at the current position with s
    final String upcoming = pattern.substring(idx, end).toString();
    return upcoming.equals(s);
}
/**
* Make the union of two IntPredicates (matches if p1 matches or p2 matches)
* @param p1 the first
* @param p2 the second
* @return the result
*/
private IntPredicate makeUnion(IntPredicate p1, IntPredicate p2) {
    // A set-based operand known to be empty contributes nothing: return the other operand as is
    final boolean firstIsEmptySet =
            p1 instanceof IntSetPredicate && ((IntSetPredicate) p1).getIntSet().isEmpty();
    if (firstIsEmptySet) {
        return p2;
    }
    final boolean secondIsEmptySet =
            p2 instanceof IntSetPredicate && ((IntSetPredicate) p2).getIntSet().isEmpty();
    if (secondIsEmptySet) {
        return p1;
    }
    // General case: a predicate that accepts whatever either operand accepts
    return new IntUnionPredicate(p1, p2);
}
/**
* Make the difference of two IntPredicates (matches if p1 matches and p2 does not match)
* @param p1 the first
* @param p2 the second
* @return the result
*/
private IntPredicate makeDifference(IntPredicate p1, IntPredicate p2) {
// Always wraps the operands in a new IntExceptPredicate; no simplification is attempted
return new IntExceptPredicate(p1, p2);
}
/**
* Make the complement of an IntPredicate (matches if p1 does not match)
* @param p1 the operand
* @return the result
*/
private IntPredicate makeComplement(IntPredicate p1) {
    // Complement of a complement: unwrap instead of nesting a second wrapper
    if (p1 instanceof IntComplementPredicate) {
        IntComplementPredicate inner = (IntComplementPredicate) p1;
        return inner.getOperand();
    }
    return new IntComplementPredicate(p1);
}
// Appends a character-class operation that tests characters against the supplied predicate,
// and returns the index of the new operation in the program.
private int emitCharacterClass(IntPredicate range) {
    final Operation.OpCharClass charClassOp = new Operation.OpCharClass();
    charClassOp.predicate = range;
    final int index = appendNode(charClassOp);
    return index;
}
/**
* Absorb an atomic character string. This method is a little tricky because
* it can un-include the last character of string if a quantifier operator follows.
* This is correct because *+? have higher precedence than concatenation (thus
* ABC* means AB(C*) and NOT (ABC)*).
*
* @return Index of new atom node
* @throws RESyntaxException Thrown if the regular expression has invalid syntax.
*/
int atom() throws RESyntaxException {
// Collects a run of literal characters (including single-character escapes) starting at
// pattern[idx] into one OpAtom, stopping before any metacharacter and before a character
// to which a following quantifier must bind.
// Create a string node
Operation.OpAtom node = new Operation.OpAtom();
// Length of atom
int lenAtom = 0;
// Loop while we've got input
FastStringBuffer fsb = new FastStringBuffer(FastStringBuffer.SMALL);
atomLoop:
while (idx < len) {
// Is there a next char?
if ((idx + 1) < len) {
// c is the character that follows the current item (a plain character or a whole escape)
int c = pattern.charAt(idx + 1);
// If the next 'char' is an escape, look past the whole escape
if (pattern.charAt(idx) == '\\') {
// escape() is called here only to find where the escape ends; its result is
// discarded and idx is restored afterwards. If the escape runs to the end of the
// pattern, c keeps the value read above.
int idxEscape = idx;
escape(false);
if (idx < len) {
c = pattern.charAt(idx);
}
idx = idxEscape;
}
// Switch on next char
switch (c) {
case '{':
case '?':
case '*':
case '+':
// If the next character is a quantifier operator and our atom is non-empty, the
// current character should bind to the quantifier operator rather than the atom
if (lenAtom != 0) {
break atomLoop;
}
}
}
// Switch on current char
switch (pattern.charAt(idx)) {
// Metacharacters that end the atom; they are left in the input for the caller
case ']':
case '.':
case '[':
case '(':
case ')':
case '|':
break atomLoop;
case '{':
case '?':
case '*':
case '+':
// We should have an atom by now
if (lenAtom == 0) {
// No atom before quantifier
syntaxError("No expression before quantifier");
}
break atomLoop;
case '\\': {
// Get the escaped character (advances input automatically)
int idxBeforeEscape = idx;
IntPredicate charClass = escape(false);
// Check if it's a simple escape (as opposed to, say, a backreference)
if (charClass instanceof BackReference || !(charClass instanceof IntValuePredicate)) {
// Not a simple escape, so backup to where we were before the escape.
idx = idxBeforeEscape;
break atomLoop;
}
// Add escaped char to atom
fsb.appendWideChar(((IntValuePredicate) charClass).getTarget());
lenAtom++;
break;
}
case '^':
case '$':
// With XPath extensions these are anchors and end the atom
if (isXPath) {
break atomLoop;
}
// else fall through ($ is not a metacharacter in XSD)
default:
// Add normal character to atom
fsb.appendWideChar(pattern.charAt(idx++));
lenAtom++;
break;
}
}
// This shouldn't happen
if (fsb.length() == 0) {
internalError();
}
// Emit the instruction into the program
node.atom = GeneralUnicodeString.makeUnicodeString(fsb.condense());
return appendNode(node);
}
// Adds an operation at the end of the program and returns the index at which it was stored.
private int appendNode(Operation node) {
    final int index = instructions.size();
    instructions.add(node);
    return index;
}
/**
* Match a terminal node.
*
* @param flags Flags
* @return Index of terminal node (closeable)
* @throws RESyntaxException Thrown if the regular expression has invalid syntax.
*/
int terminal(int[] flags) throws RESyntaxException {
// Compiles the single terminal that starts at pattern[idx]: an anchor, '.', a character
// class, a parenthesised expression, an escape, or (by default) a literal atom.
// flags[0] has NODE_NULLABLE set for a back-reference and cleared for escapes and atoms.
// Several error cases below have no 'break': internalError() and syntaxError() always throw,
// so control never falls through from them.
switch (pattern.charAt(idx)) {
case '$':
// End-of-line anchor, only with XPath extensions; otherwise '$' is handled as an atom below
if (isXPath) {
idx++;
Operation.OpEOL eol = new Operation.OpEOL();
return appendNode(eol);
}
break;
case '^':
// Beginning-of-line anchor, only with XPath extensions; otherwise handled as an atom below
if (isXPath) {
idx++;
Operation.OpBOL bol = new Operation.OpBOL();
return appendNode(bol);
}
break;
case '.':
idx++;
IntPredicate predicate;
if (reFlags.isSingleLine()) {
// in XPath with the 's' flag, '.' matches everything
predicate = new IntPredicate() {
public boolean matches(int value) {
return true;
}
};
} else {
// in XSD, "." matches everything except \n and \r. See also bug 15594.
predicate = new IntPredicate() {
public boolean matches(int value) {
return (value != '\n' && value != '\r');
}
};
}
Operation.OpCharClass dot = new Operation.OpCharClass();
dot.predicate = predicate;
return appendNode(dot);
case '[':
// parseCharacterClass() consumes the whole bracketed class
IntPredicate range = parseCharacterClass();
Operation.OpCharClass cc = new Operation.OpCharClass();
cc.predicate = range;
return appendNode(cc);
case '(':
// Parenthesised sub-expression: compiled recursively
return expr(flags);
case ')':
syntaxError("Unexpected close paren");
case '|':
internalError();
case ']':
syntaxError("Mismatched class");
case 0:
syntaxError("Unexpected end of input");
case '?':
case '+':
case '{':
case '*':
syntaxError("No expression before quantifier");
case '\\': {
// Don't forget, escape() advances the input stream!
int idxBeforeEscape = idx;
IntPredicate esc = escape(false);
if (esc instanceof BackReference) {
int backreference = ((BackReference)esc).getTarget();
// The group number must be below the count of groups opened so far
if (parens <= backreference) {
syntaxError("Bad backreference");
}
flags[0] |= NODE_NULLABLE;
Operation.OpBackReference back = new Operation.OpBackReference();
back.groupNr = backreference;
return appendNode(back);
} else if (esc instanceof IntSingletonSet) {
// NOTE(review): escape() is documented to return an IntValuePredicate for a
// single-character escape; whether any value it returns can be an IntSingletonSet
// is not visible here - confirm that this branch is reachable.
// We had a simple escape and we want to have it end up in
// an atom, so we back up and fall though to the default handling
idx = idxBeforeEscape;
flags[0] &= ~NODE_NULLABLE;
} else {
// Any other escape is emitted as a character-class test on the returned predicate
flags[0] &= ~NODE_NULLABLE;
return emitCharacterClass(esc);
//return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));
}
}
}
// Everything above either fails or returns.
// If it wasn't one of the above, it must be the start of an atom.
flags[0] &= ~NODE_NULLABLE;
return atom();
}
/**
* Compile a piece consisting of an atom and optional quantifier
*
* @param flags Flags passed by reference
* @return Index of resulting instruction
* @throws RESyntaxException Thrown if the regular expression has invalid syntax.
*/
int piece(int[] flags) throws RESyntaxException {
// Compiles one terminal followed by an optional quantifier (?, *, +, {m}, {m,}, {m,n}),
// each optionally made reluctant by a trailing '?'. The quantifier is implemented by
// inserting and appending control operations around the terminal's instructions and
// wiring them together with setNextOfEnd().
// Before terminal
int idxBeforeTerminal = idx;
// Values to pass by reference to terminal()
int[] terminalFlags = {NODE_NORMAL};
// Get terminal symbol
int ret = terminal(terminalFlags);
// Or in flags from terminal symbol
flags[0] |= terminalFlags[0];
// Advance input, set NODE_NULLABLE flag and do sanity checks
if (idx >= len) {
// Nothing follows the terminal, so there is no quantifier
return ret;
}
boolean greedy = true;
int quantifierType = pattern.charAt(idx);
// First pass: consume the quantifier and simplify it where it is meaningless.
// The cases deliberately fall through into each other.
switch (quantifierType) {
case '?':
case '*':
// The current node can be null
flags[0] |= NODE_NULLABLE;
// Drop through
case '+':
// Eat quantifier character
idx++;
// Drop through
case '{':
if (quantifierType == '{') {
// Parses {m,n} into the bracketMin / bracketOpt fields and consumes it
bracket();
}
Operation op = instructions.get(ret);
if (op instanceof Operation.OpBOL || op instanceof Operation.OpEOL) {
// Pretty meaningless, but legal. If the quantifier allows zero occurrences, ignore the instruction.
// Otherwise, ignore the quantifier
if (quantifierType == '?' || quantifierType == '*' ||
(quantifierType == '{' && bracketMin == 0)) {
instructions.set(ret, new Operation.OpNothing());
} else {
quantifierType = 0;
}
}
if ((terminalFlags[0] & NODE_NULLABLE) != 0) {
// The terminal itself was flagged as nullable
if (quantifierType == '?') {
// can ignore the quantifier
quantifierType = 0;
} else if (quantifierType == '+') {
// '*' and '+' are equivalent
quantifierType = '*';
} else if (quantifierType == '{') {
// bounds are meaningless
quantifierType = '*';
}
}
}
// If the next character is a '?', make the quantifier non-greedy (reluctant)
if (idx < len && pattern.charAt(idx) == '?') {
if (!isXPath) {
syntaxError("Reluctant quantifiers are not allowed in XSD");
}
idx++;
greedy = false;
}
// Second pass: emit the instructions for the (possibly simplified) quantifier.
// A quantifierType of 0, or any non-quantifier character, matches no case below.
if (greedy) {
// Actually do the quantifier now
switch (quantifierType) {
case '{': {
//bracket();
int bracketEnd = idx;
// Local copies of the parsed bounds: re-running terminal() below can recurse into
// piece() and bracket() for a parenthesised operand, which overwrites the fields
int bracketMin = this.bracketMin;
int bracketOpt = this.bracketOpt;
// Pointer to the last terminal
int pos = ret;
// Process min first
for (int c = 0; c < bracketMin; c++) {
// Rewind stream and run it through again - more matchers coming
idx = idxBeforeTerminal;
// Arguments are evaluated left to right: the first argument is the previous
// value of pos, the second compiles a fresh copy and stores its index in pos
setNextOfEnd(pos, pos = terminal(terminalFlags));
}
// Do the right thing for maximum ({m,})
if (bracketOpt == bracketUnbounded) {
// Drop through now and quantifier expression.
// We are done with the {m,} expr, so skip rest
idx = bracketEnd;
// Wrap the last copy of the terminal in a star loop
Operation.OpStar op = new Operation.OpStar();
insertNode(op, pos);
setNextOfEnd(pos + 1, pos);
break;
} else if (bracketOpt > 0) {
// opt[i] is the index of the i-th MAYBE; opt[bracketOpt] is the common end node
int opt[] = new int[bracketOpt + 1];
// Surround first optional terminal with MAYBE
Operation.OpMaybe op = new Operation.OpMaybe();
insertNode(op, pos);
opt[0] = pos;
// Add all the rest optional terminals with preceding MAYBEs
for (int c = 1; c < bracketOpt; c++) {
op = new Operation.OpMaybe();
opt[c] = appendNode(op);
// Rewind stream and run it through again - more matchers coming
idx = idxBeforeTerminal;
terminal(terminalFlags);
}
// Tie ends together
int end = opt[bracketOpt] = appendNode(new Operation.OpNothing());
for (int c = 0; c < bracketOpt; c++) {
setNextOfEnd(opt[c], end);
setNextOfEnd(opt[c] + 1, opt[c + 1]);
}
} else {
// Rollback terminal - no opt matchers present
//lenInstruction = pos;
// Discard everything from pos onwards and put a no-op in its place
while (instructions.size() > pos) {
instructions.remove(instructions.size()-1);
}
Operation.OpNothing nothing = new Operation.OpNothing();
appendNode(nothing);
}
// We are done. skip the reminder of {m,n} expr
idx = bracketEnd;
break;
}
case '?': {
// MAYBE is placed in front of the terminal; both MAYBE and the terminal lead to a
// common NOTHING node appended at the end
Operation.OpMaybe maybe = new Operation.OpMaybe();
insertNode(maybe, ret);
Operation.OpNothing nothing = new Operation.OpNothing();
int n = appendNode(nothing);
setNextOfEnd(ret, n);
setNextOfEnd(ret + 1, n);
break;
}
case '*': {
// STAR is placed in front of the terminal, and the terminal's chain loops back to it
Operation.OpStar star = new Operation.OpStar();
insertNode(star, ret);
setNextOfEnd(ret + 1, ret);
break;
}
case '+': {
// CONTINUE is placed in front of the terminal and PLUS after it; the terminal leads
// to PLUS, and PLUS leads back to CONTINUE
Operation.OpContinue continu = new Operation.OpContinue();
insertNode(continu, ret);
Operation.OpPlus plus = new Operation.OpPlus();
int n = appendNode(plus);
setNextOfEnd(ret + 1, n);
setNextOfEnd(n, ret);
break;
}
}
} else {
// Not greedy (reluctant): Actually do the quantifier now
// The structure mirrors the greedy cases, using the reluctant operation classes
switch (quantifierType) {
case '?': {
Operation.OpReluctantMaybe reluctantMaybe = new Operation.OpReluctantMaybe();
insertNode(reluctantMaybe, ret);
//nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
int n = appendNode(new Operation.OpNothing());
//int n = node(RE.OP_NOTHING, 0);
setNextOfEnd(ret, n);
setNextOfEnd(ret + 1, n);
break;
}
case '*': {
Operation.OpReluctantStar reluctantStar = new Operation.OpReluctantStar();
insertNode(reluctantStar, ret);
setNextOfEnd(ret + 1, ret);
break;
}
case '+': {
insertNode(new Operation.OpContinue(), ret);
//nodeInsert(RE.OP_CONTINUE, 0, ret);
int n = appendNode(new Operation.OpReluctantPlus());
//int n = node(RE.OP_RELUCTANTPLUS, 0);
setNextOfEnd(n, ret);
setNextOfEnd(ret + 1, n);
break;
}
case '{': {
// reluctant {..}? - added by MHK
//bracket();
int bracketEnd = idx;
// Local copies of the parsed bounds (see the greedy '{' case above in this method)
int bracketMin = this.bracketMin;
int bracketOpt = this.bracketOpt;
// Pointer to the last terminal
int pos = ret;
// Process min first
for (int c = 0; c < bracketMin; c++) {
// Rewind stream and run it through again - more matchers coming
idx = idxBeforeTerminal;
setNextOfEnd(pos, pos = terminal(terminalFlags));
}
// Do the right thing for maximum ({m,})
if (bracketOpt == bracketUnbounded) {
// Drop through now and quantifier expression.
// We are done with the {m,} expr, so skip rest
idx = bracketEnd;
insertNode(new Operation.OpReluctantStar(), pos);
//nodeInsert(RE.OP_RELUCTANTSTAR, 0, pos);
setNextOfEnd(pos + 1, pos);
break;
} else if (bracketOpt > 0) {
int opt[] = new int[bracketOpt + 1];
// Surround first optional terminal with MAYBE
insertNode(new Operation.OpReluctantMaybe(), pos);
//nodeInsert(RE.OP_RELUCTANTMAYBE, 0, pos);
opt[0] = pos;
// Add all the rest optional terminals with preceding MAYBEs
for (int c = 1; c < bracketOpt; c++) {
opt[c] = appendNode(new Operation.OpReluctantMaybe());
//opt[c] = node(RE.OP_RELUCTANTMAYBE, 0);
// Rewind stream and run it through again - more matchers coming
idx = idxBeforeTerminal;
terminal(terminalFlags);
}
// Tie ends together
int end = opt[bracketOpt] = appendNode(new Operation.OpNothing());
for (int c = 0; c < bracketOpt; c++) {
setNextOfEnd(opt[c], end);
setNextOfEnd(opt[c] + 1, opt[c + 1]);
}
} else {
// Rollback terminal - no opt matchers present
while (instructions.size() > pos) {
instructions.remove(instructions.size() - 1);
}
appendNode(new Operation.OpNothing());
}
// We are done. skip the reminder of {m,n} expr
idx = bracketEnd;
break;
}
}
}
// The index returned is that of the first instruction of the piece
return ret;
}
/**
* Compile body of one branch of an or operator (implements concatenation)
*
* @param compilerFlags Flags passed by reference
* @return Pointer to first node in the branch
* @throws RESyntaxException Thrown if the regular expression has invalid syntax.
*/
int branch(int[] compilerFlags) throws RESyntaxException {
    // Compiles the pieces of one alternative, up to (but not including) the next '|' or ')'
    // or the end of the pattern, and chains them together in sequence.
    int first = -1;             // index of the first piece; -1 while the branch is empty
    int previous = -1;          // index of the most recently compiled piece
    boolean allNullable = true; // remains true only if every piece set some flag
    final int[] pieceFlags = new int[1];

    while (idx < len) {
        final int ch = pattern.charAt(idx);
        if (ch == '|' || ch == ')') {
            break;
        }
        // Compile the next, possibly quantified, piece with a fresh flag word
        pieceFlags[0] = NODE_NORMAL;
        final int current = piece(pieceFlags);
        if (pieceFlags[0] == NODE_NORMAL) {
            allNullable = false;
        }
        if (previous == -1) {
            // First piece: this is what the branch returns
            first = current;
        } else {
            // Later piece: hook it onto the end of the preceding one
            setNextOfEnd(previous, current);
        }
        previous = current;
    }

    // An empty branch is represented by a single do-nothing operation
    if (first == -1) {
        first = appendNode(new Operation.OpNothing());
    }

    // Report nullability of the whole branch to the caller
    if (allNullable) {
        compilerFlags[0] |= NODE_NULLABLE;
    }
    return first;
}
/**
* Compile an expression with possible parens around it. Paren matching
* is done at this level so we can tie the branch tails together.
* <p>
* When called with NODE_TOPLEVEL clear and the current character is '(', an opening node is
* emitted first: a cluster-open node for the non-capturing form "(?:" or a capturing open node
* otherwise. The alternatives separated by '|' are then compiled, a matching closing node
* (or an end-of-program node when no paren was opened) is appended, and the tail of every
* alternative is linked to that closing node.
*
* @param compilerFlags Flag value passed by reference (a one-element array). The NODE_TOPLEVEL
* bit is tested and then cleared here; the same array is handed on to branch()
* @return Node index of expression in instruction array (the opening node when a paren was
* opened, otherwise the node returned for the first branch)
* @throws RESyntaxException Thrown if the regular expression has invalid syntax.
*/
int expr(int[] compilerFlags) throws RESyntaxException {
// Create open paren node unless we were called from the top level (which has no parens)
// paren: -1 = no paren opened here, 1 = capturing group, 2 = non-capturing cluster
int paren = -1;
int ret = -1;
// Remember the group number before it is incremented below, so that the closing node
// created later carries the same number as the opening node
int closeParens = parens;
if ((compilerFlags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(') {
// If it's a cluster "(?:" rather than a proper capturing subexpression (i.e. one usable with backrefs)
if (idx + 2 < len && pattern.charAt(idx + 1) == '?' && pattern.charAt(idx + 2) == ':') {
if (!isXPath30) {
syntaxError("Non-capturing groups allowed only in XPath3.0");
}
paren = 2;
// Skip the three characters "(?:"
idx += 3;
ret = appendNode(new Operation.OpOpenCluster());
} else {
paren = 1;
idx++;
ret = appendNode(new Operation.OpOpen(parens++));
}
}
// Clear the top-level bit in the shared flag word before compiling the contents
compilerFlags[0] &= ~NODE_TOPLEVEL;
// Process contents of first branch node
boolean open = false;
int branch = branch(compilerFlags);
if (ret == -1) {
ret = branch;
} else {
setNextOfEnd(ret, branch);
}
// Loop through branches
while (idx < len && pattern.charAt(idx) == '|') {
// Now open the first branch since there are more than one
if (!open) {
Operation.OpBranch op = new Operation.OpBranch();
insertNode(op, branch);
open = true;
}
// Step over the '|'
idx++;
// Java evaluates arguments left to right: the first argument is the previous value of
// 'branch', after which 'branch' is updated to the newly appended OpBranch node
setNextOfEnd(branch, branch = appendNode(new Operation.OpBranch()));
branch(compilerFlags);
}
// Create an ending node (either a close paren or an OP_END)
int end;
if (paren > 0) {
if (idx < len && pattern.charAt(idx) == ')') {
idx++;
} else {
// NOTE(review): syntaxError presumably always throws - confirm against its definition
syntaxError("Missing close paren");
}
if (paren == 1) {
end = appendNode(new Operation.OpClose(closeParens));
// Record the number of the capturing group that has just been closed
captures.add(closeParens);
} else {
end = appendNode(new Operation.OpCloseCluster());
}
} else {
end = appendNode(new Operation.OpEndProgram());
}
// Append the ending node to the ret nodelist
setNextOfEnd(ret, end);
// Hook the ends of each branch to the end node
int currentNode = ret;
int nextNodeOffset = instructions.get(currentNode).next;
// Walk the chain starting at ret, following the relative "next" offsets, until a node
// with no successor (offset 0) has been processed or the index runs off the instruction list
while (nextNodeOffset != 0 && currentNode < instructions.size()) {
// If branch, make the end of the branch's operand chain point to the end node.
// The operand chain is taken to start at the node immediately after the OpBranch.
if (instructions.get(currentNode) instanceof Operation.OpBranch) {
setNextOfEnd(currentNode + 1, end);
}
// Re-read the offset: it is relative to the current node
nextNodeOffset = instructions.get(currentNode).next;
currentNode += nextNodeOffset;
}
// Return the node list
return ret;
}
/**
 * Compiles a regular expression pattern into a program runnable by the pattern
 * matcher class 'RE'.
 * <p>
 * With the 'q' flag the whole pattern becomes a single literal atom. Otherwise the
 * pattern is parsed as an expression, after first removing whitespace outside square
 * brackets when the 'x' flag is set. Finally the relative "next" offsets held in the
 * instructions are converted to absolute ones.
 *
 * @param pattern Regular expression pattern to compile (see RECompiler class
 *                for details).
 * @return A compiled regular expression program.
 * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
 * @see RECompiler
 * @see REMatcher
 */
public REProgram compile(UnicodeString pattern) throws RESyntaxException {
    // Reset the parsing state for this pattern
    this.pattern = pattern;
    len = pattern.length();
    idx = 0;
    parens = 1;                 // paren level 1: the implicit outer parens
    boolean matchesEmpty = false;

    if (reFlags.isLiteral()) {
        // 'q' flag: the pattern is one literal atom followed by the end-of-program node
        int atom = literalAtom();
        int terminator = appendNode(new Operation.OpEndProgram());
        setNextOfEnd(atom, terminator);
    } else {
        if (reFlags.isAllowWhitespace()) {
            // 'x' flag: parse the whitespace-stripped pattern instead of the original
            pattern = stripInsignificantWhitespace(pattern);
            this.pattern = pattern;
            this.len = pattern.length();
        }

        // Flags are passed by reference through a one-element array
        int[] flags = new int[]{NODE_TOPLEVEL};
        expr(flags);
        matchesEmpty = (flags[0] & NODE_NULLABLE) != 0;

        // The parser must have consumed the whole pattern
        if (idx != len) {
            if (pattern.charAt(idx) == ')') {
                syntaxError("Unmatched close paren");
            }
            syntaxError("Unexpected input remains");
        }
    }

    // Copy the instructions into an array, turning each relative "next" offset into an
    // absolute index (an offset of 0 means "no successor" and becomes -1)
    Operation[] ops = new Operation[instructions.size()];
    for (int i = 0; i < instructions.size(); i++) {
        Operation op = instructions.get(i);
        if (op.next != 0) {
            op.next += i;
        } else {
            op.next = -1;
        }
        ops[i] = op;
    }

    REProgram program = new REProgram(ops, parens, reFlags);
    if (reFlags.isDebug()) {
        program.display(System.err);
    }
    program.setNullable(matchesEmpty);
    return program;
}

/**
 * Returns a copy of the pattern with whitespace removed, except for whitespace that
 * appears between square brackets. A backslash escapes the following retained character,
 * so an escaped '[' or ']' does not change the bracket nesting level.
 */
private static UnicodeString stripInsignificantWhitespace(UnicodeString source) {
    final int length = source.length();
    FastStringBuffer out = new FastStringBuffer(length);
    int depth = 0;              // current square-bracket nesting level
    boolean sawNonBmp = false;  // true once a code point above 65535 has been seen
    boolean escaped = false;    // true while a backslash is waiting for its operand

    for (int i = 0; i < length; i++) {
        int ch = source.charAt(i);
        if (ch > 65535) {
            sawNonBmp = true;
        }

        boolean keep;
        if (escaped) {
            // Whitespace dropped at nesting level 0 leaves the escape pending
            keep = depth != 0 || !Whitespace.isWhitespace(ch);
            if (keep) {
                escaped = false;
            }
        } else if (ch == '\\') {
            escaped = true;
            keep = true;
        } else if (ch == '[') {
            depth++;
            keep = true;
        } else if (ch == ']') {
            depth--;
            keep = true;
        } else {
            keep = depth != 0 || !Whitespace.isWhitespace(ch);
        }

        if (keep) {
            out.appendWideChar(ch);
        }
    }

    if (sawNonBmp) {
        return new GeneralUnicodeString(out);
    }
    return new BMPString(out);
}
/**
 * Compile a "regular expression" for which the q flag is set. The complete pattern is
 * emitted as one atom, so no character in it is treated as special (all behave as if escaped).
 *
 * @return Index of new atom node
 */
int literalAtom() {
    // The atom's text is the entire pattern, taken verbatim
    Operation.OpAtom literal = new Operation.OpAtom();
    literal.atom = pattern;
    int index = appendNode(literal);
    return index;
}
///////////////////////////////////////////////////////////////////////////////////////////////
// DIAGNOSTIC CODE
///////////////////////////////////////////////////////////////////////////////////////////////
/**
 * Return a string describing a (possibly unprintable) character.
 * Printable ASCII characters (space through '~') are returned unchanged. Any other
 * character - control characters, DEL (127) and everything above ASCII - is rendered
 * as a backslash followed by its decimal character code.
 *
 * @param c Character to convert to a printable representation
 * @return String representation of character
 */
String charToString(char c) {
    // DEL (127) is a control character too, so the upper bound must include it
    if (c < ' ' || c >= 127) {
        return "\\" + (int) c;
    }
    // Return the character as a string
    return String.valueOf(c);
}
}
// This class is derived from the Apache Jakarta project, with substantial
// modifications by Saxonica to make the regular expression dialect conform
// with XPath 2.0 specifications.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/