// Copyright 2010 NexJ Systems Inc. This software is licensed under the terms of the Eclipse Public License 1.0
package nexj.core.scripting.match;
import java.io.StringReader;
import nexj.core.scripting.GenericParser;
import nexj.core.scripting.Pair;
import nexj.core.scripting.ParserException;
import nexj.core.scripting.Symbol;
import nexj.core.util.TextPosition;
import nexj.core.util.TextPositionHolder;
* Class for parsing "match" operator string expressions into Pair trees.
public class ExpressionParser extends GenericParser
* Token denoting an AND operator.
private static final int TOKEN_AND = 1;
* Token denoting a FUZZY operator.
private static final int TOKEN_FUZZY = 2;
* Token denoting a NOT operator.
private static final int TOKEN_NOT = 3;
* Token denoting an OR operator.
private static final int TOKEN_OR = 4;
* Token denoting a value.
private static final int TOKEN_VALUE = 5;
* Token denoting a WEIGHT operator.
private static final int TOKEN_WEIGHT = 6;
* Is parser currently inside a subexpression, used for exception generation only.
private boolean m_bSubexpression;
* Current read offset, used for exception generation.
private int m_nOffset;
* Constructor.
// Creates a parser instance. The only work done is handing null to the GenericParser
// constructor, which (per the inline note) means no global environment is used.
public ExpressionParser()
super(null); // do not use global environment
* Parse a string into a Pair expression tree (not thread safe).
* @param sExpression The expression to parse.
* @return Parsed expression tree.
// Entry point: converts a "match" expression string into a Pair tree.
//  - A null expression returns null immediately, before any parser state is touched.
//  - The token buffer, the subexpression flag and the read offset are reset on every call.
//  - A RuntimeException raised while parsing is rethrown as a ParserException with the
//    key "err.scripting.invalidMatchExpression" and the original exception as its cause.
//  - A result that is neither null nor a Pair is wrapped with new Pair(value), so callers
//    always receive a Pair (or null).
// NOTE(review): the "try" keyword, the braces and the contents of the Object[] argument
// are not visible in this view of the file; these comments describe the visible lines only.
public Pair parse(String sExpression) throws ParserException
if (sExpression == null)
return null;
m_tokenBuf.setLength(0); // reset for new string
m_bSubexpression = false; // reset for new string
m_nOffset = 0; // reset for new string
Object value;
// delegates to the two-argument parse(...) overload, reading from an in-memory reader
value = parse(new StringReader(sExpression), null);
catch (RuntimeException e) // common parent of NumberFormatException & ParserException
// prefer the position carried by the exception itself; otherwise use the current token position
TextPosition pos = (e instanceof TextPositionHolder) ? ((TextPositionHolder)e).getTextPosition() : getCurTokenPos();
throw new ParserException("err.scripting.invalidMatchExpression", new Object[]
}, e, pos);
// NOTE(review): if m_tokenBuf is a StringBuffer/StringBuilder, setLength(0) only resets the
// length and keeps the allocated capacity - confirm that "free memory" is accurate.
m_tokenBuf.setLength(0); // free memory
return (value instanceof Pair || value == null) ? (Pair)value : new Pair(value);
* @see nexj.core.scripting.GenericParser#parseElement()
// Parses one complete expression by starting at the lowest-precedence rule, parseOr().
// After parseOr() returns, the current token is expected to be TOKEN_EOF; anything else
// is treated as a parse problem.
// NOTE(review): the body of the TOKEN_EOF check is not visible in this view - presumably
// it calls fail(...); confirm against the full file.
protected Object parseElement()
Object value = parseOr();
if (getCurToken() != TOKEN_EOF) // some sort of parse exception (usually missing operator)
return value;
* Parse a single atom.
* @return an atomic value.
// Parses the smallest unit of an expression and consumes its token.
//  - TOKEN_EOF: returns null without consuming anything.
//  - TOKEN_NOT: reads the following token and returns the list (Symbol.NOT operand).
//  - TOKEN_FUZZY: reads the following token, which must be a String, and returns the
//    list (Symbol.LIKE_P operand).
//  - anything else: returns m_tokenValue unchanged (a literal or an already-parsed
//    subexpression).
// Note that the NOT branch only rejects a missing operand, whereas the FUZZY branch also
// rejects a non-String operand.
protected Object parseAtom()
int nValueType = getCurToken();
if (nValueType == TOKEN_EOF)
return null;
Object value = m_tokenValue; // either a String or subexpression e.g. '("a")'
// only triggered if it's the first exclusion of the expression
if (nValueType == TOKEN_NOT) // have a unary NOT
if (getNextToken() == TOKEN_EOF) // get token after done with current buffer
fail(true); // invalid to not have RHS value
value = Pair.list(Symbol.NOT, m_tokenValue);
else if (nValueType == TOKEN_FUZZY) // have a unary FUZZY
// get token after done with current buffer (a next value is required)
if (getNextToken() == TOKEN_EOF || !(m_tokenValue instanceof String))
fail(true); // invalid to not have RHS value
value = Pair.list(Symbol.LIKE_P, m_tokenValue);
} // else have literal value
forgetToken(); // token consumed
return value;
* Parse atoms ANDed together.
* @param value A value that should be consumed before the next input (null if none).
* @return The single parsed value, or a list headed by Symbol.AND when several values were joined.
// Parses a run of operands joined by AND and returns either the single operand or one
// flat list headed by Symbol.AND.
//  - When a non-null value is passed in, it is used as the left-hand operand and no token
//    is read for it.
//  - The loop continues on TOKEN_AND and also on TOKEN_NOT. Only an explicit AND token is
//    consumed here; a NOT token is left in place and picked up by parseWeight() (which
//    starts with parseAtom()), so "a !b" is joined the same way as "a & !b".
//  - Operands that are themselves AND lists are spliced in rather than nested.
// NOTE(review): the body of the "next == null" check is not visible in this view -
// presumably it calls fail(...); confirm against the full file.
protected Object parseAnd(Object value)
if (value == null) // only consume next value if none provided
value = parseWeight();
while(getCurToken() == TOKEN_AND || getCurToken() == TOKEN_NOT)
if (getCurToken() == TOKEN_AND)
forgetToken(); // AND token consumed
Object next = parseWeight();
if (next == null) // invalid to end expression with "and"
if (!(value instanceof Pair) || ((Pair)value).getHead() != Symbol.AND)
value = Pair.list(Symbol.AND, value); // do not have an AND on left
// have an AND list or some other sort of expression/value
// if next is itself an AND list, append only its operands (skip its Symbol.AND head)
value = Pair.append((Pair)value,
(next instanceof Pair && ((Pair)next).getHead() == Symbol.AND)
? ((Pair)next).getNext() : new Pair(next));
return value;
* Parse atoms ORed together (possibly join multiple WEIGHT atoms).
* @return The single parsed value, or a list headed by Symbol.OR when several values were joined.
// Parses a run of AND-expressions joined by OR and returns either the single operand or
// one flat list headed by Symbol.OR.
// Special case: when the new right-hand operand is a weight list (head Symbol.MUL) and the
// left side already ends in a weight list, the two weight lists are concatenated instead of
// being kept as separate OR operands. The concatenated list is handed back to parseAnd()
// as a pre-supplied left operand, so parseAnd() reads no new operand for it and only
// continues with any AND/NOT tokens that follow.
// NOTE(review): braces, the body of the "next == null" check and any else/continue
// statements are not visible in this view, so the exact nesting of the branches below
// cannot be confirmed from here - check against the full file before changing this logic.
protected Object parseOr()
Object value = parseAnd(null);
while(getCurToken() == TOKEN_OR)
forgetToken(); // OR token consumed
Object next = parseAnd(null); // NOTE: curToken now set to next operator by parseAnd()
if (next == null) // invalid to end expression with "or"
// join sequential WEIGHT statements separated by OR into a single statement
// used in jUnit tests for testing WEIGHT evaluation
if (next instanceof Pair && ((Pair)next).getHead() == Symbol.MUL && value instanceof Pair)
// case 1: the whole left side is a weight list
if (((Pair)value).getHead() == Symbol.MUL)
//have a WEIGHT list to append to, push-back concatenated and get actual first value
value = parseAnd(Pair.append((Pair)value, ((Pair)next).getNext()));
// case 2: the left side is an OR list; only its last operand is a merge candidate
if (((Pair)value).getHead() == Symbol.OR) // check if last element of OR list is WEIGHT
Pair last = (Pair)value;
// walk to the final cell of the OR list
while (last.getTail() != null)
last = last.getNext();
if (last.getHead() instanceof Pair && // have a WEIGHT list to append to
((Pair)last.getHead()).getHead() == Symbol.MUL)
last.setHead(parseAnd(Pair.append((Pair)last.getHead(), // push-back concatenated
((Pair)next).getNext()))); // then set as head
// else regular OR expression
if (!(value instanceof Pair) || ((Pair)value).getHead() != Symbol.OR)
value = Pair.list(Symbol.OR, value); // do not have an OR on left
// have an OR list or some other sort of expression/value
// if next is itself an OR list, append only its operands (skip its Symbol.OR head)
value = Pair.append((Pair)value,
(next instanceof Pair && ((Pair)next).getHead() == Symbol.OR)
? ((Pair)next).getNext() : new Pair(next));
return value;
* Parse atom with a weight.
* @return an atomic value.
// Parses an atom and, when it is followed by TOKEN_WEIGHT, the numeric weight after it.
//  - Without a weight token the atom is returned unchanged.
//  - With a weight token, the next token must exist and be a String; it is converted to a
//    Double, and an atom that is not already a Symbol.MUL list is wrapped as
//    (Symbol.MUL (weight atom)).
//  - A weight string that is not a valid number makes the Double conversion throw
//    NumberFormatException, which parse(String) catches as a RuntimeException.
// NOTE(review): braces (and any else-branch for an atom that is already a Symbol.MUL list)
// are not visible in this view - confirm against the full file.
protected Object parseWeight()
Object value = parseAtom();
if (getCurToken() == TOKEN_WEIGHT)
// get token after done with current buffer (a next value is required)
if (getNextToken() == TOKEN_EOF || !(m_tokenValue instanceof String))
fail(true); // invalid to not have a RHS value
// NOTE(review): the Double(String) constructor is deprecated since Java 9;
// Double.valueOf(String) is the documented replacement.
Double weight = new Double(m_tokenValue.toString());
// do not have a WEIGHT on left
if (!(value instanceof Pair) || ((Pair)value).getHead() != Symbol.MUL)
value = Pair.list(Symbol.MUL, Pair.list(weight, value));
forgetToken(); // token consumed
return value;
* @see nexj.core.scripting.GenericParser#parseToken()
// Tokenizer: scans characters, leaves the token's value in m_tokenValue and returns its
// type (one of the TOKEN_* constants or TOKEN_EOF).
// Behavior that can be read from the visible lines:
//  - leading whitespace is skipped, then m_nTokenColumn is set from m_nOffset;
//  - whitespace ends a token unless a quoted literal is open;
//  - '"' opens or closes a quoted literal; text collected before an opening quote is
//    returned first as its own token, and a closed literal is TOKEN_VALUE even when empty;
//  - ')' outside a subexpression is an error; inside one it ends the pending token, or
//    yields TOKEN_EOF when nothing is pending, which terminates the nested parse;
//  - '(' parses a nested expression through parseElement() and requires the current
//    character to be ')' afterwards, restoring the previous subexpression flag;
//  - end of input inside an open quoted literal is an error; otherwise the pending text is
//    classified by valueType(), or TOKEN_EOF is returned when nothing is pending.
// NOTE(review): braces, break/default labels and the statements that advance the reader or
// append to m_tokenBuf are not visible in this view, so the exact fall-through between the
// cases cannot be confirmed from here.
protected int parseToken()
boolean bQuoted = false;
while(getCurChar() != CHAR_EOF && Character.isWhitespace(getCurChar()))
m_nTokenColumn = m_nOffset; // mark current token start position
while(getCurChar() != CHAR_EOF && (bQuoted || !Character.isWhitespace(getCurChar())))
switch (getCurChar())
case '"':
if (!bQuoted)
if (m_tokenBuf.length() > 0) // if have something before quoted literal
m_tokenValue = m_tokenBuf.toString();
return valueType((String)m_tokenValue); // will pick up '"' char next time
bQuoted = true;
m_tokenValue = m_tokenBuf.toString();
return TOKEN_VALUE; // finished literal processing (can be empty)
case ')':
if (!m_bSubexpression)
fail(false); // unmatched closing parenthesis
m_tokenValue = m_tokenBuf.toString();
// generate subexpression EOF on end of subexpression
return (m_tokenBuf.length() > 0) ? valueType((String)m_tokenValue) : TOKEN_EOF;
case '(':
int nOffset = m_nTokenColumn; // note starting position of atom
boolean bSubexpression = m_bSubexpression; //remember current state
m_bSubexpression = true; // expect a closing parenthesis
// recursive descent: the nested expression becomes this token's value
m_tokenValue = parseElement();
m_nTokenColumn = nOffset; // note starting position for exception (if any)
if (getCurChar() != ')')
fail(false); // unmatched opening parenthesis
m_bSubexpression = bSubexpression; // clear modified state
m_nToken = 0; // reset for getCurToken() to work
// single-character operators
case '!': // fall through
case '&': // fall through
case '*': // fall through
case '-': // fall through
case '|': // fall through
case '~':
// NOTE(review): the body of this empty-buffer check is not visible; presumably it makes
// the operator character itself the token - confirm against the full file.
if (m_tokenBuf.length() == 0)
m_tokenValue = m_tokenBuf.toString();
return valueType((String)m_tokenValue);
if (bQuoted)
fail(false); // quoted literals should have been closed
m_tokenValue = m_tokenBuf.toString();
return (m_tokenBuf.length() > 0) ? valueType((String)m_tokenValue) : TOKEN_EOF;
* Determine the type of the value stored in the buffer.
* @param sValue The value to examine.
* @return One of the TOKEN_* constants.
// Classifies a raw token string as an operator or a value. Tests run in this order:
//  - "|" or the name of Symbol.OR            -> TOKEN_OR
//  - "&" or the name of Symbol.AND           -> TOKEN_AND
//  - "!", "-" or the name of Symbol.NOT      -> TOKEN_NOT
//  - "*" or the name of Symbol.MUL           (weight test)
//  - "~" or the name of Symbol.LIKE_P        (fuzzy test)
// sValue is dereferenced without a null check, so a null argument throws
// NullPointerException.
// NOTE(review): the second half of each Symbol-name comparison, the return statements of
// the last two tests and the final fallback return are not visible in this view -
// presumably TOKEN_WEIGHT, TOKEN_FUZZY and TOKEN_VALUE; confirm against the full file.
// NOTE(review): for the literal operators the length() pre-check is redundant, because
// String.equals() already returns false for strings of different length.
protected static int valueType(String sValue)
if (("|".length() == sValue.length() && "|".equals(sValue)) ||
(Symbol.OR.getName().length() == sValue.length() && // MatchNode.OR
return TOKEN_OR;
if (("&".length() == sValue.length() && "&".equals(sValue)) ||
(Symbol.AND.getName().length() == sValue.length() && // MatchNode.AND
return TOKEN_AND;
if (("!".length() == sValue.length() && "!".equals(sValue)) ||
("-".length() == sValue.length() && "-".equals(sValue)) ||
(Symbol.NOT.getName().length() == sValue.length() && // MatchNode.NOT
return TOKEN_NOT;
if (("*".length() == sValue.length() && "*".equals(sValue)) ||
(Symbol.MUL.getName().length() == sValue.length() && // MatchNode.WEIGHT
if (("~".length() == sValue.length() && "~".equals(sValue)) ||
(Symbol.LIKE_P.getName().length() == sValue.length() && // MatchNode.FUZZY
* Convenience method to abort parsing.
* @param bToken Whether the position should be reported as token or text
* position.
* @throws ParserException
// Aborts parsing by throwing a ParserException with the key
// "err.scripting.matchExpressionParsing"; the method never returns normally.
//  - bToken == true:  the current token position is reported.
//  - bToken == false: the current text position is reported, falling back to the current
//    token position when getCurTextPosition() returns null.
protected final void fail(boolean bToken) throws ParserException
// null here acts as the "use the token position" marker for the check below
TextPosition pos = (bToken) ? null : getCurTextPosition();
if (pos == null)
// Never returns null.
pos = getCurTokenPos();
throw new ParserException("err.scripting.matchExpressionParsing", null, null, pos);