// Copyright 2010 NexJ Systems Inc. This software is licensed under the terms of the Eclipse Public License 1.0
package nexj.core.scripting.match;
import java.io.StringReader;
import nexj.core.scripting.GenericParser;
import nexj.core.scripting.Pair;
import nexj.core.scripting.ParserException;
import nexj.core.scripting.Symbol;
import nexj.core.util.TextPosition;
import nexj.core.util.TextPositionHolder;
/**
* Class for parsing "match" operator string expressions into Pair trees.
*/
public class ExpressionParser extends GenericParser
{
/**
* Token denoting an AND operator.
*/
private static final int TOKEN_AND = 1;
/**
* Token denoting a FUZZY operator.
*/
private static final int TOKEN_FUZZY = 2;
/**
* Token denoting a NOT operator.
*/
private static final int TOKEN_NOT = 3;
/**
* Token denoting an OR operator.
*/
private static final int TOKEN_OR = 4;
/**
* Token denoting a value (a literal, or an already parsed subexpression).
*/
private static final int TOKEN_VALUE = 5;
/**
* Token denoting a WEIGHT operator.
*/
private static final int TOKEN_WEIGHT = 6;
/**
* Whether the parser is currently inside a parenthesized subexpression.
* Used for exception generation only (detecting unmatched parentheses).
*/
private boolean m_bSubexpression;
/**
* Current read offset into the expression, counted in consumed characters.
* Used for exception generation.
*/
private int m_nOffset;
/**
* Constructs a parser that is not bound to a global environment
* (null is passed to the GenericParser constructor).
*/
public ExpressionParser()
{
super(null); // do not use global environment
}
/**
 * Parses a "match" expression string into a Pair expression tree.
 * Not thread safe: the per-parse state is kept in instance fields.
 * @param sExpression The expression to parse (may be null).
 * @return The parsed expression tree; null if the expression is null or
 * yields no value. A non-Pair result is wrapped in a single-element Pair.
 * @throws ParserException If the expression cannot be parsed; any
 * RuntimeException raised while parsing is wrapped and reported with the
 * original expression text.
 */
public Pair parse(String sExpression) throws ParserException
{
   if (sExpression == null)
   {
      return null;
   }

   // start every parse from a clean state
   m_tokenBuf.setLength(0);
   m_bSubexpression = false;
   m_nOffset = 0;

   Object result;

   try
   {
      result = parse(new StringReader(sExpression), null);
   }
   catch (RuntimeException e) // covers both NumberFormatException and ParserException
   {
      TextPosition pos;

      if (e instanceof TextPositionHolder)
      {
         pos = ((TextPositionHolder)e).getTextPosition();
      }
      else
      {
         pos = getCurTokenPos();
      }

      throw new ParserException("err.scripting.invalidMatchExpression",
         new Object[]{sExpression}, e, pos);
   }
   finally
   {
      m_tokenBuf.setLength(0); // release the buffered characters
   }

   if (result == null)
   {
      return null;
   }

   if (result instanceof Pair)
   {
      return (Pair)result;
   }

   return new Pair(result);
}
/**
 * Parses one complete (sub)expression, which must run up to an EOF token.
 * @return The parsed expression.
 * @see nexj.core.scripting.GenericParser#parseElement()
 */
protected Object parseElement()
{
   Object expr = parseOr();

   if (getCurToken() == TOKEN_EOF)
   {
      return expr;
   }

   // leftover token after a complete expression (usually a missing operator)
   fail(true);

   return expr;
}
/**
 * Parses a single atom: a literal/subexpression value, or a unary NOT or
 * FUZZY operator applied to the token that follows it.
 * @return The atom, or null if the current token is EOF.
 */
protected Object parseAtom()
{
   int nToken = getCurToken();

   if (nToken == TOKEN_EOF)
   {
      return null;
   }

   Object atom;

   switch (nToken)
   {
      case TOKEN_NOT: // unary NOT
         if (getNextToken() == TOKEN_EOF)
         {
            fail(true); // an operand must follow the operator
         }

         atom = Pair.list(Symbol.NOT, m_tokenValue);
         break;

      case TOKEN_FUZZY: // unary FUZZY, the operand must be a plain string
         if (getNextToken() == TOKEN_EOF || !(m_tokenValue instanceof String))
         {
            fail(true); // an operand must follow the operator
         }

         atom = Pair.list(Symbol.LIKE_P, m_tokenValue);
         break;

      default: // either a String or a parsed subexpression, e.g. '("a")'
         atom = m_tokenValue;
         break;
   }

   forgetToken(); // token consumed

   return atom;
}
/**
 * Parses a run of weighted atoms joined by AND. A NOT token that directly
 * follows an operand is treated as an implicit AND: it is left in place so
 * that the atom parser consumes it as a unary operator.
 * @param value A value that should be consumed before the next input (null if none).
 * @return The single operand, or an AND list of all operands.
 */
protected Object parseAnd(Object value)
{
   if (value == null)
   {
      value = parseWeight(); // nothing supplied, read the first operand
   }

   for (int nToken = getCurToken();
      nToken == TOKEN_AND || nToken == TOKEN_NOT;
      nToken = getCurToken())
   {
      if (nToken == TOKEN_AND)
      {
         forgetToken(); // explicit AND token consumed
      }

      Object operand = parseWeight();

      if (operand == null)
      {
         fail(true); // an expression may not end with "and"
      }

      boolean bAndList = value instanceof Pair && ((Pair)value).getHead() == Symbol.AND;

      if (!bAndList)
      {
         value = Pair.list(Symbol.AND, value); // start a new AND list
      }

      // splice in the operands of a nested AND list, otherwise add the operand itself
      Pair tail;

      if (operand instanceof Pair && ((Pair)operand).getHead() == Symbol.AND)
      {
         tail = ((Pair)operand).getNext();
      }
      else
      {
         tail = new Pair(operand);
      }

      value = Pair.append((Pair)value, tail);
   }

   return value;
}
/**
 * Parses AND-expressions joined by OR. Consecutive WEIGHT expressions
 * separated by OR are merged into a single WEIGHT list instead of being
 * added as separate OR operands (used by the jUnit tests for WEIGHT evaluation).
 * @return The single operand, a merged WEIGHT list, or an OR list.
 */
protected Object parseOr()
{
   Object result = parseAnd(null);

   while (getCurToken() == TOKEN_OR)
   {
      forgetToken(); // OR token consumed

      // NOTE: parseAnd() leaves the current token at the next operator
      Object operand = parseAnd(null);

      if (operand == null)
      {
         fail(true); // an expression may not end with "or"
      }

      boolean bWeightOperand = operand instanceof Pair && ((Pair)operand).getHead() == Symbol.MUL;

      if (bWeightOperand && result instanceof Pair)
      {
         Pair left = (Pair)result;

         if (left.getHead() == Symbol.MUL)
         {
            // left side is a WEIGHT list: concatenate and push the result back
            result = parseAnd(Pair.append(left, ((Pair)operand).getNext()));

            continue;
         }

         if (left.getHead() == Symbol.OR)
         {
            // find the last cell of the OR list
            Pair last = left;

            while (last.getTail() != null)
            {
               last = last.getNext();
            }

            Object lastHead = last.getHead();

            if (lastHead instanceof Pair && ((Pair)lastHead).getHead() == Symbol.MUL)
            {
               // last OR operand is a WEIGHT list: concatenate in place
               last.setHead(parseAnd(Pair.append((Pair)lastHead, ((Pair)operand).getNext())));

               continue;
            }
         }
      }

      // regular OR expression
      boolean bOrList = result instanceof Pair && ((Pair)result).getHead() == Symbol.OR;

      if (!bOrList)
      {
         result = Pair.list(Symbol.OR, result); // start a new OR list
      }

      // splice in the operands of a nested OR list, otherwise add the operand itself
      Pair tail;

      if (operand instanceof Pair && ((Pair)operand).getHead() == Symbol.OR)
      {
         tail = ((Pair)operand).getNext();
      }
      else
      {
         tail = new Pair(operand);
      }

      result = Pair.append((Pair)result, tail);
   }

   return result;
}
/**
 * Parses an atom optionally followed by a WEIGHT operator and its numeric
 * weight. A non-numeric weight raises NumberFormatException.
 * @return The atom itself, or a WEIGHT list holding the (weight, atom) pair.
 * An atom that already is a WEIGHT list is returned unchanged, but the
 * weight token is still consumed.
 */
protected Object parseWeight()
{
   Object operand = parseAtom();

   if (getCurToken() != TOKEN_WEIGHT)
   {
      return operand;
   }

   // the operator must be followed by a plain string token holding the weight
   if (getNextToken() == TOKEN_EOF || !(m_tokenValue instanceof String))
   {
      fail(true);
   }

   Double factor = Double.valueOf(m_tokenValue.toString());
   boolean bWeighted = operand instanceof Pair && ((Pair)operand).getHead() == Symbol.MUL;

   if (!bWeighted)
   {
      operand = Pair.list(Symbol.MUL, Pair.list(factor, operand));
   }

   forgetToken(); // weight token consumed

   return operand;
}
/**
 * Reads the next token from the input into m_tokenValue.
 * <p>
 * Tokens are separated by whitespace, by the single-character operators
 * (! &amp; * - | ~), by double quotes and by parentheses. A parenthesized
 * subexpression is parsed recursively and returned as one TOKEN_VALUE.
 * Inside a quoted literal every character up to the closing quote is taken
 * literally, including whitespace, operator characters and parentheses.
 * @return One of the TOKEN_* constants; TOKEN_EOF at the end of the input
 * or at the closing parenthesis of a subexpression.
 * @see nexj.core.scripting.GenericParser#parseToken()
 */
protected int parseToken()
{
   boolean bQuoted = false;

   m_tokenBuf.setLength(0);

   // skip leading whitespace, keeping the offset counter in step
   while (getCurChar() != CHAR_EOF && Character.isWhitespace(getCurChar()))
   {
      forgetChar();
      ++m_nOffset;
   }

   m_nTokenColumn = m_nOffset; // mark current token start position

   while (getCurChar() != CHAR_EOF && (bQuoted || !Character.isWhitespace(getCurChar())))
   {
      if (bQuoted)
      {
         // Only the closing quote is special inside a quoted literal; operator
         // characters and parentheses must not terminate or split the literal.
         if (getCurChar() == '"')
         {
            forgetChar();
            ++m_nOffset;
            m_tokenValue = m_tokenBuf.toString();

            return TOKEN_VALUE; // finished literal processing (can be empty)
         }

         m_tokenBuf.append((char)getCurChar());
         forgetChar();
         ++m_nOffset;

         continue;
      }

      switch (getCurChar())
      {
         case '"':
            if (m_tokenBuf.length() > 0) // have something before the quoted literal
            {
               m_tokenValue = m_tokenBuf.toString();

               return valueType((String)m_tokenValue); // will pick up '"' char next time
            }

            bQuoted = true;
            forgetChar();
            ++m_nOffset;

            continue;

         case ')':
            if (!m_bSubexpression)
            {
               fail(false); // unmatched closing parenthesis
            }

            m_tokenValue = m_tokenBuf.toString();

            // generate subexpression EOF on end of subexpression
            return (m_tokenBuf.length() > 0) ? valueType((String)m_tokenValue) : TOKEN_EOF;

         case '(':
         {
            if (m_tokenBuf.length() > 0) // have something before the subexpression
            {
               // Return the pending text first; the nested parse below would
               // otherwise clear the shared buffer and silently drop it.
               m_tokenValue = m_tokenBuf.toString();

               return valueType((String)m_tokenValue); // will pick up '(' char next time
            }

            int nOffset = m_nTokenColumn; // note starting position of atom
            boolean bSubexpression = m_bSubexpression; // remember current state

            forgetChar();
            ++m_nOffset;
            m_bSubexpression = true; // expect a closing parenthesis
            m_tokenValue = parseElement();
            m_nTokenColumn = nOffset; // note starting position for exception (if any)

            if (getCurChar() != ')')
            {
               fail(false); // unmatched opening parenthesis
            }

            m_bSubexpression = bSubexpression; // restore the outer state
            m_nToken = 0; // reset for getCurToken() to work
            forgetChar();
            ++m_nOffset;

            return TOKEN_VALUE;
         }

         case '!': // fall through
         case '&': // fall through
         case '*': // fall through
         case '-': // fall through
         case '|': // fall through
         case '~':
            if (m_tokenBuf.length() == 0)
            {
               // the operator itself is the token
               m_tokenBuf.append((char)getCurChar());
               forgetChar();
               ++m_nOffset;
            }

            // otherwise return the pending text, the operator is picked up next time
            m_tokenValue = m_tokenBuf.toString();

            return valueType((String)m_tokenValue);
      }

      m_tokenBuf.append((char)getCurChar());
      forgetChar();
      ++m_nOffset;
   }

   if (bQuoted)
   {
      fail(false); // quoted literals should have been closed
   }

   m_tokenValue = m_tokenBuf.toString();

   return (m_tokenBuf.length() > 0) ? valueType((String)m_tokenValue) : TOKEN_EOF;
}
/**
 * Determines the token type of an unquoted token's text. Each operator is
 * recognized either by its single-character form or by the name of the
 * corresponding Symbol; anything else is a plain value.
 * @param sValue The value to examine.
 * @return One of the TOKEN_* constants.
 */
protected static int valueType(String sValue)
{
   if (sValue.equals("|") || Symbol.OR.getName().equals(sValue)) // MatchNode.OR
   {
      return TOKEN_OR;
   }

   if (sValue.equals("&") || Symbol.AND.getName().equals(sValue)) // MatchNode.AND
   {
      return TOKEN_AND;
   }

   if (sValue.equals("!") || sValue.equals("-") ||
      Symbol.NOT.getName().equals(sValue)) // MatchNode.NOT
   {
      return TOKEN_NOT;
   }

   if (sValue.equals("*") || Symbol.MUL.getName().equals(sValue)) // MatchNode.WEIGHT
   {
      return TOKEN_WEIGHT;
   }

   if (sValue.equals("~") || Symbol.LIKE_P.getName().equals(sValue)) // MatchNode.FUZZY
   {
      return TOKEN_FUZZY;
   }

   return TOKEN_VALUE;
}
/**
 * Convenience method to abort parsing; always throws.
 *
 * @param bToken True to report the position of the current token, false to
 * report the current text position (falling back to the token position if
 * no text position is available).
 * @throws ParserException Always.
 */
protected final void fail(boolean bToken) throws ParserException
{
   TextPosition pos = null;

   if (!bToken)
   {
      pos = getCurTextPosition();
   }

   if (pos == null)
   {
      // Never returns null.
      pos = getCurTokenPos();
   }

   throw new ParserException("err.scripting.matchExpressionParsing", null, null, pos);
}
}