/*
* $Id: Tokenizer.java,v 1.43 2002/09/16 08:05:06 jkl Exp $
*
* Copyright (c) 2002 Njet Communications Ltd. All Rights Reserved.
*
* Use is subject to license terms, as defined in
* Anvil Software License, Version 1.1. See LICENSE
* file, or http://njet.org/license-1.1.txt
*/
package anvil.script.parser;
import anvil.java.io.GenericInputStream;
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.io.File;
import java.util.Hashtable;
public class Tokenizer implements TokenManager, ParserBaseConstants
{
/** Token kind given to line comments and to block comments that are not documentation comments. */
public static final int COMMENT = 1000;
/** Token kind given to block comments longer than four characters that start with <code>/**</code>. */
public static final int DOC_COMMENT = 1001;
/** Token kind given to runs of whitespace when ignorable tokens are returned. */
public static final int WHITESPACE = 1002;
private static Hashtable _symbols = new Hashtable();
static {
_symbols.put("as", new Integer(AS));
_symbols.put("assert", new Integer(ASSERT));
_symbols.put("boolean", new Integer(BOOLEAN));
_symbols.put("break", new Integer(BREAK));
_symbols.put("case", new Integer(CASE));
_symbols.put("catch", new Integer(CATCH));
_symbols.put("class", new Integer(CLASS));
_symbols.put("classof", new Integer(CLASSOF));
_symbols.put("const", new Integer(CONST));
_symbols.put("copyof", new Integer(COPYOF));
_symbols.put("cloneof", new Integer(CLONEOF));
_symbols.put("continue", new Integer(CONTINUE));
_symbols.put("default", new Integer(_DEFAULT));
_symbols.put("defined", new Integer(DEFINED));
_symbols.put("delete", new Integer(DELETE));
_symbols.put("do", new Integer(DO));
_symbols.put("else", new Integer(ELSE));
_symbols.put("extends", new Integer(EXTENDS));
_symbols.put("exit", new Integer(EXIT));
_symbols.put("false", new Integer(FALSE));
_symbols.put("finally", new Integer(FINALLY));
_symbols.put("float", new Integer(FLOAT));
_symbols.put("foreach", new Integer(FOREACH));
_symbols.put("for", new Integer(FOR));
_symbols.put("function", new Integer(FUNCTION));
_symbols.put("if", new Integer(IF));
_symbols.put("implements", new Integer(IMPLEMENTS));
_symbols.put("import", new Integer(IMPORT));
_symbols.put("in", new Integer(IN));
_symbols.put("inf", new Integer(INF));
_symbols.put("interface", new Integer(INTERFACE));
_symbols.put("int", new Integer(INT));
_symbols.put("is", new Integer(IS));
_symbols.put("has", new Integer(HAS));
_symbols.put("module", new Integer(MODULE));
_symbols.put("namespace", new Integer(NAMESPACE));
_symbols.put("new", new Integer(NEW));
_symbols.put("null", new Integer(NULL));
_symbols.put("println", new Integer(PRINTLN));
_symbols.put("printbr", new Integer(PRINTBR));
_symbols.put("print", new Integer(PRINT));
_symbols.put("return", new Integer(RETURN));
_symbols.put("sizeof", new Integer(SIZEOF));
_symbols.put("string", new Integer(STRING));
_symbols.put("static", new Integer(STATIC));
_symbols.put("super", new Integer(SUPER));
_symbols.put("switch", new Integer(SWITCH));
_symbols.put("synchronized", new Integer(SYNCHRONIZED));
_symbols.put("this", new Integer(THIS));
_symbols.put("throw", new Integer(THROW));
_symbols.put("true", new Integer(TRUE));
_symbols.put("try", new Integer(TRY));
_symbols.put("typeof", new Integer(TYPEOF));
_symbols.put("undefined", new Integer(UNDEFINED));
_symbols.put("var", new Integer(VAR));
_symbols.put("while", new Integer(WHILE));
_symbols.put("yield", new Integer(YIELD));
}
// Parser that receives error reports; null until setParser() is called.
private ParserBase _parser = null;
// Character source; also queried for the current line and column.
private GenericInputStream _input;
// Column at which the token currently being scanned begins
// (refreshed before each fresh read, or moved by backup()).
private int _beginColumn = 0;
// Line at which the token currently being scanned begins.
private int _beginLine = 0;
// Single pushed-back character, or -1 when nothing is pushed back.
private int _backup = -1;
// When true, a '/' that is not part of a comment or "/=" starts a pattern
// instead of being returned as the SLASH operator.
private boolean _allowPattern = false;
// Scratch buffer shared by the readers; each reader resets it before use.
private StringBuffer _image = new StringBuffer(64);
// When true, getNextToken() returns whitespace and comment tokens too.
private boolean _returnIgnorable;
// Last documentation comment or ordinary token seen by getNextToken();
// used to attach documentation text to the token following a doc comment.
private Token _previous = null;
/**
 * Creates a tokenizer that reads its source from a byte array.
 *
 * @param array source bytes
 */
public Tokenizer(byte[] array)
{
  this._input = new GenericInputStream(array);
}

/**
 * Creates a tokenizer that reads its source from a stream.
 *
 * @param input source stream
 */
public Tokenizer(InputStream input)
{
  this._input = new GenericInputStream(input);
}

/**
 * Creates a tokenizer that reads its source from a stream and optionally
 * hands whitespace and comment tokens to the caller.
 *
 * @param input source stream
 * @param returnIgnorable true to return whitespace and comments as tokens
 */
public Tokenizer(InputStream input, boolean returnIgnorable)
{
  this(input);
  this._returnIgnorable = returnIgnorable;
}

/**
 * Creates a tokenizer over source code held in a string.
 *
 * @param code source text, converted to bytes with anvil.util.Conversions
 */
public Tokenizer(String code)
{
  byte[] bytes = anvil.util.Conversions.getBytes(code);
  this._input = new GenericInputStream(new ByteArrayInputStream(bytes));
}
/**
 * Sets the parser that lexical errors are reported to.
 *
 * @param parser receiver of error reports
 */
public void setParser(ParserBase parser)
{
  this._parser = parser;
}
/**
 * Builds the end-of-input token (kind 0), zero-width at the current
 * begin position.
 *
 * @return the end-of-input token
 */
private Token eof()
{
  Token end = createToken(0);
  end.endLine = end.beginLine;
  end.endColumn = end.beginColumn;
  return end;
}
/**
 * Creates a token that starts at the current begin position; kind, image
 * and end position are left for the caller to fill in.
 *
 * @return the new token
 */
private Token createToken()
{
  Token fresh = new Token();
  fresh.beginColumn = _beginColumn;
  fresh.beginLine = _beginLine;
  return fresh;
}
/**
 * Creates a token of the given kind that starts at the current begin
 * position; image and end position are left for the caller to fill in.
 *
 * @param kind token kind
 * @return the new token
 */
private Token createToken(int kind)
{
  Token fresh = createToken();
  fresh.kind = kind;
  return fresh;
}
/**
 * Creates a complete single-line token at the current begin position and
 * marks that a pattern may follow it.
 *
 * @param kind token kind
 * @param image token text; its length determines the end column
 * @return the new token
 */
private Token createToken(int kind, String image)
{
  _allowPattern = true;
  Token fresh = createToken(kind);
  fresh.image = image;
  fresh.endLine = fresh.beginLine;
  fresh.endColumn = fresh.beginColumn + image.length() - 1;
  return fresh;
}
/**
 * Same as createToken(kind, image), except that a pattern is not
 * allowed to follow the created token ("NP" = no pattern).
 *
 * @param kind token kind
 * @param image token text
 * @return the new token
 */
private Token createTokenNP(int kind, String image)
{
  Token created = createToken(kind, image);
  // createToken() switched the flag on; switch it back off.
  _allowPattern = false;
  return created;
}
/**
 * Pushes a character back so the next scan starts with it, and records
 * its position as the begin position of the next token.
 *
 * @param ch character to push back (-1 leaves nothing pushed back)
 * @param line line of the character
 * @param column column of the character
 */
private void backup(int ch, int line, int column)
{
  _backup = ch;
  _beginColumn = column;
  _beginLine = line;
}
/**
 * Reads the remainder of a block comment whose opening slash-star has
 * already been consumed.
 *
 * The returned token is a DOC_COMMENT when its image is longer than four
 * characters and starts with slash-star-star, otherwise a COMMENT. Reaching
 * the end of input first is reported to the parser and the partial comment
 * is returned. The pattern flag is left untouched.
 *
 * @return the comment token, including both delimiters in its image
 * @throws IOException if reading the input fails
 */
private Token skipToEndOfComment() throws IOException
{
  GenericInputStream input = _input;
  StringBuffer image = _image;
  image.setLength(0);
  image.append("/*");
  // True when the previously read character was '*'.
  boolean star = false;
  for(;;) {
    int ch = input.read();
    if (ch == -1) {
      _parser.error(_parser.toLocation(input.getLineNumber(),
        input.getColumnNumber()), "Unexpected end of comment");
      break;
    }
    if (ch == '/' && star) {
      image.append('/');
      break;
    }
    star = (ch == '*');
    image.append((char)ch);
  }
  String text = image.toString();
  boolean isDoc = text.length() > 4 && text.startsWith("/**");
  Token comment = createToken(isDoc ? DOC_COMMENT : COMMENT);
  comment.image = text;
  comment.endLine = input.getLineNumber();
  comment.endColumn = input.getColumnNumber() - 1;
  return comment;
}
/**
 * Reads the rest of the current line as a line comment.
 *
 * Unlike ordinary tokens, a comment must not change whether a following
 * '/' starts a pattern, so the pattern flag is restored after the token
 * is built (block comments behave the same way).
 *
 * @param start the comment introducer already consumed ("//" or "#")
 * @return a COMMENT token whose image is the introducer, the rest of the
 *         line and a newline
 * @throws IOException if reading the input fails
 */
private Token skipToEndOfLineComment(String start) throws IOException
{
  String comment = _input.readLine();
  if (comment == null) {
    // NOTE(review): guards against readLine() signalling end of input
    // with null, which would otherwise put the text "null" in the image.
    comment = "";
  }
  boolean allowPattern = _allowPattern;
  Token token = createToken(COMMENT, start + comment + '\n');
  _allowPattern = allowPattern;
  return token;
}
/**
 * Reads an identifier and, for plain identifiers, turns reserved words
 * into their keyword token kinds.
 *
 * @param ch first character of the identifier, or 0 when the identifier
 *           was introduced by '$'; in that case the begin column is moved
 *           past the '$' and no keyword lookup is done
 * @return a SYMBOL or keyword token; an empty name is reported to the
 *         parser as an error
 * @throws IOException if reading the input fails
 */
private Token readSymbol(int ch) throws IOException
{
  _allowPattern = false;
  Token symbol = createToken(SYMBOL);
  GenericInputStream in = _input;
  StringBuffer text = _image;
  text.setLength(0);

  boolean plain = (ch > 0);
  if (plain) {
    text.append((char)ch);
  } else {
    symbol.beginColumn++;
  }

  int line;
  int column;
  while (true) {
    line = in.getLineNumber();
    column = in.getColumnNumber();
    int next = in.read();
    if (next == -1) {
      break;
    }
    // '$' is a Java identifier part but always ends a symbol here.
    if (next == '$' || !Character.isJavaIdentifierPart((char)next)) {
      backup(next, line, column);
      break;
    }
    text.append((char)next);
  }

  symbol.image = text.toString();
  if (symbol.image.length() == 0) {
    _parser.error(_parser.toLocation(in.getLineNumber(),
      in.getColumnNumber()), "Empty symbols not allowed");
  } else if (plain) {
    Integer keyword = (Integer)_symbols.get(symbol.image);
    if (keyword != null) {
      symbol.kind = keyword.intValue();
    }
  }
  symbol.endLine = line;
  symbol.endColumn = column - 1;
  return symbol;
}
/**
 * Reads the body of a triple-quoted string; the three opening quotes have
 * already been consumed. The string ends at three consecutive unescaped
 * double quotes and may span lines.
 *
 * The image is delimited by a single double quote on each side. Shorter
 * runs of quotes inside the body are kept as content.
 *
 * @param token the STRING_LITERAL token started by readString()
 * @return the same token with image and end position filled in
 * @throws IOException if reading the input fails
 */
private Token readLongString(Token token) throws IOException
{
  GenericInputStream input = _input;
  StringBuffer image = _image;
  boolean escaped = false;
  // Number of consecutive unescaped quotes seen but not yet appended.
  int state = 0;
  int line;
  int column;
  int ch;
  image.setLength(0);
  image.append('"');
  finished:
  for(;;) {
    line = input.getLineNumber();
    column = input.getColumnNumber();
    ch = input.read();
    switch(ch) {
    case -1:
      // The pending quotes were content after all.
      while(state > 0) {
        image.append('"');
        state--;
      }
      _parser.error(_parser.toLocation(line, column),
        "Unexpected end of string literal");
      break finished;

    case '\\':
      while(state > 0) {
        image.append('"');
        state--;
      }
      escaped = !escaped;
      image.append('\\');
      break;

    case '"':
      if (!escaped) {
        if (++state == 3) {
          break finished;
        }
        break;
      }
      // An escaped quote is ordinary content: fall through.

    default:
      while(state > 0) {
        image.append('"');
        state--;
      }
      escaped = false;
      image.append((char)ch);
      break;
    }
  }
  image.append('"');
  token.image = image.toString();
  token.endLine = line;
  token.endColumn = column;
  return token;
}
/**
 * Reads a quoted string literal; the opening quote has already been
 * consumed. A backslash escapes the following character. A line break
 * or the end of input inside the literal is reported to the parser.
 *
 * An empty double-quoted string directly followed by a third double
 * quote starts a triple-quoted string, handled by readLongString().
 *
 * @param stringquote the quote character that opened the literal
 * @return a STRING_LITERAL token whose image includes the quotes
 * @throws IOException if reading the input fails
 */
private Token readString(int stringquote) throws IOException
{
  _allowPattern = false;
  Token token = createToken(STRING_LITERAL);
  GenericInputStream input = _input;
  StringBuffer image = _image;
  int line;
  int column;
  int ch;
  boolean escaped = false;
  image.setLength(0);
  image.append((char)stringquote);
  finished:
  for(;;) {
    line = input.getLineNumber();
    column = input.getColumnNumber();
    ch = input.read();
    switch(ch) {
    case -1:
      _parser.error(_parser.toLocation(line, column),
        "Unexpected end of string literal");
      break finished;

    case '\\':
      escaped = !escaped;
      image.append('\\');
      break;

    case '\n':
    case '\r':
      _parser.error(_parser.toLocation(line, column),
        "String literal cannot span over lines");
      break finished;

    default:
      image.append((char)ch);
      if (!escaped && ch == stringquote) {
        break finished;
      }
      escaped = false;
      break;
    }
  }
  if (image.toString().equals("\"\"")) {
    // Look one character ahead for the third quote of a long string.
    int nextLine = input.getLineNumber();
    int nextColumn = input.getColumnNumber();
    ch = input.read();
    if (ch == '"') {
      return readLongString(token);
    }
    // Push the lookahead back together with its position, so the token
    // that follows the empty string gets the correct begin position.
    backup(ch, nextLine, nextColumn);
  }
  token.image = image.toString();
  token.endLine = line;
  token.endColumn = column;
  return token;
}
// States of the number scanner in readNumber().
// A leading '0' has been read; the next character selects the radix or form.
private static final int STATE_START = 0;
// Inside the digits of a decimal integer.
private static final int STATE_DECIMAL = 1;
// After "0x" / "0X".
private static final int STATE_HEX = 2;
// After "0b" / "0B".
private static final int STATE_BINARY = 3;
// Directly after the '.' that follows the integer part.
private static final int STATE_FRACTION_START = 4;
// Inside the fraction digits.
private static final int STATE_FRACTION = 5;
// Directly after 'e' / 'E'; an optional sign may follow.
private static final int STATE_EXPONENT_START = 6;
// Inside the exponent digits.
private static final int STATE_EXPONENT = 7;
/**
 * Reads an integer literal (decimal, "0x" hexadecimal or "0b" binary) or
 * a floating point literal with optional fraction and exponent.
 *
 * A '.' after the integer part that is followed by another '.' or by an
 * identifier start does not belong to the number: the integer literal is
 * returned with a RANGE ("..") or DOT (".") token chained through
 * <code>next</code>.
 *
 * Malformed literals are reported to the parser; a token is returned
 * regardless.
 *
 * @param ch first digit of the number, already consumed
 * @return an INTEGER_LITERAL or FLOATING_POINT_LITERAL token
 * @throws IOException if reading the input fails
 */
private Token readNumber(int ch) throws IOException
{
  _allowPattern = false;
  Token token = createToken();
  GenericInputStream input = _input;
  StringBuffer image = _image;
  int line;
  int column;
  int state = (ch == '0') ? STATE_START : STATE_DECIMAL;
  int kind = INTEGER_LITERAL;
  // RANGE or DOT token split off from the literal, if any.
  Token chained = null;
  image.setLength(0);
  image.append((char)ch);

  finished:
  for(;;) {
    line = input.getLineNumber();
    column = input.getColumnNumber();
    ch = input.read();
    if (ch == -1) {
      break;
    }
    switch(state) {
    case STATE_START:
      switch(ch) {
      case 'x':
      case 'X':
        state = STATE_HEX;
        break;
      case 'b':
      case 'B':
        state = STATE_BINARY;
        break;
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
        state = STATE_DECIMAL;
        break;
      case '.':
        state = STATE_FRACTION_START;
        kind = FLOATING_POINT_LITERAL;
        break;
      default:
        break finished;
      }
      break;

    case STATE_DECIMAL:
      switch(ch) {
      case '.':
        state = STATE_FRACTION_START;
        kind = FLOATING_POINT_LITERAL;
        break;
      case 'e':
      case 'E':
        state = STATE_EXPONENT_START;
        kind = FLOATING_POINT_LITERAL;
        break;
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
        break;
      default:
        break finished;
      }
      break;

    case STATE_HEX:
      switch(ch) {
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
      case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
      case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
        break;
      default:
        if (Character.isJavaIdentifierPart((char)ch)) {
          _parser.error(_parser.toLocation(line, column),
            "Invalid hexadecimal number");
        }
        break finished;
      }
      break;

    case STATE_BINARY:
      switch(ch) {
      case '0': case '1':
        break;
      default:
        if (Character.isJavaIdentifierPart((char)ch)) {
          _parser.error(_parser.toLocation(line, column),
            "Invalid binary number");
        }
        break finished;
      }
      break;

    case STATE_FRACTION_START:
      if (ch == '.') {
        // "N.." : integer followed by the range operator.
        chained = new Token();
        chained.kind = RANGE;
        chained.image = "..";
        chained.beginLine = line;
        chained.beginColumn = column - 1;
        chained.endLine = line;
        chained.endColumn = column;
        token.next = chained;
        // Read one character past the range so it can be pushed back.
        line = input.getLineNumber();
        column = input.getColumnNumber();
        ch = input.read();
        kind = INTEGER_LITERAL;
        image.setLength(image.length() - 1);
        break finished;
      } else if (Character.isJavaIdentifierStart((char)ch)) {
        // "N.name" : integer followed by the dot operator.
        chained = new Token();
        chained.kind = DOT;
        chained.image = ".";
        chained.beginLine = line;
        chained.beginColumn = column - 1;
        chained.endLine = line;
        chained.endColumn = column - 1;
        token.next = chained;
        kind = INTEGER_LITERAL;
        image.setLength(image.length() - 1);
        break finished;
      }
      state = STATE_FRACTION;
      // fall through

    case STATE_FRACTION:
      switch(ch) {
      case 'E':
      case 'e':
        state = STATE_EXPONENT_START;
        break;
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
        break;
      default:
        break finished;
      }
      break;

    case STATE_EXPONENT_START:
      // A sign is accepted only directly after the 'e'; leaving this
      // state here keeps "1e5-3" from swallowing the '-'.
      state = STATE_EXPONENT;
      if (ch == '-' || ch == '+') {
        image.append((char)ch);
        continue;
      }
      // fall through

    case STATE_EXPONENT:
      switch(ch) {
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
        break;
      default:
        break finished;
      }
    }
    image.append((char)ch);
  }
  backup(ch, line, column);

  switch(state) {
  case STATE_BINARY:
    if (image.length() < 3) {
      _parser.error(_parser.toLocation(line, column),
        "Unexpected end of binary number");
    }
    break;

  case STATE_HEX:
    if (image.length() < 3) {
      _parser.error(_parser.toLocation(line, column),
        "Unexpected end of hexadecimal number");
    }
    break;

  case STATE_FRACTION_START:
  case STATE_FRACTION:
  case STATE_EXPONENT_START:
  case STATE_EXPONENT:
    char last = image.charAt(image.length() - 1);
    if (last == 'e' || last == 'E' || last == '-' || last == '+' || last == '.') {
      _parser.error(_parser.toLocation(line, column),
        "Unexpected end of floating point number");
    }
  }

  token.kind = kind;
  token.image = image.toString();
  if (chained != null) {
    // The literal ends right before the chained operator, not before the
    // character that was pushed back.
    token.endLine = chained.beginLine;
    token.endColumn = chained.beginColumn - 1;
  } else {
    token.endLine = line;
    token.endColumn = column - 1;
  }
  return token;
}
/**
 * Reads a pattern literal: the body up to the closing '/', then any
 * trailing identifier characters (flags). The opening '/' has already
 * been consumed.
 *
 * A '/' inside square brackets or after a backslash does not close the
 * pattern. Line breaks are accepted inside the body.
 *
 * @param ch first character after the opening '/'
 * @return a PATTERN token whose image includes both slashes and the flags
 * @throws IOException if reading the input fails
 */
private Token readPattern(int ch) throws IOException
{
  _allowPattern = false;
  Token pattern = createToken(PATTERN);
  GenericInputStream in = _input;
  StringBuffer text = _image;
  boolean escaped = false;
  boolean inBrackets = false;
  // Becomes true once the closing '/' is seen; flags are read afterwards.
  boolean closed = false;
  int line = in.getLineNumber();
  int column = in.getColumnNumber();
  text.setLength(0);
  text.append('/');

  scan:
  while (true) {
    if (closed) {
      if (ch != -1 && Character.isJavaIdentifierPart((char)ch)) {
        text.append((char)ch);
      } else {
        if (ch != -1) {
          backup(ch, line, column);
        }
        column--;
        break scan;
      }
    } else if (ch == -1) {
      _parser.error(_parser.toLocation(line, column),
        "Unexpected end of pattern");
      break scan;
    } else if (ch == '\\') {
      escaped = !escaped;
      text.append('\\');
    } else {
      if (!escaped) {
        if (ch == '[') {
          inBrackets = true;
        } else if (ch == ']') {
          inBrackets = false;
        } else if (ch == '/' && !inBrackets) {
          closed = true;
        }
      }
      escaped = false;
      text.append((char)ch);
    }
    line = in.getLineNumber();
    column = in.getColumnNumber();
    ch = in.read();
  }

  pattern.image = text.toString();
  pattern.endLine = line;
  pattern.endColumn = column;
  return pattern;
}
/**
 * Reads an operator, a comment or a pattern that starts with the given
 * character, using the longest match.
 *
 * When the character after the operator does not belong to it, that
 * character is stored in <code>_backup</code> without touching the begin
 * position; doGetNextToken() advances the begin column by the length of
 * the returned image in that case.
 *
 * @param ch first character of the operator, already consumed
 * @return the token read, or null if the character starts no operator
 * @throws IOException if reading the input fails
 */
public Token operator(int ch) throws IOException
{
  GenericInputStream input = _input;
  switch(ch) {
  case '!':
    ch = input.read();
    if (ch == '=') {
      ch = input.read();
      if (ch == '=') {
        return createToken(EXACT_NOT_EQUAL, "!==");
      } else {
        _backup = ch;
        return createToken(NOT_EQUAL, "!=");
      }
    } else if (ch == '~') {
      return createToken(NO_MATCH, "!~");
    } else {
      _backup = ch;
      return createToken(NEGATION, "!");
    }

  case '%':
    ch = input.read();
    if (ch == '=') {
      return createToken(ASSIGN_REMAINDER, "%=");
    } else {
      _backup = ch;
      return createToken(REMAINDER, "%");
    }

  case '&':
    ch = input.read();
    if (ch == '&') {
      return createToken(BOOLEAN_AND, "&&");
    } else if (ch == '=') {
      return createToken(ASSIGN_CONCAT, "&=");
    } else {
      _backup = ch;
      return createToken(ET, "&");
    }

  case '*':
    ch = input.read();
    if (ch == '=') {
      return createToken(ASSIGN_MULTIPLY, "*=");
    } else {
      _backup = ch;
      return createToken(STAR, "*");
    }

  case '?':
    ch = input.read();
    if (ch == '?') {
      return createToken(HOOKHOOK, "??");
    } else if (ch == '=') {
      // The image mirrors the characters as they appear in the source.
      return createToken(ASSIGN_INIT, "?=");
    } else {
      _backup = ch;
      return createToken(HOOK, "?");
    }

  case '+':
    ch = input.read();
    if (ch == '+') {
      return createToken(PLUSPLUS, "++");
    } else if (ch == '=') {
      return createToken(ASSIGN_ADD, "+=");
    } else {
      _backup = ch;
      return createToken(PLUS, "+");
    }

  case '-':
    ch = input.read();
    if (ch == '-') {
      return createToken(MINUSMINUS, "--");
    } else if (ch == '=') {
      return createToken(ASSIGN_SUBSTRACT, "-=");
    } else if (ch == '>') {
      return createToken(ARROW, "->");
    } else {
      _backup = ch;
      return createToken(MINUS, "-");
    }

  case '.':
    ch = input.read();
    if (ch == '.') {
      return createToken(RANGE, "..");
    } else {
      _backup = ch;
      return createToken(DOT, ".");
    }

  case '/':
    ch = input.read();
    if (ch == '*') {
      return skipToEndOfComment();
    } else if (ch == '/') {
      return skipToEndOfLineComment("//");
    } else if (ch == '=') {
      return createToken(ASSIGN_DIVIDE, "/=");
    } else if (_allowPattern) {
      return readPattern(ch);
    } else {
      _backup = ch;
      return createToken(SLASH, "/");
    }

  case '<':
    ch = input.read();
    if (ch != '=') {
      _backup = ch;
      return createToken(LESS, "<");
    }
    ch = input.read();
    if (ch == '>') {
      return createToken(COMPARE, "<=>");
    }
    if (ch != '=') {
      _backup = ch;
      return createToken(LESS_OR_EQUAL, "<=");
    }
    ch = input.read();
    if (ch == '>') {
      return createToken(EXACT_COMPARE, "<==>");
    }
    {
      // "<==" without the closing '>': only one character can be pushed
      // back, so return "<=" with "=" chained through next instead of
      // falling into the '=' case and consuming further input.
      _backup = ch;
      Token lessOrEqual = createToken(LESS_OR_EQUAL, "<=");
      _beginColumn += 2;
      lessOrEqual.next = createToken(ASSIGN, "=");
      // Three characters were consumed but the caller advances the begin
      // column by the image length (2) only; leave it one ahead.
      _beginColumn -= 1;
      return lessOrEqual;
    }

  case '=':
    ch = input.read();
    if (ch == '=') {
      ch = input.read();
      if (ch == '=') {
        return createToken(EXACT_EQUAL, "===");
      } else {
        _backup = ch;
        return createToken(EQUAL, "==");
      }
    } else if (ch == '>') {
      return createToken(MAP, "=>");
    } else {
      _backup = ch;
      return createToken(ASSIGN, "=");
    }

  case '>':
    ch = input.read();
    if (ch == '=') {
      return createToken(GREATER_OR_EQUAL, ">=");
    } else {
      _backup = ch;
      return createToken(GREATER, ">");
    }

  case '^':
    ch = input.read();
    if (ch == '^') {
      return createToken(BOOLEAN_XOR, "^^");
    } else {
      _backup = ch;
      return createToken(CARET, "^");
    }

  case '|':
    ch = input.read();
    if (ch == '|') {
      return createToken(BOOLEAN_OR, "||");
    } else {
      _backup = ch;
      return createToken(PIPE, "|");
    }

  default:
    return null;
  }
}
/**
 * Reads a run of whitespace (space, tab, CR, LF and 0xa0) as one token.
 *
 * The begin position is captured before the terminating character is
 * pushed back, the end position is that of the last whitespace character
 * (which may be on a later line), and the pattern flag is left untouched
 * so whitespace does not change how a following '/' is read.
 *
 * @param ch first whitespace character, already consumed
 * @return a WHITESPACE token covering the whole run
 * @throws IOException if reading the input fails
 */
public Token readWhitespace(int ch) throws IOException
{
  GenericInputStream input = _input;
  StringBuffer image = _image;
  // backup() below moves _beginLine/_beginColumn to the next token,
  // so remember where this one starts.
  int beginLine = _beginLine;
  int beginColumn = _beginColumn;
  int endLine = beginLine;
  int endColumn = beginColumn;
  image.setLength(0);
  image.append((char)ch);
  out: for(;;) {
    int line = input.getLineNumber();
    int column = input.getColumnNumber();
    ch = input.read();
    switch(ch) {
    case -1:
      break out;
    case ' ':
    case '\t':
    case '\r':
    case '\n':
    case 0xa0:
      image.append((char)ch);
      endLine = line;
      endColumn = column;
      break;
    default:
      backup(ch, line, column);
      break out;
    }
  }
  Token token = new Token();
  token.kind = WHITESPACE;
  token.image = image.toString();
  token.beginLine = beginLine;
  token.beginColumn = beginColumn;
  token.endLine = endLine;
  token.endColumn = endColumn;
  return token;
}
/**
 * Returns the next token for the parser.
 *
 * Whitespace and comments are skipped unless the tokenizer was created
 * to return ignorable tokens. The text of the most recent documentation
 * comment is attached, cleaned up, to the <code>document</code> field of
 * the first ordinary token that follows it.
 *
 * Skipping is done in a loop rather than by recursion, so long runs of
 * comments cannot exhaust the stack.
 *
 * @return the next token; the end-of-input token when input is exhausted
 */
public Token getNextToken()
{
  for(;;) {
    Token t = doGetNextToken();
    switch(t.kind) {
    case DOC_COMMENT:
      _previous = t;
      // fall through
    case WHITESPACE:
    case COMMENT:
      if (_returnIgnorable) {
        return t;
      }
      // Ignorable token: read the next one.
      break;

    default:
      Token p = _previous;
      if (p != null && p.kind == DOC_COMMENT) {
        t.document = extractDocumentation(p.image);
      }
      _previous = t;
      return t;
    }
  }
}

/**
 * Strips the comment delimiters and the leading whitespace-and-star
 * decoration of each line from a documentation comment.
 *
 * @param comment doc comment image; starts with slash-star-star and is
 *                longer than four characters
 * @return the trimmed documentation text
 */
private String extractDocumentation(String comment)
{
  // An unterminated comment has no closing delimiter to drop.
  int end = comment.length();
  if (comment.endsWith("*/")) {
    end -= 2;
  }
  String doc = comment.substring(3, end);
  StringBuffer buffer = _image;
  buffer.setLength(0);
  int n = doc.length();
  // True while still inside the leading decoration of a line.
  boolean skip = true;
  // Buffer length at the start of the current line.
  int cut = 0;
  for(int i = 0; i < n; i++) {
    char ch = doc.charAt(i);
    buffer.append(ch);
    if (skip) {
      switch(ch) {
      case ' ':
      case '\t':
      case 0xa0:
        break;
      case '\n':
        cut = buffer.length();
        break;
      case '*':
        // Drop the leading blanks and the star itself.
        buffer.setLength(cut);
        skip = false;
        break;
      default:
        skip = false;
      }
    } else if (ch == '\n') {
      skip = true;
      cut = buffer.length();
    }
  }
  return buffer.toString().trim();
}
/**
 * Scans the next raw token, including comments and (when ignorable
 * tokens are returned) whitespace.
 *
 * Starts from the pushed-back character if there is one, otherwise
 * records the current input position as the begin position and reads a
 * fresh character. Invalid characters are reported to the parser and
 * skipped. An I/O failure is reported to the parser and ends the token
 * stream.
 *
 * @return the next token; the end-of-input token at end of input or
 *         after an I/O error
 */
public Token doGetNextToken()
{
  boolean returnIgnorable = _returnIgnorable;
  GenericInputStream input = _input;
  Token t;
  int ch;

  try {

    readMore:
    for(;;) {

      if (_backup != -1) {
        // Begin position was already set when the character was pushed back.
        ch = _backup;
        _backup = -1;
      } else {
        _beginLine = input.getLineNumber();
        _beginColumn = input.getColumnNumber();
        ch = input.read();
      }

      switch(ch) {
      case -1:
        return eof();

      case ' ':
      case '\t':
      case '\r':
      case '\n':
      case 0xa0:
        if (returnIgnorable) {
          return readWhitespace(ch);
        } else {
          continue readMore;
        }

      case '#':
        t = skipToEndOfLineComment("#");
        if (t != null) {
          return t;
        } else {
          continue readMore;
        }

      case '\"':
        return readString('\"');

      case '\'':
        return readString('\'');

      case '`':
        return readString('`');

      case ';':
        return createToken(SEMICOLON, ";");

      case '{':
        return createToken(BEGIN, "{");

      case '}':
        return createTokenNP(END, "}");

      case '(':
        return createToken(OPEN, "(");

      case ')':
        return createTokenNP(CLOSE, ")");

      case '[':
        return createToken(OPEN_BRACKET, "[");

      case ']':
        return createTokenNP(CLOSE_BRACKET, "]");

      case ':':
        return createToken(COLON, ":");

      case '@':
        return createToken(AT, "@");

      case ',':
        return createToken(COMMA, ",");

      case '~':
        return createToken(MATCH, "~");

      case '.':
      case '!':
      case '=':
      case '<':
      case '>':
      case '+':
      case '-':
      case '*':
      case '%':
      case '|':
      case '&':
      case '^':
      case '/':
      case '?':
        t = operator(ch);
        if (t == null) {
          continue readMore;
        } else {
          // operator() pushes its lookahead back without a position;
          // the pushed-back character sits right after the operator.
          if ((_backup != -1) && (t.kind != PATTERN) && (t.kind != COMMENT)) {
            _beginColumn += t.image.length();
          }
          return t;
        }

      case '\\':
        {
          int line = input.getLineNumber();
          int column = input.getColumnNumber();
          ch = input.read();
          if (ch == '{') {
            return createToken(BEGIN_LIST, "\\{");
          } else if (Character.isJavaIdentifierStart((char)ch)) {
            // A backslash followed by a name is shorthand for the quoted name.
            Token token = readSymbol(ch);
            token.kind = STRING_LITERAL;
            token.image = '"' + token.image + '"';
            return token;
          } else {
            // Report at the backslash, then push the lookahead back with
            // its own position so the next token starts in the right column.
            _parser.error(_parser.toLocation(_beginLine, _beginColumn),
              "Invalid use of '\\'");
            backup(ch, line, column);
          }
        }
        break;

      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
        return readNumber((char)ch);

      case '$':
        return readSymbol(0);

      default:
        if (Character.isJavaIdentifierStart((char)ch)) {
          return readSymbol((char)ch);
        } else {
          _parser.error(_parser.toLocation(_beginLine, _beginColumn),
            "Invalid character '"+(char)ch+"' (0x"+Integer.toString(ch, 16)+") on input ");
        }
      }
    }

  } catch (IOException e) {
    _parser.error(_parser.toLocation(input.getLineNumber(),
      input.getColumnNumber()), "IO-error: " + e.getMessage());
    return eof();
  }
}
}