/*
* Copyright (C) 2011 Alasdair C. Hamilton
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package ket.math.convert;
import java.util.regex.*;
import ket.math.*;
import ket.math.purpose.Text;
import ket.math.purpose.VariableToken;
import ket.math.purpose.Word;
import ketUI.Ket;
/**
* Process a given string representation of an equation and convert it into a
* series of tokens given a collection of the symbols that constitute a list of
* functions and variables.
*/
public class Tokenization {

    private static final String WORD_REGEX = "[a-zA-Z][a-zA-Z\\d]*";
    private static final Pattern WORD = Pattern.compile(WORD_REGEX);
    private static final Pattern QUOTES = Pattern.compile("^\"[^\"]*\"");
    private static final Pattern DOUBLE = Pattern.compile("\\d+\\.(?:\\d*)?(?:[eE][+-]?\\d+)?");
    private static final Pattern INTEGER = Pattern.compile("\\d+(?:[eE][+-]?\\d+)?");
    private static final Pattern WHITE_SPACE = Pattern.compile("^\\s+");

    /**
     * Private constructor: this class only exposes static methods and is
     * never instantiated. The parameters are ignored.
     */
    private Tokenization(String equationString, KnownArguments knownArguments) {
        // Not used.
    }

    /**
     * Convert an equation into a series of tokens which are stored inside a
     * branch instance. Numbers, quoted text, words and known symbols are
     * converted to tokens (or, for numbers written with an exponent, to a
     * small branch) while whitespace is discarded. Any character that no
     * rule recognises is kept as a one-character text token.
     * @param equationString A string representation of the equation to be parsed.
     * @param knownArguments The repository of all functions that have been defined at runtime or by the user.
     * @return A branch object, each argument of which is a token or number branch.
     */
    public static Branch tokenize(String equationString, KnownArguments knownArguments) {
        // A trailing space is appended before scanning, as in the original implementation.
        StringBuffer equation = new StringBuffer(equationString + " ");
        Branch tokenList = new Branch();
        while (equation.length() > 0) {
            char nextCharacter = equation.charAt(0);
            if (isAsciiDigit(nextCharacter)) {
                // Only ASCII digits are routed here: the number patterns use
                // \d, which does not match other Unicode decimal digits.
                tokenList.append(readNextNumber(equation)); // Note: Generalization: may return a branch!
            } else if (nextCharacter == '"') {
                String quotedString = readNext(QUOTES, equation);
                if (quotedString != null) {
                    // Strip the surrounding quotation marks.
                    String string = quotedString.substring(1, quotedString.length() - 1);
                    Text text = new Text(string);
                    Token token = new Token(text);
                    tokenList.append(token);
                } else {
                    // Unterminated quote: treat the quote mark as a symbol.
                    Token unknown = readNextSymbol(equation, knownArguments);
                    tokenList.append(unknown);
                }
            } else if (Character.isWhitespace(nextCharacter)) {
                if (readNext(WHITE_SPACE, equation) == null) {
                    // Character.isWhitespace() accepts characters that the
                    // regex class \s does not; drop such a character directly
                    // so that the loop always makes progress.
                    equation.deleteCharAt(0);
                }
            } else if (Character.isLetter(nextCharacter)) {
                String string = readNext(WORD, equation);
                if (string == null) {
                    // A letter outside [a-zA-Z] cannot start a word.
                    Token unknown = readNextSymbol(equation, knownArguments);
                    tokenList.append(unknown);
                    continue;
                }
                Word word = new Word(string);
                Token variable = new VariableToken(word);
                tokenList.append(variable);
            } else {
                Token unknown = readNextSymbol(equation, knownArguments);
                tokenList.append(unknown);
            }
        }
        return tokenList;
    }

    /**
     * Test whether a character is one of the ASCII digits '0' to '9'.
     */
    private static boolean isAsciiDigit(char character) {
        return character >= '0' && character <= '9';
    }

    /**
     * Read the next symbol from the start of equation and encapsulate it in
     * a token object. Each known operand symbol is asked in turn to match
     * the start of the equation (presumably consuming the matched text;
     * confirm against Symbol.matchNextToken). If none matches, the first
     * character is removed from the equation, reported on Ket.out and
     * returned as a text token.
     * @param equation A non-empty string representation of an equation which starts with a symbol.
     * @param knownArguments The source of the set of known operand symbols.
     * @return The token to be returned.
     */
    private static Token readNextSymbol(StringBuffer equation, KnownArguments knownArguments) {
        for (Symbol symbol : knownArguments.getOperandSymbolSet()) {
            // Compare each known symbol against the given string.
            Token token = symbol.matchNextToken(equation);
            if (token != null) {
                return token;
            }
        }
        // Add unknown symbol as a new token.
        Ket.out.println(" --- unknown character --- ");
        Ket.out.println("An unknown character was found while parsing and has been added as a separate block of text. ");
        String firstLetter = equation.substring(0, 1);
        Ket.out.println("first letter = '" + firstLetter + "'.");
        equation.deleteCharAt(0);
        Ket.out.println(" tokenize: #2 /"+firstLetter+"/");
        return new Token(new Text(firstLetter));
    }

    /**
     * Read the text matched by pattern at the very start of equation and
     * remove it from the equation. If the pattern has capturing groups, the
     * content of the last group is returned and everything up to the end of
     * that group is removed; otherwise the whole match is used.
     * @param pattern The regular expression to match at the start of equation.
     * @param equation The remaining, unparsed text; modified on success.
     * @return The matched text, or null if equation does not start with a match.
     */
    private static String readNext(Pattern pattern, StringBuffer equation) {
        Matcher matcher = pattern.matcher(equation);
        // lookingAt() only attempts a match at offset 0, unlike find(), which
        // would scan the whole remaining text before the caller rejects it.
        if (!matcher.lookingAt()) {
            return null;
        }
        int lastGroup = matcher.groupCount();
        String content;
        if (lastGroup > 0) {
            content = matcher.group(lastGroup);
            equation.delete(0, matcher.end(lastGroup));
        } else {
            content = matcher.group();
            equation.delete(0, matcher.end());
        }
        return content;
    }

    /**
     * Read a double or integer pattern from equation.
     * A plain number becomes a single token. A number with an exponent,
     * such as 2e3, becomes the branch TIMES(mantissa, POWER(10, exponent)).
     * @param equation The string representation of an equation to be parsed which begins with a number.
     * @return the next number from the left of equation encapsulated as an integer token, a double token or a branch.
     * @throws IllegalArgumentException if equation does not begin with a number.
     */
    private static Argument readNextNumber(StringBuffer equation) {
        // The double pattern requires a decimal point, so it is tried first.
        String literal = readNext(DOUBLE, equation);
        boolean isDouble = literal != null;
        if (!isDouble) {
            literal = readNext(INTEGER, equation);
            if (literal == null) {
                throw new IllegalArgumentException("Equation does not begin with a number: " + equation);
            }
        }
        String[] parts = literal.split("[eE]");
        if (parts.length != 1 && parts.length != 2) {
            Ket.out.println(" !!! Bug: cannot interpret number format: !!! ");
            Ket.out.println(literal);
            return new VariableToken(new Text(literal));
        }
        Token mantissa = isDouble ? new Token(Double.parseDouble(parts[0])) : integerToken(parts[0]);
        if (parts.length == 1) {
            return mantissa;
        }
        // mantissa E exponent
        Branch power = new Branch(Function.POWER, new Token(10), integerToken(parts[1]));
        return new Branch(Function.TIMES, mantissa, power);
    }

    /**
     * Build a token from a string of digits with an optional sign. The value
     * is stored as an int when it fits; a value too large for an int is
     * stored as a double instead of aborting tokenization with a
     * NumberFormatException.
     */
    private static Token integerToken(String digits) {
        try {
            return new Token(Integer.parseInt(digits));
        } catch (NumberFormatException outOfIntRange) {
            return new Token(Double.parseDouble(digits));
        }
    }
}