// Copyright (C) 2005 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.caja.parser;
import com.google.caja.lexer.JsTokenQueue;
import com.google.caja.lexer.JsTokenType;
import com.google.caja.lexer.Keyword;
import com.google.caja.lexer.ParseException;
import com.google.caja.lexer.Token;
import com.google.caja.reporting.Message;
import com.google.caja.reporting.MessagePart;
import com.google.caja.reporting.MessageQueue;
import com.google.caja.reporting.MessageType;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Identifier, keyword, and escape-decoding routines used by multiple
 * parsers.
 *
 * <p>An instance wraps the token queue being consumed and the message queue
 * that receives diagnostics.  The static helpers ({@link #decodeIdentifier},
 * {@link #isJavascriptIdentifier}, {@link #isQuasiIdentifier}) need no
 * instance.
 *
 * @author mikesamuel@gmail.com
 * @author ihab.awad@gmail.com
 */
public abstract class ParserBase {
  /** Token stream that {@link #parseIdentifier} reads from. */
  protected final JsTokenQueue tq;
  /** Receives the reserved-word diagnostics raised while parsing. */
  protected final MessageQueue mq;
  /**
   * True if {@link #isIdentifier} should also accept the quasiliteral
   * identifier forms ({@code @name}, {@code @name*}, ...).
   */
  protected final boolean isQuasiliteral;

  /**
   * Creates a parser base that does not accept quasiliteral identifiers.
   *
   * @param tq the tokens to parse.
   * @param mq receives diagnostics.
   */
  public ParserBase(JsTokenQueue tq, MessageQueue mq) {
    this(tq, mq, false);
  }

  /**
   * @param tq the tokens to parse.
   * @param mq receives diagnostics.
   * @param isQuasiliteral true to accept quasiliteral identifier syntax in
   *     {@link #isIdentifier}, and therefore in {@link #parseIdentifier}.
   */
  public ParserBase(JsTokenQueue tq, MessageQueue mq, boolean isQuasiliteral) {
    this.tq = tq;
    this.mq = mq;
    this.isQuasiliteral = isQuasiliteral;
  }

  /** The token queue passed to the constructor. */
  public JsTokenQueue getTokenQueue() { return tq; }

  /** The message queue passed to the constructor. */
  public MessageQueue getMessageQueue() { return mq; }

  /**
   * Consumes the token at the front of the queue as an identifier and returns
   * its text with unicode escapes decoded.
   *
   * <p>The token is only consumed on success; when a {@link ParseException}
   * is thrown the queue has not been advanced.
   *
   * @param allowReservedWords if false, a
   *     {@code RESERVED_WORD_USED_AS_IDENTIFIER} message is added to the
   *     message queue when the token is a keyword, or is a word whose decoded
   *     text maps to a {@link Keyword}.  The message does not stop the parse;
   *     the identifier is still consumed and returned.
   * @return the decoded identifier text, per {@link #decodeIdentifier}.
   * @throws ParseException if the token is a word that fails
   *     {@link #isIdentifier}, or is neither a word nor a keyword.
   */
  public String parseIdentifier(boolean allowReservedWords)
      throws ParseException {
    Token<JsTokenType> t = tq.peek();
    String s = t.text;
    switch (t.type) {
      case WORD:
        if (!allowReservedWords) {
          // Check the decoded form so that a keyword spelled with escapes
          // is still reported.
          Keyword k = Keyword.fromString(decodeIdentifier(s));
          if (null != k) {
            mq.addMessage(MessageType.RESERVED_WORD_USED_AS_IDENTIFIER,
                          tq.currentPosition(), k);
          }
        }
        if (!isIdentifier(s)) {
          throw new ParseException(
              new Message(MessageType.INVALID_IDENTIFIER,
                          tq.currentPosition(),
                          MessagePart.Factory.valueOf(s)));
        }
        break;
      case KEYWORD:
        if (!allowReservedWords) {
          mq.addMessage(MessageType.RESERVED_WORD_USED_AS_IDENTIFIER,
                        tq.currentPosition(), Keyword.fromString(s));
        }
        break;
      default:
        throw new ParseException(
            new Message(MessageType.EXPECTED_TOKEN, tq.currentPosition(),
                        MessagePart.Factory.valueOf("an identifier"),
                        MessagePart.Factory.valueOf(s)));
    }
    tq.advance();
    return decodeIdentifier(s);
  }

  /**
   * Matches the javascript IdentifierOrKeyword production.  The related
   * {@link #QUASI_IDENTIFIER_OR_KEYWORD_RE} adds the extensions for
   * quasiliteral syntax.
   * <p>From section 7.6 of EcmaScript 262 Edition 3 (ES3), currently found at
   * http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
   * and based on http://www.erights.org/elang/grammar/quasi-overview.html
   * <pre>
   * <b>QuasiIdentifierOrKeyword</b> ->
   *     IdentifierOrKeyword
   *   | QuasiliteralBegin IdentifierOrKeyword OptQuasiliteralQuantifier
   * <b>IdentifierOrKeyword</b> ->
   *     IdentifierName (but not Keyword)
   * <b>IdentifierName</b> ->
   *     IdentifierStart
   *   | IdentifierName IdentifierPart
   * <b>IdentifierStart</b> ->
   *     UnicodeLetter | $ | _ | \ UnicodeEscapeSequence
   * <b>IdentifierPart</b> ->
   *     IdentifierStart | UnicodeCombiningMark | UnicodeDigit
   *   | UnicodeConnectorPunctuation | \ UnicodeEscapeSequence
   * <b>UnicodeLetter</b> ->
   *     any character in the Unicode categories "Uppercase letter
   *     (Lu)", "Lowercase letter (Ll)", "Titlecase letter (Lt)",
   *     "Modifier letter (Lm)", "Other letter (Lo)", or "Letter
   *     number (Nl)".
   * <b>UnicodeCombiningMark</b> ->
   *     any character in the Unicode categories "Non-spacing mark (Mn)"
   *     or "Combining spacing mark (Mc)"
   * <b>UnicodeDigit</b> ->
   *     any character in the Unicode category "Decimal number (Nd)"
   * <b>UnicodeConnectorPunctuation</b> ->
   *     any character in the Unicode category "Connector punctuation (Pc)"
   * <b>UnicodeEscapeSequence</b> ->
   *     u HexDigit HexDigit HexDigit HexDigit
   * <b>HexDigit</b> ->
   *     0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a
   *   | b | c | d | e | f | A | B | C | D | E | F
   * <b>QuasiliteralBegin</b> ->
   *     '@'
   * <b>OptQuasiliteralQuantifier</b> ->
   *     ε
   *   | '*'
   *   | '+'
   *   | '?'
   * </pre>
   * A <i>UnicodeEscapeSequence</i> cannot be used to put a character
   * into an identifier that would otherwise be illegal.
   *
   * <p>The compiled patterns contain no escape syntax and no keyword
   * exclusion: callers in this class decode escapes with
   * {@link #decodeIdentifier} before matching, and keywords are detected
   * separately in {@link #parseIdentifier}.
   */
  private static final Pattern IDENTIFIER_OR_KEYWORD_RE;
  /**
   * Matches either a plain IdentifierOrKeyword or one prefixed with
   * {@code @} and optionally suffixed with one of {@code * + ?}.
   */
  private static final Pattern QUASI_IDENTIFIER_OR_KEYWORD_RE;
  /** Matches one unicode escape; the 4 hex digits are captured in group 1. */
  private static final Pattern UNICODE_ESCAPE;

  static {
    String hexDigit = "[0-9a-fA-F]";
    String letter = "\\p{javaLetter}";
    String letterOrDigit = "\\p{javaLetterOrDigit}";
    String combinerOrConnector = "\\p{Mn}\\p{Mc}\\p{Pc}";
    String identifierStart = "[" + letter + "$_]";
    String identifierPart = "[" + letterOrDigit + combinerOrConnector + "$_]";
    String quasiliteralBegin = "@";
    String optQuasiliteralQuantifier = "[\\+\\*\\?]?";

    String identifierOrKeyword = identifierStart + identifierPart + "*";
    IDENTIFIER_OR_KEYWORD_RE = Pattern.compile("^" + identifierOrKeyword + "$");

    String quasiIdentifierOrKeyword = (
        "(?:" + identifierOrKeyword + ")"
        + "|"
        + "(?:" + (
            quasiliteralBegin
            + identifierOrKeyword
            + optQuasiliteralQuantifier)
        + ")");
    // The alternation must be grouped before anchoring.  Alternation has the
    // lowest precedence, so without the group the "^" would belong only to
    // the plain-identifier branch and the "$" only to the "@" branch.
    QUASI_IDENTIFIER_OR_KEYWORD_RE = Pattern.compile(
        "^(?:" + quasiIdentifierOrKeyword + ")$");

    UNICODE_ESCAPE = Pattern.compile("\\\\u(" + hexDigit + "{4})");
  }

  /**
   * True if s, once its unicode escapes are decoded, is entirely matched by
   * the IdentifierOrKeyword pattern and s passes
   * {@code Normalizer.isNormalized}.
   *
   * @param s identifier text, possibly containing unicode escapes.
   */
  public static boolean isJavascriptIdentifier(String s) {
    return isValidIdentifier(IDENTIFIER_OR_KEYWORD_RE, s);
  }

  /**
   * Like {@link #isJavascriptIdentifier} but also accepts the quasiliteral
   * forms: a leading {@code @} and an optional trailing {@code *},
   * {@code +} or {@code ?}.
   *
   * @param s identifier text, possibly containing unicode escapes.
   */
  public static boolean isQuasiIdentifier(String s) {
    return isValidIdentifier(QUASI_IDENTIFIER_OR_KEYWORD_RE, s);
  }

  /**
   * Applies {@link #isQuasiIdentifier} if this parser was constructed with
   * {@code isQuasiliteral} set, and {@link #isJavascriptIdentifier}
   * otherwise.
   *
   * @param s identifier text, possibly containing unicode escapes.
   */
  public boolean isIdentifier(String s) {
    return isQuasiliteral ? isQuasiIdentifier(s) : isJavascriptIdentifier(s);
  }

  /**
   * Shared implementation of the identifier predicates: the decoded form of
   * s must be matched in full by grammar, and s must be normalized.
   */
  private static boolean isValidIdentifier(Pattern grammar, String s) {
    // NOTE(review): the normalization check is applied to the raw text s, not
    // to the decoded identifier that the pattern is matched against.  Confirm
    // whether escapes that decode to an unnormalized sequence are meant to
    // pass this check.
    return grammar.matcher(decodeIdentifier(s)).matches()
        && Normalizer.isNormalized(s);
  }

  /**
   * Decodes escapes in an identifier to their literal codepoints so that
   * identifiers can be compared for equality via string equality of their
   * values.
   *
   * <p>Only escapes of the form backslash, lower-case u, 4 hex digits are
   * replaced.  Any other text, including a backslash that does not start
   * such an escape, is copied through unchanged.
   *
   * @param identifier the raw identifier text.
   * @return the identifier with each escape replaced by the UTF-16 code unit
   *     it names; the same instance if it contains no backslash.
   */
  public static String decodeIdentifier(String identifier) {
    // TODO(mikesamuel): is this true?
    // Javascript identifiers use a different escaping scheme from strings.
    // Specifically, \Uxxxxxxxx escapes handle extended unicode.  There are
    // 8 hex digits allowed even though extended unicode can't use more than
    // 6 of those.
    // (The code below does not handle that 8 digit form.)

    // Fast path: nothing to decode.
    if (identifier.indexOf('\\') < 0) { return identifier; }
    // Matcher.appendReplacement is used with a StringBuffer here.
    StringBuffer sb = new StringBuffer();
    Matcher m = UNICODE_ESCAPE.matcher(identifier);
    while (m.find()) {
      // Copy the text before the escape and drop the escape itself, then
      // append the decoded code unit directly so that it is never
      // interpreted as replacement syntax.
      m.appendReplacement(sb, "");
      sb.append((char) Integer.parseInt(m.group(1), 16));
    }
    m.appendTail(sb);
    return sb.toString();
  }
}