// Source Code of com.google.caja.lexer.HtmlInputSplitter

// Copyright (C) 2005 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


package com.google.caja.lexer;


import com.google.caja.SomethingWidgyHappenedError;
import com.google.caja.util.Strings;


import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;


/**
 * A flexible lexer for html, gxp, and related document types.
 *
 * @author mikesamuel@gmail.com
 */
public final class HtmlLexer extends AbstractTokenStream<HtmlTokenType> {
  /** The lower-level token stream whose tokens this lexer merges and reclassifies. */
  private final HtmlInputSplitter splitter;
  /** Where the attribute-recognition state machine currently is; starts outside any tag. */
  private State state = State.OUTSIDE_TAG;


  /**
   * Creates a lexer over the given characters by wrapping them in a new
   * {@link HtmlInputSplitter}.
   *
   * @param p the source of characters handed to the splitter.
   */
  public HtmlLexer(CharProducer p) {
    this.splitter = new HtmlInputSplitter(p);
  }


  /**
   * True iff this is treated as xml.  Xml-ness affects the treatment of
   * script tags, which must be CDATA or html-escaped in GXPs and other xml
   * types, but are specially handled by html-parsers.
   *
   * @return the xml-ness flag held by the underlying splitter.
   */
  public boolean getTreatedAsXml() {
    return splitter.getTreatedAsXml();
  }


  /**
   * Sets whether the input is treated as xml by forwarding the flag to the
   * underlying splitter.
   *
   * @param asXml true to treat the input as xml.
   * @see #getTreatedAsXml
   */
  public void setTreatedAsXml(boolean asXml) {
    splitter.setTreatedAsXml(asXml);
  }


  /**
   * An fsm that lets us reclassify text tokens inside tags as attribute
   * names/values
   */
  private static enum State {
    /** Not inside a tag: the initial state, restored when a tag end is seen. */
    OUTSIDE_TAG,
    /** Inside a tag with no attribute name pending. */
    IN_TAG,
    /** The last token produced inside the tag was an attribute name. */
    SAW_NAME,
    /** An '=' followed an attribute name, so a value is expected next. */
    SAW_EQ,
    ;
  }


  /**
   * Makes sure that this.token contains a token if one is available.
   * This may require fetching and combining multiple tokens from the underlying
   * splitter.
   */
  @Override
  protected Token<HtmlTokenType> produce() throws ParseException {
    Token<HtmlTokenType> token = readToken();
    if (token == null) { return null; }


    switch (token.type) {


      // Keep track of whether we're inside a tag or not.
      case TAGBEGIN:
        state = State.IN_TAG;
        break;
      case TAGEND:
        if (state == State.SAW_EQ && HtmlTokenType.TAGEND == token.type
            && !getTreatedAsXml()) {
          // Distinguish <input type=checkbox checked=> from
          // <input type=checkbox checked>
          pushbackToken(token);
          state = State.IN_TAG;
          return Token.instance("", HtmlTokenType.ATTRVALUE,
                                FilePosition.startOf(token.pos));
        }


        state = State.OUTSIDE_TAG;
        break;


      // Drop ignorable tokens by zeroing out the one received and recursing
      case IGNORABLE:
        return produce();


      // collapse adjacent text nodes if we're outside a tag, or otherwise,
      // Recognize attribute names and values.
      default:
        switch (state) {
          case OUTSIDE_TAG:
            if (HtmlTokenType.TEXT == token.type
                || HtmlTokenType.UNESCAPED == token.type) {
              token = collapseSubsequent(token);
            }
            break;
          case IN_TAG:
            if (HtmlTokenType.TEXT == token.type && !"=".equals(token.text)) {
              // Reclassify as attribute name
              token = HtmlInputSplitter.reclassify(
                  token, HtmlTokenType.ATTRNAME);
              state = State.SAW_NAME;
            }
            break;
          case SAW_NAME:
            if (HtmlTokenType.TEXT == token.type) {
              if ("=".equals(token.text)) {
                state = State.SAW_EQ;
                // Skip the '=' token
                return produce();
              } else {
                // Reclassify as attribute name
                token = HtmlInputSplitter.reclassify(
                    token, HtmlTokenType.ATTRNAME);
              }
            } else {
              state = State.IN_TAG;
            }
            break;
          case SAW_EQ:
            if (HtmlTokenType.TEXT == token.type
                || HtmlTokenType.QSTRING == token.type) {
              if (HtmlTokenType.TEXT == token.type) {
                // Collapse adjacent text nodes to properly handle
                //   <a onclick=this.clicked=true>
                //   <a title=foo bar>
                token = collapseAttributeName(token);
              }
              // Reclassify as value
              token = HtmlInputSplitter.reclassify(
                  token, HtmlTokenType.ATTRVALUE);
              state = State.IN_TAG;
            }
            break;
        }
        break;
    }


    return token;
  }


  /**
   * Collapses all the following tokens of the same type into this.token.
   */
  private Token<HtmlTokenType> collapseSubsequent(Token<HtmlTokenType> token)
      throws ParseException {
    Token<HtmlTokenType> collapsed = token;
    for (Token<HtmlTokenType> next;
         (next= peekToken(0)) != null && next.type == token.type;
         readToken()) {
      collapsed = join(collapsed, next);
    }
    return collapsed;
  }


  private Token<HtmlTokenType> collapseAttributeName(Token<HtmlTokenType> token)
      throws ParseException {
    if (getTreatedAsXml()) { return token; }
    // We want to collapse tokens into the value that are not parts of an
    // attribute value.  We should include any space or text adjacent to the
    // value, but should stop at any of the following constructions:
    //   space end-of-file              e.g. name=foo_
    //   space valueless-attrib-name    e.g. name=foo checked
    //   space tag-end                  e.g. name=foo />
    //   space text space? '='          e.g. name=foo bar=
    int nToMerge = 0;
    for (Token<HtmlTokenType> t; (t = peekToken(nToMerge)) != null;) {
      if (t.type == HtmlTokenType.IGNORABLE) {
        Token<HtmlTokenType> text = peekToken(nToMerge + 1);
        if (text == null) { break; }
        if (text.type != HtmlTokenType.TEXT) { break; }
        if (isValuelessAttribute(text.text)) { break; }
        Token<HtmlTokenType> eq = peekToken(nToMerge + 2);
        if (eq != null && eq.type == HtmlTokenType.IGNORABLE) {
          eq = peekToken(nToMerge + 3);
        }
        if (eq == null || "=".equals(eq.text)) { break; }
      } else if (t.type != HtmlTokenType.TEXT) {
        break;
      }
      ++nToMerge;
    }
    if (nToMerge == 0) { return token; }
    StringBuilder sb = new StringBuilder(token.text);
    Token<HtmlTokenType> t;
    do {
      t = readToken();
      sb.append(t.text);
    } while (--nToMerge > 0);
    return Token.instance(
        sb.toString(), HtmlTokenType.TEXT, FilePosition.span(token.pos, t.pos));
  }


  private static Token<HtmlTokenType> join(
      Token<HtmlTokenType> a, Token<HtmlTokenType> b) {
    return Token.instance(
        a.text + b.text, a.type, FilePosition.span(a.pos, b.pos));
  }


  /**
   * Raw tokens already pulled from the splitter but not yet consumed, oldest
   * first.  Filled by peekToken and pushbackToken, drained by readToken.
   */
  private final LinkedList<Token<HtmlTokenType>> lookahead
      = new LinkedList<Token<HtmlTokenType>>();
  private Token<HtmlTokenType> readToken() throws ParseException {
    if (!lookahead.isEmpty()) {
      return lookahead.remove();
    } else if (splitter.hasNext()) {
      return splitter.next();
    } else {
      return null;
    }
  }


  private Token<HtmlTokenType> peekToken(int i) throws ParseException {
    while (lookahead.size() <= i && splitter.hasNext()) {
      lookahead.add(splitter.next());
    }
    return lookahead.size() > i ? lookahead.get(i) : null;
  }


  /**
   * Returns a token to the front of the lookahead buffer so that it is the
   * next token handed out by readToken.
   */
  private void pushbackToken(Token<HtmlTokenType> token) {
    lookahead.addFirst(token);
  }


  /** Can the attribute appear in HTML without a value. */
  private static boolean isValuelessAttribute(String attribName) {
    boolean valueless = VALUELESS_ATTRIB_NAMES.contains(
        Strings.lower(attribName));
    return valueless;
  }


  // Lower-case names of the attributes that may appear in HTML without a
  // value; consulted by isValuelessAttribute, which lower-cases its argument.
  // From http://issues.apache.org/jira/browse/XALANC-519
  private static final Set<String> VALUELESS_ATTRIB_NAMES = new HashSet<String>(
      Arrays.asList("checked", "compact", "declare", "defer", "disabled",
                    "ismap", "multiple", "nohref", "noresize", "noshade",
                    "nowrap", "readonly", "selected"));
}


/**
 * A token stream that breaks a character stream into <tt>
 * HtmlTokenType.{TEXT,UNESCAPED,TAGBEGIN,TAGEND,QSTRING,IGNORABLE,COMMENT,
 * CDATA,DIRECTIVE,SERVERCODE,IE_DR_COMMENT_BEGIN,IE_DR_COMMENT_END}</tt>
 * tokens.  The matching of attribute names and values is done in a later step.
 */
final class HtmlInputSplitter extends AbstractTokenStream<HtmlTokenType> {
  /**
   * Should the input be considered xml?  When true, produce() never starts an
   * escape exempt block.
   */
  private boolean asXml = false;


  /** The source of HTML character data. */
  private final CharProducer p;
  /**
   * True iff the current character is inside a tag.  Set by parseToken when a
   * tag begins outside an escape exempt block, and cleared at the tag end.
   */
  private boolean inTag;
  /**
   * True if inside a script, xmp, listing, or similar tag whose content does
   * not follow the normal escaping rules.
   */
  private boolean inEscapeExemptBlock;


  /**
   * Null or the name of the close tag required to end the current escape exempt
   * block.
   * Preformatted tags include &lt;script&gt;, &lt;xmp&gt;, etc. that may
   * contain unescaped HTML input.
   */
  private String escapeExemptTagName = null;


  /**
   * The escaping mode of the tag named by escapeExemptTagName.  Assigned
   * together with that name in produce() and reset to null by parseToken when
   * the matching close tag is found.
   */
  private HtmlTextEscapingMode textEscapingMode;


  /**
   * Creates a splitter that reads its input from the given producer.
   *
   * @param p the source of HTML character data.
   */
  public HtmlInputSplitter(CharProducer p) {
    this.p = p;
  }


  /**
   * True iff this is treated as xml.  Xml-ness affects the treatment of
   * script tags, which must be CDATA or HTML-escaped in GXPs and other xml
   * types, but are specially handled by HTML parsers.
   *
   * @return the current value of the xml-ness flag; false unless it has been
   *     set via {@link #setTreatedAsXml}.
   */
  public boolean getTreatedAsXml() {
    return this.asXml;
  }


  /**
   * Sets whether the input is treated as xml.
   *
   * @param asXml true to treat the input as xml.
   * @see #getTreatedAsXml
   */
  public void setTreatedAsXml(boolean asXml) {
    this.asXml = asXml;
  }


  /**
   * Make sure that there is a token ready to yield in this.token.
   */
  @Override
  protected Token<HtmlTokenType> produce() {
    Token<HtmlTokenType> token = parseToken();
    if (null == token) { return null; }


    // Handle escape-exempt blocks.
    // The parse() method is only dimly aware of escape-excempt blocks, so
    // here we detect the beginning and ends of escape exempt blocks, and
    // reclassify as UNESCAPED, any tokens that appear in the middle.
    if (inEscapeExemptBlock) {
      if (token.type != HtmlTokenType.SERVERCODE) {
        // classify RCDATA as text since it can contain entities
        token = reclassify(
            token, (this.textEscapingMode == HtmlTextEscapingMode.RCDATA
                    ? HtmlTokenType.TEXT
                    : HtmlTokenType.UNESCAPED));
      }
    } else if (!asXml) {
      switch (token.type) {
        case TAGBEGIN:
          {
            String canonTagName = name(token.text.substring(1));
            if (HtmlTextEscapingMode.isTagFollowedByLiteralContent(
                    canonTagName)) {
              this.escapeExemptTagName = canonTagName;
              this.textEscapingMode = HtmlTextEscapingMode.getModeForTag(
                  canonTagName);
            }
            break;
          }
        case TAGEND:
          this.inEscapeExemptBlock = null != this.escapeExemptTagName;
          break;
        default:
          break;
      }
    }
    return token;
  }


  /**
   * States for a state machine for optimistically identifying tags and other
   * html/xml/phpish structures.
   */
  private static enum State {
    TAGNAME,              // Reading the name of an open or close tag
    SLASH,                // Seen </
    BANG,                 // Seen <! outside an escape exempt block
    CDATA,                // Inside a <![ section (only entered for xml)
    CDATA_SQ_1,           // Seen ] inside a CDATA section
    CDATA_SQ_2,           // Seen ]] inside a CDATA section
    BANG_DASH,            // Seen <!-
    COMMENT,              // Inside a comment
    COMMENT_DASH,         // Seen - inside a comment
    COMMENT_DASH_DASH,    // Seen -- inside a comment
    IE_DH_COMMENT_BEGIN,  // Inside a comment after an "[if " marker
    IE_DH_COMMENT_END,    // "[if " markers balanced; waiting for >
    IE_DR_COMMENT_BEGIN,  // Seen <![if ; waiting for >
    IE_DR_COMMENT_END,    // Seen <![endif]> ; waiting for its >
    DIRECTIVE,            // Inside any other <! construct; waiting for >
    DONE,                 // The token is complete
    APP_DIRECTIVE,        // Inside <? ... ; waiting for ?>
    APP_DIRECTIVE_QMARK,  // Seen ? inside <? ...
    SERVER_CODE,          // Inside <% ... ; waiting for %>
    SERVER_CODE_PCT,      // Seen % inside <% ...


    // From HTML 5 section 8.1.2.6


    // The text in CDATA and RCDATA elements must not contain any
    // occurrences of the string "</" followed by characters that
    // case-insensitively match the tag name of the element followed
    // by one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
    // U+000B LINE TABULATION, U+000C FORM FEED (FF), U+0020 SPACE,
    // U+003E GREATER-THAN SIGN (>), or U+002F SOLIDUS (/), unless
    // that string is part of an escaping text span.


    // An escaping text span is a span of text (in CDATA and RCDATA
    // elements) and character entity references (in RCDATA elements)
    // that starts with an escaping text span start that is not itself
    // in an escaping text span, and ends at the next escaping text
    // span end.


    // An escaping text span start is a part of text that consists of
    // the four character sequence "<!--".


    // An escaping text span end is a part of text that consists of
    // the three character sequence "-->".


    // An escaping text span start may share its U+002D HYPHEN-MINUS characters
    // with its corresponding escaping text span end.
    UNESCAPED_LT_BANG,             // <!
    UNESCAPED_LT_BANG_DASH,        // <!-
    ESCAPING_TEXT_SPAN,            // Inside an escaping text span
    ESCAPING_TEXT_SPAN_DASH,       // Seen - inside an escaping text span
    ESCAPING_TEXT_SPAN_DASH_DASH,  // Seen -- inside an escaping text span
    ;
  }


  /**
   * Text of the most recent token from parseToken that was not IGNORABLE, or
   * null before the first such token.  Used to tell whether a quote or a "/>"
   * directly follows an '='.
   */
  private String lastNonIgnorable = null;


  /**
   * Breaks the character stream into tokens.
   * This method returns a stream of tokens such that each token starts where
   * the last token ended.
   *
   * <p>This property is useful as it allows fetch to collapse and reclassify
   * ranges of tokens based on state that is easy to maintain there.
   *
   * <p>Later passes are responsible for throwing away useless tokens.
   *
   * <p>Scanning reads, and may update, {@code inTag},
   * {@code inEscapeExemptBlock}, {@code escapeExemptTagName},
   * {@code textEscapingMode} and {@code lastNonIgnorable}.
   *
   * @return the next token, or null when the producer's offset has reached its
   *     limit.
   */
  private Token<HtmlTokenType> parseToken() {
    // TODO(mikesamuel): rewrite with a transition table or just use ANTLR
    int start = p.getOffset();
    int limit = p.getLimit();
    if (start == limit) { return null; }


    char[] buffer = p.getBuffer();


    // The token covers [start, end).  It always contains at least one
    // character, so every call makes progress.
    int end = start + 1;
    HtmlTokenType type;


    char ch = buffer[start];
    if (inTag) {
      // Inside a tag: split into tag ends, '=', quoted strings, runs of other
      // text and runs of whitespace.
      if ('>' == ch) {
        type = HtmlTokenType.TAGEND;
        inTag = false;
      } else if ('/' == ch) {
        // "/>" ends the tag; a lone '/' is ordinary text.
        if (end != limit && '>' == buffer[end]) {
          type = HtmlTokenType.TAGEND;
          inTag = false;
          ++end;
        } else {
          type = HtmlTokenType.TEXT;
        }
      } else if ('=' == ch) {
        type = HtmlTokenType.TEXT;
      } else if ('"' == ch || '\'' == ch) {
        // A quote only opens a quoted string directly after an '='; anywhere
        // else it is a one-character text token.
        if ("=".equals(lastNonIgnorable)) {
          type = HtmlTokenType.QSTRING;
          int delim = ch;
          // Scan to the matching quote; an unterminated string runs to the
          // end of the input.
          for (; end < limit; ++end) {
            if (buffer[end] == delim) {
              ++end;
              break;
            }
          }
        } else {
          type = HtmlTokenType.TEXT;
        }
      } else if (!Character.isWhitespace(ch)) {
        type = HtmlTokenType.TEXT;
        for (; end < limit; ++end) {
          ch = buffer[end];
          // End a text chunk before />
          // (unless the chunk is a value directly following an '=').
          if (!"=".equals(lastNonIgnorable) && '/' == ch && end + 1 < limit
              && '>' == buffer[end + 1]) {
            break;
          } else if ('>' == ch || '=' == ch
                     || Character.isWhitespace(ch)) {
            break;
          } else if ('"' == ch || '\'' == ch) {
            // A quote followed by whitespace, '>' or '/' closes the chunk and
            // is included in it.
            if (end + 1 < limit) {
              char ch2 = buffer[end + 1];
              // (ch2 >= 0 always holds for a char.)
              if (ch2 >= 0 && Character.isWhitespace(ch2)
                  || ch2 == '>' || ch2 == '/') {
                ++end;
                break;
              }
            }
          }
        }
      } else {
        // We skip whitespace tokens inside tag bodies.
        type = HtmlTokenType.IGNORABLE;
        while (end < limit && Character.isWhitespace(buffer[end])) {
          ++end;
        }
      }
    } else {
      if (ch == '<') {
        if (end == limit) {
          // A '<' that is the last character of the input is plain text.
          type = HtmlTokenType.TEXT;
        } else {
          // Pick the initial state from the character following the '<'.
          // A null state with a null type means "plain text", handled below.
          ch = buffer[end];
          type = null;
          State state = null;
          switch (ch) {
            case '/':  // close tag?
              state = State.SLASH;
              ++end;
              break;
            case '!':  // Comment or declaration
              if (!this.inEscapeExemptBlock) {
                state = State.BANG;
              } else if (HtmlTextEscapingMode.allowsEscapingTextSpan(
                             escapeExemptTagName)) {
                // Directives, and cdata suppressed in escape
                // exempt mode as they could obscure the close of the
                // escape exempt block, but comments are similar to escaping
                // text spans, and are significant in all CDATA and RCDATA
                // blocks except those inside <xmp> tags.
                // See "Escaping text spans" in section 8.1.2.6 of HTML5.
                // http://www.w3.org/html/wg/html5/#cdata-rcdata-restrictions
                state = State.UNESCAPED_LT_BANG;
              }
              ++end;
              break;
            case '?':
              if (!this.inEscapeExemptBlock) {
                state = State.APP_DIRECTIVE;
              }
              ++end;
              break;
            case '%':
              // Server code is recognized even inside escape exempt blocks.
              state = State.SERVER_CODE;
              ++end;
              break;
            default:
              if (isIdentStart(ch) && !this.inEscapeExemptBlock) {
                state = State.TAGNAME;
                ++end;
              } else if ('<' == ch) {
                // "<<": emit the first '<' alone as text so that the second
                // one starts the next token.
                type = HtmlTokenType.TEXT;
              } else {
                ++end;
              }
              break;
          }
          if (null != state) {
            // Number of open IE downlevel-hidden begin markers seen.
            int ieDhCommentsBeginMarkersSeen = 0;
            // Run the state machine one character at a time until the token
            // is complete or the input runs out.
            charloop:
            while (end < limit) {
              ch = buffer[end];
              switch (state) {
                case TAGNAME:
                  if (Character.isWhitespace(ch)
                      || '>' == ch || '/' == ch || '<' == ch) {
                    // End processing of an escape exempt block when we see
                    // a corresponding end tag.
                    if (this.inEscapeExemptBlock && '/' == buffer[start + 1]
                        && textEscapingMode != HtmlTextEscapingMode.PLAIN_TEXT
                        && name(start + 2, end).equals(escapeExemptTagName)) {
                      this.inEscapeExemptBlock = false;
                      this.escapeExemptTagName = null;
                      this.textEscapingMode = null;
                    }
                    type = HtmlTokenType.TAGBEGIN;
                    // Don't process content as attributes if we're inside
                    // an escape exempt block.
                    inTag = !this.inEscapeExemptBlock;
                    state = State.DONE;
                    // The delimiter is left for the next token.
                    break charloop;
                  }
                  break;
                case SLASH:
                  if (Character.isLetter(ch)) {
                    state = State.TAGNAME;
                  } else {
                    // "</" not followed by a letter is text.  A following '<'
                    // is left to start the next token.
                    if ('<' == ch) {
                      type = HtmlTokenType.TEXT;
                    } else {
                      ++end;
                    }
                    break charloop;
                  }
                  break;
                case BANG:
                  if ('[' == ch && asXml) {
                    state = State.CDATA;
                  } else if ('-' == ch) {
                    state = State.BANG_DASH;
                  } else if (!asXml && lookahead(buffer, end, limit, "[if ")) {
                    state = State.IE_DR_COMMENT_BEGIN;
                  } else if (!asXml &&
                             lookahead(buffer, end, limit, "[endif]>")) {
                    state = State.IE_DR_COMMENT_END;
                  } else {
                    state = State.DIRECTIVE;
                  }
                  break;
                case CDATA:
                  if (']' == ch) { state = State.CDATA_SQ_1; }
                  break;
                case CDATA_SQ_1:
                  if (']' == ch) {
                    state = State.CDATA_SQ_2;
                  } else {
                    state = State.CDATA;
                  }
                  break;
                case CDATA_SQ_2:
                  if ('>' == ch) {
                    type = HtmlTokenType.CDATA;
                    state = State.DONE;
                  } else if (']' != ch) {
                    state = State.CDATA;
                  }
                  break;
                case BANG_DASH:
                  if ('-' == ch) {
                    state = State.COMMENT;
                  } else {
                    state = State.DIRECTIVE;
                  }
                  break;
                case COMMENT:
                  if ('-' == ch) {
                    state = State.COMMENT_DASH;
                  } else if (!asXml && lookahead(buffer, end, limit, "[if ")) {
                    ++ieDhCommentsBeginMarkersSeen;
                    state = State.IE_DH_COMMENT_BEGIN;
                  }
                  break;
                case COMMENT_DASH:
                  // NOTE(review): a non-dash character stays in COMMENT_DASH
                  // instead of returning to COMMENT, so once one dash has
                  // been seen a later "->" ends the comment, as in
                  // "<!-- a - b ->".  Confirm whether this leniency is
                  // intended before tightening it.
                  state = ('-' == ch)
                      ? State.COMMENT_DASH_DASH
                      : State.COMMENT_DASH;
                  break;
                case COMMENT_DASH_DASH:
                  if ('>' == ch) {
                    state = State.DONE;
                    type = HtmlTokenType.COMMENT;
                  } else if ('-' == ch) {
                    state = State.COMMENT_DASH_DASH;
                  } else {
                    // See the note on COMMENT_DASH above.
                    state = State.COMMENT_DASH;
                  }
                  break;
                case DIRECTIVE:
                  if ('>' == ch) {
                    type = HtmlTokenType.DIRECTIVE;
                    state = State.DONE;
                  }
                  break;
                case IE_DH_COMMENT_BEGIN:
                  // Track nesting of "<!--[if " / "[endif]-->" markers and
                  // move on once they balance.
                  if (!asXml) {
                    if (lookahead(buffer, end, limit, "<!--[if ")) {
                      ++ieDhCommentsBeginMarkersSeen;
                    } else if (lookahead(buffer, end, limit, "[endif]-->")) {
                      --ieDhCommentsBeginMarkersSeen;
                    }
                    if (ieDhCommentsBeginMarkersSeen == 0){
                      state = State.IE_DH_COMMENT_END;
                    }
                  }
                  break;
                case IE_DH_COMMENT_END:
                  if ('>' == ch) {
                    type = HtmlTokenType.COMMENT;
                    state = State.DONE;
                  }
                  break;
                case IE_DR_COMMENT_BEGIN:
                  if ('>' == ch) {
                    type = HtmlTokenType.IE_DR_COMMENT_BEGIN;
                    state = State.DONE;
                  }
                  break;
                case IE_DR_COMMENT_END:
                  if ('>' == ch) {
                    type = HtmlTokenType.IE_DR_COMMENT_END;
                    state = State.DONE;
                  }
                  break;
                case APP_DIRECTIVE:
                  if ('?' == ch) { state = State.APP_DIRECTIVE_QMARK; }
                  break;
                case APP_DIRECTIVE_QMARK:
                  if ('>' == ch) {
                    type = HtmlTokenType.DIRECTIVE;
                    state = State.DONE;
                  } else if ('?' != ch) {
                    state = State.APP_DIRECTIVE;
                  }
                  break;
                case SERVER_CODE:
                  if ('%' == ch) {
                    state = State.SERVER_CODE_PCT;
                  }
                  break;
                case SERVER_CODE_PCT:
                  if ('>' == ch) {
                    type = HtmlTokenType.SERVERCODE;
                    state = State.DONE;
                  } else if ('%' != ch) {
                    state = State.SERVER_CODE;
                  }
                  break;
                case UNESCAPED_LT_BANG:
                  if ('-' == ch) {
                    state = State.UNESCAPED_LT_BANG_DASH;
                  } else {
                    type = HtmlTokenType.TEXT;
                    state = State.DONE;
                  }
                  break;
                case UNESCAPED_LT_BANG_DASH:
                  if ('-' == ch) {
                    // According to HTML 5 section 8.1.2.6


                    // An escaping text span start may share its
                    // U+002D HYPHEN-MINUS characters with its
                    // corresponding escaping text span end.
                    state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
                  } else {
                    type = HtmlTokenType.TEXT;
                    state = State.DONE;
                  }
                  break;
                case ESCAPING_TEXT_SPAN:
                  if ('-' == ch) {
                    state = State.ESCAPING_TEXT_SPAN_DASH;
                  }
                  break;
                case ESCAPING_TEXT_SPAN_DASH:
                  if ('-' == ch) {
                    state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
                  } else {
                    state = State.ESCAPING_TEXT_SPAN;
                  }
                  break;
                case ESCAPING_TEXT_SPAN_DASH_DASH:
                  if ('>' == ch) {
                    type = HtmlTokenType.TEXT;
                    state = State.DONE;
                  } else if ('-' != ch) {
                    state = State.ESCAPING_TEXT_SPAN;
                  }
                  break;
                case DONE:
                  throw new SomethingWidgyHappenedError(
                      "Unexpectedly DONE while lexing HTML token stream");
              }
              // Include the character just examined in the token.
              ++end;
              if (State.DONE == state) { break; }
            }
            if (end == limit) {
              // The input ended at this token.  If the machine did not reach
              // DONE, classify the token by the construct it was inside.
              switch (state) {
                case DONE:
                  break;
                case CDATA:
                case CDATA_SQ_1:
                case CDATA_SQ_2:
                  type = HtmlTokenType.CDATA;
                  break;
                case COMMENT:
                case COMMENT_DASH:
                case COMMENT_DASH_DASH:
                  type = HtmlTokenType.COMMENT;
                  break;
                case IE_DR_COMMENT_BEGIN:
                  type = HtmlTokenType.IE_DR_COMMENT_BEGIN;
                  break;
                case IE_DR_COMMENT_END:
                  type = HtmlTokenType.IE_DR_COMMENT_END;
                  break;
                case DIRECTIVE:
                case APP_DIRECTIVE:
                case APP_DIRECTIVE_QMARK:
                  type = HtmlTokenType.DIRECTIVE;
                  break;
                case SERVER_CODE:
                case SERVER_CODE_PCT:
                  type = HtmlTokenType.SERVERCODE;
                  break;
                case TAGNAME:
                  type = HtmlTokenType.TAGBEGIN;
                  break;
                default:
                  type = HtmlTokenType.TEXT;
                  break;
              }
            }
          }
        }
      } else {
        type = null;
      }
    }
    if (null == type) {
      // Plain text: extend the token up to, but not including, the next '<'.
      while (end < limit && '<' != buffer[end]) { ++end; }
      type = HtmlTokenType.TEXT;
    }


    p.consumeTo(end);
    String text = p.toString(start, end);
    if (type != HtmlTokenType.IGNORABLE) { lastNonIgnorable = text; }
    return Token.instance(text, type, p.filePositionForOffsets(start, end));
  }


  protected String name(String tagName) {
    return asXml || tagName.indexOf(':') >= 0
        ? tagName : Strings.lower(tagName);
  }


  private static boolean lookahead(
      char [] buffer, int end, int limit, String textToMatch) {
    String currentStr = "";
    int i = end;
    int j = i + textToMatch.length();
    if (j <= limit) {
      for (; i < j; ++i) {
        currentStr += buffer[i];
      }
    }
    return currentStr.equals(textToMatch);
  }


  /**
   * Canonicalizes, via {@link #name(String)}, the text that the producer
   * returns for the given range of offsets.
   */
  private String name(int start, int end) {
    return name(p.toString(start, end));
  }


  private boolean isIdentStart(char ch) {
    if (ch >= 'A' && ch <= 'z' && (ch <= 'Z' || ch >= 'a')) { return true; }
    // From the XML Spec:
    //    NameStartChar      ::=      ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6]
    //        | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
    //        | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF]
    //        | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD]
    //        | [#x10000-#xEFFFF]
    if (!asXml) { return false; }
    if (ch < 0x80) {
      return ch == ':' || ch == '_';
    } else {
      return (ch >= 0xc0 && ch <= 0xd6)
          || (ch >= 0xf7 && ch <= 0x2ff)
          || (ch >= 0x370 && ch <= 0x37d)
          || (ch >= 0x37f && ch <= 0x1fff)
          || (ch >= 0x200c && ch <= 0x200d)
          || (ch >= 0x2070 && ch <= 0x218f)
          || (ch >= 0x2c00 && ch <= 0x2fef)
          || (ch >= 0x3001 && ch <= 0xd7ff)
          || (ch >= 0xf900 && ch <= 0xfdcf)
          || (ch >= 0xfdf0 && ch <= 0xfefc)
          || (ch >= 0xff66 && ch <= 0xff9d)
          || (ch >= 0xffa0 && ch <= 0xffdc);
    }
  }


  /**
   * Returns a token with the same text and position as {@code token} but
   * with the given type.
   */
  static <T extends TokenType>
  Token<T> reclassify(Token<T> token, T type) {
    return Token.instance(token.text, type, token.pos);
  }
}
// Source Code of com.google.caja.lexer.HtmlInputSplitter

// Related Classes of com.google.caja.lexer.HtmlInputSplitter