Package com.google.streamhtmlparser.impl

Source Code of com.google.streamhtmlparser.impl.HtmlParserImpl

/*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.google.streamhtmlparser.impl;

import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.streamhtmlparser.ExternalState;
import com.google.streamhtmlparser.HtmlParser;
import com.google.streamhtmlparser.ParseException;
import com.google.streamhtmlparser.util.CharacterRecorder;
import com.google.streamhtmlparser.util.EntityResolver;
import com.google.streamhtmlparser.util.HtmlUtils;

import java.util.Locale;
import java.util.Map;

/**
* A custom specialized parser - ported from the main C++ version - used to
* implement context-aware escaping of run-time data in web-application
* templates.
*
* <p>This is the main class in the package. It implements the
* {@code HtmlParser} interface.
*
* <p>This class is not thread-safe; in particular, you cannot invoke any
* state-changing operations (such as {@code parse}) from multiple threads
* on the same object.
*
* <p>If you are looking at this class, chances are very high you are
* implementing Auto-Escaping for a new template system. Please see the
* landing page including a design document at
* <a href="http://go/autoescape">Auto-Escape Landing Page</a>.
*/
public class HtmlParserImpl extends GenericParser implements HtmlParser {

  /*
   * Internal representation of the parser state, which is at a
   * finer-granularity than the external state as given to callers.
   * The relationship between <code>InternalState</code> and
   * <code>ExternalState</code> is a many-to-one relationship.
   */
  private static final InternalState TEXT;
  private static final InternalState TAG_START;
  private static final InternalState TAG_NAME;
  private static final InternalState DECL_START;
  private static final InternalState DECL_BODY;
  private static final InternalState COM_OPEN;
  private static final InternalState COM_BODY;
  private static final InternalState COM_DASH;
  private static final InternalState COM_DASH_DASH;
  private static final InternalState PI;
  private static final InternalState PI_MAY_END;
  private static final InternalState TAG_SPACE;
  private static final InternalState TAG_CLOSE;
  private static final InternalState ATTR;
  private static final InternalState ATTR_SPACE;
  private static final InternalState VALUE;
  private static final InternalState VALUE_TEXT;
  private static final InternalState VALUE_Q_START;
  private static final InternalState VALUE_Q;
  private static final InternalState VALUE_DQ_START;
  private static final InternalState VALUE_DQ;
  private static final InternalState CDATA_COM_START;
  private static final InternalState CDATA_COM_START_DASH;
  private static final InternalState CDATA_COM_BODY;
  private static final InternalState CDATA_COM_DASH;
  private static final InternalState CDATA_COM_DASH_DASH;
  private static final InternalState CDATA_TEXT;
  private static final InternalState CDATA_LT;
  private static final InternalState CDATA_MAY_CLOSE;
  private static final InternalState JS_FILE;
  private static final InternalState CSS_FILE;

  static {
    TEXT = InternalState.getInstanceHtml("TEXT");
    TAG_START = InternalState.getInstanceHtml("TAG_START");
    TAG_NAME = InternalState.getInstanceHtml("TAG_NAME");
    DECL_START = InternalState.getInstanceHtml("DECL_START");
    DECL_BODY = InternalState.getInstanceHtml("DECL_BODY");
    COM_OPEN = InternalState.getInstanceHtml("COM_OPEN");
    COM_BODY = InternalState.getInstanceHtml("COM_BODY");
    COM_DASH = InternalState.getInstanceHtml("COM_DASH");
    COM_DASH_DASH = InternalState.getInstanceHtml("COM_DASH_DASH");
    PI =InternalState.getInstanceHtml("PI");
    PI_MAY_END = InternalState.getInstanceHtml("PI_MAY_END");
    TAG_SPACE = InternalState.getInstanceHtml("TAG_SPACE");
    TAG_CLOSE = InternalState.getInstanceHtml("TAG_CLOSE");
    ATTR = InternalState.getInstanceHtml("ATTR");
    ATTR_SPACE = InternalState.getInstanceHtml("ATTR_SPACE");
    VALUE = InternalState.getInstanceHtml("VALUE");
    VALUE_TEXT = InternalState.getInstanceHtml("VALUE_TEXT");
    VALUE_Q_START = InternalState.getInstanceHtml("VALUE_Q_START");
    VALUE_Q = InternalState.getInstanceHtml("VALUE_Q");
    VALUE_DQ_START = InternalState.getInstanceHtml("VALUE_DQ_START");
    VALUE_DQ = InternalState.getInstanceHtml("VALUE_DQ");
    CDATA_COM_START = InternalState.getInstanceHtml("CDATA_COM_START");
    CDATA_COM_START_DASH =
        InternalState.getInstanceHtml("CDATA_COM_START_DASH");
    CDATA_COM_BODY = InternalState.getInstanceHtml("CDATA_COM_BODY");
    CDATA_COM_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH");
    CDATA_COM_DASH_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH_DASH");
    CDATA_TEXT = InternalState.getInstanceHtml("CDATA_TEXT");
    CDATA_LT = InternalState.getInstanceHtml("CDATA_LT");
    CDATA_MAY_CLOSE = InternalState.getInstanceHtml("CDATA_MAY_CLOSE");
    JS_FILE = InternalState.getInstanceHtml("JS_FILE");
    CSS_FILE = InternalState.getInstanceHtml("CSS_FILE");
  }

  // Lookup from each InternalState to the ExternalState exposed to callers.
  // Populated once during class initialization and passed to the superclass
  // by the no-argument constructor.
  private static final Map<InternalState, ExternalState> STATE_MAPPING =
      Maps.newHashMap();
  static {
    // Must run after STATE_MAPPING and all InternalState constants exist.
    initializeStateMapping();
  }

  // Transition table of the HTML state machine. Populated once during class
  // initialization and passed to the superclass by the no-argument
  // constructor.
  private static final ParserStateTable STATE_TABLE = new ParserStateTable();
  static {
    // Must run after STATE_TABLE and all InternalState constants exist.
    initializeParserStateTable();
  }

  // Records the name of the HTML tag being parsed (see enterTagName()).
  private final CharacterRecorder tag;
  // Records the name of the HTML attribute being parsed (see enterAttribute()).
  private final CharacterRecorder attr;
  // Records the contents of the attribute value (see enterValueContent()).
  private final CharacterRecorder value;
  // Records a potential closing tag met inside a CDATA section
  // (see enterStateCdataMayClose()).
  private final CharacterRecorder cdataCloseTag;
  // Resolves HTML entities inside javascript attribute values before the
  // characters are handed to jsParser (see inStateValue()).
  private final EntityResolver entityResolver;
  // Nested parser fed with javascript found in attributes, CDATA blocks
  // and javascript files.
  private final JavascriptParserImpl jsParser;
  // Set when the content currently being parsed is javascript.
  private boolean insideJavascript;
  // Number of characters processed so far in the current attribute value.
  private int valueIndex;
  // True iff InsertText() was called at the start of a URL attribute value.
  private boolean textInsideUrlValue;

  /**
   * Creates an {@code HtmlParserImpl} object.
   *
   * <p>Both for performance reasons and to leverage code a state-flow machine
   * that is automatically generated from Python for multiple target
   * languages, this object uses a static {@code ParserStateTable} that
   * is read-only and obtained from the generated code in {@code HtmlParserFsm}.
   * That code also maintains the mapping from internal states
   * ({@code InternalState}) to external states ({@code ExternalState}).
   */
  public HtmlParserImpl() {
    super(STATE_TABLE, STATE_MAPPING, TEXT);
    tag = new CharacterRecorder();
    attr = new CharacterRecorder();
    value = new CharacterRecorder();
    cdataCloseTag = new CharacterRecorder();
    entityResolver = new EntityResolver();
    jsParser = new JavascriptParserImpl();
    insideJavascript = false;
    valueIndex = 0;
    textInsideUrlValue = false;
  }

  /**
   * Creates an {@code HtmlParserImpl} that is a copy of the one provided.
   *
   * @param aHtmlParserImpl the {@code HtmlParserImpl} object to copy
   */
  public HtmlParserImpl(HtmlParserImpl aHtmlParserImpl) {
    super(aHtmlParserImpl);
    tag = new CharacterRecorder(aHtmlParserImpl.tag);
    attr = new CharacterRecorder(aHtmlParserImpl.attr);
    value = new CharacterRecorder(aHtmlParserImpl.value);
    cdataCloseTag = new CharacterRecorder(aHtmlParserImpl.cdataCloseTag);
    entityResolver = new EntityResolver(aHtmlParserImpl.entityResolver);
    jsParser = new JavascriptParserImpl(aHtmlParserImpl.jsParser);
    insideJavascript = aHtmlParserImpl.insideJavascript;
    valueIndex = aHtmlParserImpl.valueIndex;
    textInsideUrlValue = aHtmlParserImpl.textInsideUrlValue;
  }

  @Override
  public boolean inJavascript() {
    return (insideJavascript
            && ( (getState() == STATE_VALUE)
                 || (currentState == CDATA_TEXT)
                 || (currentState == CDATA_COM_START)
                 || (currentState == CDATA_COM_START_DASH)
                 || (currentState == CDATA_COM_BODY)
                 || (currentState == CDATA_COM_DASH)
                 || (currentState == CDATA_COM_DASH_DASH)
                 || (currentState == CDATA_LT)
                 || (currentState == CDATA_MAY_CLOSE)
                 || (currentState == JS_FILE) ));
  }

  @Override
  public boolean isJavascriptQuoted() {
    if (inJavascript()) {
      ExternalState jsParserState = jsParser.getState();
      return (jsParserState == JavascriptParserImpl.STATE_Q
              || jsParserState == JavascriptParserImpl.STATE_DQ);
    }
    return false;
  }

  @Override
  public boolean inAttribute() {
    ExternalState extState = getState();
    return (extState != null && (extState == STATE_ATTR
                                 || extState == STATE_VALUE));
  }

  /**
   * Returns {@code true} if and only if the parser is currently within
   * a CSS context. A CSS context is one of the below:
   * <ul>
   * <li>Inside a STYLE tag.
   * <li>Inside a STYLE attribute.
   * <li>Inside a CSS file when the parser was reset in the CSS mode.
   * </ul>
   *
   * @return {@code true} if and only if the parser is inside CSS
   */
  @Override
  public boolean inCss() {
    return (currentState == CSS_FILE
            || (getState() == STATE_VALUE
                && (getAttributeType() == ATTR_TYPE.STYLE))
            || ("style".equals(getTag())));
  }

  @Override
  public ATTR_TYPE getAttributeType() {
    String attribute = getAttribute();
    if (!inAttribute()) {
      return ATTR_TYPE.NONE;
    }
    if (HtmlUtils.isAttributeJavascript(attribute)) {
      return ATTR_TYPE.JS;
    }
    if (HtmlUtils.isAttributeUri(attribute)) {
      return ATTR_TYPE.URI;
    }
    if (HtmlUtils.isAttributeStyle(attribute)) {
      return ATTR_TYPE.STYLE;
    }

    // Special logic to handle the "content" attribute of the "meta" tag.
    if ("meta".equals(getTag()) && "content".equals(getAttribute())) {
      HtmlUtils.META_REDIRECT_TYPE redirectType =
          HtmlUtils.parseContentAttributeForUrl(getValue());
      if (redirectType == HtmlUtils.META_REDIRECT_TYPE.URL_START ||
          redirectType == HtmlUtils.META_REDIRECT_TYPE.URL)
        return ATTR_TYPE.URI;
    }
   
    return ATTR_TYPE.REGULAR;
  }

  @Override
  public ExternalState getJavascriptState() {
    return jsParser.getState();
  }

  @Override
  public boolean isAttributeQuoted() {
    return (currentState == VALUE_Q_START
            || currentState == VALUE_Q
            || currentState == VALUE_DQ_START
            || currentState == VALUE_DQ);
  }

  @Override
  public String getTag() {
    return tag.getContent().toLowerCase();
  }

  @Override
  public String getAttribute() {
    return inAttribute() ? attr.getContent().toLowerCase() : "";
  }

  @Override
  public String getValue() {
    return (getState() == STATE_VALUE) ? value.getContent() : "";
  }

  @Override
  public int getValueIndex() {
    if (getState() != STATE_VALUE) {
      return 0;
    }
    return valueIndex;
  }

  @Override
  public boolean isUrlStart() {
    // False when not inside an HTML attribute value
    if (getState() != STATE_VALUE) {
      return false;
    }

    //  Or when the HTML attribute is not of URI type.
    if (getAttributeType() != ATTR_TYPE.URI) {
      return false;
    }

    // Or when we received an InsertText() directive at the start of a URL.
    if (textInsideUrlValue) {
      return false;
    }

    if ("meta".equals(getTag())) {
      // At this point, we know we are in the "content" attribute
      // or we would not have the URI attribute type.
      return (HtmlUtils.parseContentAttributeForUrl(getValue()) ==
              HtmlUtils.META_REDIRECT_TYPE.URL_START);
    }

    // For all other URI attributes, check if we are at index 0.
    return (getValueIndex() == 0);
}

  /**
   * {@inheritDoc}
   *
   * Resets the state of the parser to a state consistent with the
   * {@code Mode} provided. This will reset finer-grained state
   * information back to a default value, hence use only when
   * you want to parse text from a very clean slate.
   *
   * <p>See the {@link HtmlParser.Mode} enum for information on all
   * the valid modes.
   *
   * @param mode is an enum representing the high-level state of the parser
   */
  @Override
  public void resetMode(Mode mode) {
    insideJavascript = false;
    tag.reset();
    attr.reset();
    value.reset();
    cdataCloseTag.reset();
    valueIndex = 0;
    textInsideUrlValue = false;
    jsParser.reset();

    switch (mode) {
      case HTML:
        currentState = TEXT;
        break;
      case JS:
        currentState = JS_FILE;
        insideJavascript = true;
        break;
      case CSS:
        currentState = CSS_FILE;
        break;
      case HTML_IN_TAG:
        currentState = TAG_SPACE;
        break;
      default:
        throw new IllegalArgumentException("Did not recognize Mode: " +
                                           mode.toString());
    }
  }

  /**
   * Resets the state of the parser to the initial state of parsing HTML.
   */
  public void reset() {
    super.reset();
    resetMode(Mode.HTML);
  }

  /**
   * A specialized directive to tell the parser there is some content
   * that will be inserted here but that it will not get to parse. Used
   * by the template system that may not be able to give some content
   * to the parser but wants it to know there typically will be content
   * inserted at that point.  This is a hint used in corner cases within
   * parsing of HTML attribute names and values where content we do not
   * get to see could affect our parsing and alter our current state.
   *
   * <p>The two cases where {@code #insertText()} affects our parsing are:
   * <ul>
   * <li>We are at the start of the value of a URL-accepting HTML attribute. In
   * that case, we change internal state to no longer be considered at the
   * start of the URL. This may affect what escaping template systems may want
   * to perform on the HTML attribute value. We avoid injecting fake data and
   * hence not modify the current index of the value as determined by
   * {@link #getValueIndex()}</li>
   * <li>We just transitioned from an attribute name to an attribute value
   * (by parsing the separating {@code '='} character). In that case, we
   * change internal state to be now inside a non-quoted HTML attribute
   * value.</li>
   * </ul>
   *
   * @throws ParseException if an unrecoverable error occurred during parsing
   */
  @Override
  public void insertText() throws ParseException {
    // Case: Inside URL attribute value.
    if (getState() == STATE_VALUE
        && getAttributeType() == ATTR_TYPE.URI
        && isUrlStart()) {
      textInsideUrlValue = true;
    }
    // Case: Before parsing any attribute value.
    if (currentState == VALUE) {
      setNextState(VALUE_TEXT);
    }
  }

  @Override
  protected InternalState handleEnterState(InternalState currentState,
                                           InternalState expectedNextState,
                                           char input) {
    InternalState nextState = expectedNextState;
    if (currentState == TAG_NAME) {
      enterTagName();
    } else if (currentState == ATTR) {
      enterAttribute();
    } else if (currentState == TAG_CLOSE) {
      nextState = tagClose(currentState);
    } else if (currentState == CDATA_MAY_CLOSE) {
      enterStateCdataMayClose();
    } else if (currentState == VALUE) {
      enterValue();
    } else
    if (currentState == VALUE_TEXT || currentState == VALUE_Q
        || currentState == VALUE_DQ) {
      enterValueContent();
    }
    return nextState;
  }

  @Override
  protected InternalState handleExitState(InternalState currentState,
                                          InternalState expectedNextState,
                                          char input) {
    InternalState nextState = expectedNextState;
    if (currentState == TAG_NAME) {
      exitTagName();
    } else if (currentState == ATTR) {
      exitAttribute();
    } else if (currentState == CDATA_MAY_CLOSE) {
      nextState = exitStateCdataMayClose(nextState, input);
    } else
    if ((currentState == VALUE_TEXT) || (currentState == VALUE_Q)
        || (currentState == VALUE_DQ)) {
      exitValueContent();
    }
    return nextState;
  }

  @Override
  protected InternalState handleInState(InternalState currentState,
                                        char input) throws ParseException {
    if ((currentState == CDATA_TEXT)
        || (currentState == CDATA_COM_START)
        || (currentState == CDATA_COM_START_DASH)
        || (currentState == CDATA_COM_BODY)
        || (currentState == CDATA_COM_DASH)
        || (currentState == CDATA_COM_DASH_DASH)
        || (currentState == CDATA_LT)
        || (currentState == CDATA_MAY_CLOSE)
        || (currentState == JS_FILE)) {
      inStateCdata(input);
    } else if ((currentState == VALUE_TEXT)
               || (currentState == VALUE_Q)
               || (currentState == VALUE_DQ)) {
      inStateValue(input);
    }
    return currentState;
  }

  /**
   * Invokes recording on all CharacterRecorder objects. Currently we do
   * not check that one and only one of them is recording. I did a fair
   * bit of testing on the C++ parser and was not convinced there is
   * such a guarantee.
   */
  @Override
  protected void record(char input) {
    attr.maybeRecord(input);
    tag.maybeRecord(input);
    value.maybeRecord(input);
    cdataCloseTag.maybeRecord(input);
  }

  /**
   * Starts recording the name of the HTML tag. Called when the parser
   * enters a new tag.
   */
  private void enterTagName() {
    tag.startRecording();
  }

  private void exitTagName() {
    tag.stopRecording();
    String tagString = tag.getContent();
    if (!tagString.isEmpty() && tagString.charAt(0) == '/') {
      tag.reset();
    }
  }

  /**
   * Starts recording the name of the HTML attribute. Called when the parser
   * enters a new HTML attribute.
   */
  private void enterAttribute() {
    attr.startRecording();
  }

  private void exitAttribute() {
    attr.stopRecording();
  }

  /**
   * Tracks the index within the HTML attribute value and initializes
   * the javascript parser for attributes that take javascript.
   *
   * Called when the parser enters a new HTML attribute value.
   */
  private void enterValue() {
    valueIndex = 0;
    textInsideUrlValue = false;
    if (HtmlUtils.isAttributeJavascript(getAttribute())) {
      entityResolver.reset();
      jsParser.reset();
      insideJavascript = true;
    } else {
      insideJavascript = false;
    }
  }

  /**
   * Starts recordning the contents of the attribute value.
   *
   * Called when entering an attribute value.
   */
  private void enterValueContent() {
    value.startRecording();
  }

  /**
   * Stops the recording of the attribute value and exits javascript
   * (in case we were inside it).
   */
  private void exitValueContent() {
    value.stopRecording();
    insideJavascript = false;
  }

  /**
   * Processes javascript after performing entity resolution and updates
   * the position within the attribute value.
   * If the status of the entity resolution is <code>IN_PROGRESS</code>,
   * we don't invoke the javascript parser.
   *
   * <p>Called for every character inside an attribute value.
   *
   * @param input character read
   * @throws ParseException if an unrecoverable error occurred during parsing
   */
  private void inStateValue(char input) throws ParseException {
    valueIndex++;
    if (insideJavascript) {
      EntityResolver.Status status = entityResolver.processChar(input);
      if (status == EntityResolver.Status.COMPLETED) {
        jsParser.parse(entityResolver.getEntity());
        entityResolver.reset();
      } else if (status == EntityResolver.Status.NOT_STARTED) {
        jsParser.parse(input);
      }
    }
  }

  /**
   * Handles the tag it finished reading.
   *
   * <p>For a script tag, it initializes the javascript parser. For all
   * tags that are recognized to have CDATA values
   * (including the script tag), it switches the CDATA state to handle them
   * properly. For code simplification, CDATA and RCDATA sections are
   * treated the same.
   *
   * <p>Called when the parser leaves a tag definition.
   *
   * @param state current state
   * @return state next state, could be the same as current state
   */
  private InternalState tagClose(InternalState state) {
    InternalState nextState = state;
    String tagName = getTag();
    if ("script".equals(tagName)) {
      nextState = CDATA_TEXT;
      jsParser.reset();
      insideJavascript = true;
    } else if ("style".equals(tagName)
                 || "title".equals(tagName)
                 || "textarea".equals(tagName)) {
      nextState = CDATA_TEXT;
      insideJavascript = false;
    }
    return nextState;
  }

  /**
   * Feeds the character to the javascript parser for processing.
   *
   * <p>Called inside CDATA blocks to parse javascript.
   *
   * @param input character read
   * @throws ParseException if an unrecoverable error occurred during parsing
   */
  private void inStateCdata(char input) throws ParseException {
    if (insideJavascript) {
      jsParser.parse(input);
    }
  }

  /**
   * Starts recording. This is so we find the closing tag name in order to
   * know if the tag is going to be closed or not.
   *
   * <p>Called when encountering a '<' character in a CDATA section.
   */
  private void enterStateCdataMayClose() {
    cdataCloseTag.startRecording();
  }

  /**
   * Determines whether to close the tag element, It closes it if it finds
   * the corresponding end tag. Called when reading what could be a
   * closing CDATA tag.
   *
   * @param input the character read
   * @param expectedNextState the expected state to go to next
   *        unless we want to change it here
   * @return the next state to go to
   */
  private InternalState exitStateCdataMayClose(
      InternalState expectedNextState,
      char input) {
    InternalState nextState = expectedNextState;
    cdataCloseTag.stopRecording();
    String cdataCloseTagString = cdataCloseTag.getContent();
    Preconditions.checkState(!cdataCloseTagString.isEmpty()
        && cdataCloseTagString.charAt(0) == '/')// Developer error.

    if (cdataCloseTagString.substring(1).equalsIgnoreCase(getTag())
        && (input == '>' || HtmlUtils.isHtmlSpace(input))) {
      tag.clear();
      insideJavascript = false;
    } else {
      nextState = CDATA_TEXT;
    }
    return nextState;
  }


  // ======================================================= //
  // SECTION BELOW WILL ALL BE AUTO-GENERATED IN FUTURE.     //
  // ======================================================= //

  private static void registerMapping(InternalState internalState,
                                      ExternalState externalState) {
    STATE_MAPPING.put(internalState, externalState);
  }

  private static void initializeStateMapping() {
    // Each parser implementation must map the error state appropriately.
    registerMapping(InternalState.INTERNAL_ERROR_STATE, HtmlParser.STATE_ERROR);

    registerMapping(TEXT, HtmlParser.STATE_TEXT);
    registerMapping(TAG_START, HtmlParser.STATE_TAG);
    registerMapping(TAG_NAME, HtmlParser.STATE_TAG);
    registerMapping(DECL_START, HtmlParser.STATE_TEXT);
    registerMapping(DECL_BODY, HtmlParser.STATE_TEXT);
    registerMapping(COM_OPEN, HtmlParser.STATE_TEXT);
    registerMapping(COM_BODY, HtmlParser.STATE_COMMENT);
    registerMapping(COM_DASH, HtmlParser.STATE_COMMENT);
    registerMapping(COM_DASH_DASH, HtmlParser.STATE_COMMENT);
    registerMapping(PI, HtmlParser.STATE_TEXT);
    registerMapping(PI_MAY_END, HtmlParser.STATE_TEXT);
    registerMapping(TAG_SPACE, HtmlParser.STATE_TAG);
    registerMapping(TAG_CLOSE, HtmlParser.STATE_TEXT);
    registerMapping(ATTR, HtmlParser.STATE_ATTR);
    registerMapping(ATTR_SPACE, HtmlParser.STATE_ATTR);
    registerMapping(VALUE, HtmlParser.STATE_VALUE);
    registerMapping(VALUE_TEXT, HtmlParser.STATE_VALUE);
    registerMapping(VALUE_Q_START, HtmlParser.STATE_VALUE);
    registerMapping(VALUE_Q, HtmlParser.STATE_VALUE);
    registerMapping(VALUE_DQ_START, HtmlParser.STATE_VALUE);
    registerMapping(VALUE_DQ, HtmlParser.STATE_VALUE);
    registerMapping(CDATA_COM_START, HtmlParser.STATE_TEXT);
    registerMapping(CDATA_COM_START_DASH, HtmlParser.STATE_TEXT);
    registerMapping(CDATA_COM_BODY, HtmlParser.STATE_TEXT);
    registerMapping(CDATA_COM_DASH, HtmlParser.STATE_TEXT);
    registerMapping(CDATA_COM_DASH_DASH, HtmlParser.STATE_TEXT);
    registerMapping(CDATA_TEXT, HtmlParser.STATE_TEXT);
    registerMapping(CDATA_LT, HtmlParser.STATE_TEXT);
    registerMapping(CDATA_MAY_CLOSE, HtmlParser.STATE_TEXT);
    registerMapping(JS_FILE, HtmlParser.STATE_JS_FILE);
    registerMapping(CSS_FILE, HtmlParser.STATE_CSS_FILE);
  }

  private static void registerTransition(String expression,
                                         InternalState source,
                                         InternalState to) {
    // It seems to silly to go through a StateTableTransition here
    // but it adds extra data checking.
    StateTableTransition stt = new StateTableTransition(expression,
                                                        source, to);
    STATE_TABLE.setExpression(stt.getExpression(), stt.getFrom(),
                              stt.getTo());
  }

  // Populates STATE_TABLE with every transition of the HTML state machine.
  // Each call registers, for a source state, the expression of characters
  // that moves the parser to the target state.
  //
  // NOTE: The "[:default:]" transition should be registered before any
  //   other transitions for a given state or it will over-write them.
  //   Do not reorder the calls below.
  private static void initializeParserStateTable() {
    // Stand-alone CSS and javascript files never leave their state.
    registerTransition("[:default:]", CSS_FILE, CSS_FILE);
    registerTransition("[:default:]", JS_FILE, JS_FILE);

    // CDATA sections, including comments nested inside them.
    registerTransition("[:default:]", CDATA_MAY_CLOSE, CDATA_TEXT);
    registerTransition(" \t\n\r", CDATA_MAY_CLOSE, TAG_SPACE);
    registerTransition(">", CDATA_MAY_CLOSE, TEXT);
    registerTransition("A-Za-z0-9/_:-", CDATA_MAY_CLOSE, CDATA_MAY_CLOSE);
    registerTransition("[:default:]", CDATA_LT, CDATA_TEXT);
    registerTransition("!", CDATA_LT, CDATA_COM_START);
    registerTransition("/", CDATA_LT, CDATA_MAY_CLOSE);
    registerTransition("[:default:]", CDATA_TEXT, CDATA_TEXT);
    registerTransition("<", CDATA_TEXT, CDATA_LT);
    registerTransition("[:default:]", CDATA_COM_DASH_DASH, CDATA_COM_BODY);
    registerTransition(">", CDATA_COM_DASH_DASH, CDATA_TEXT);
    registerTransition("-", CDATA_COM_DASH_DASH, CDATA_COM_DASH_DASH);
    registerTransition("[:default:]", CDATA_COM_DASH, CDATA_COM_BODY);
    registerTransition("-", CDATA_COM_DASH, CDATA_COM_DASH_DASH);
    registerTransition("[:default:]", CDATA_COM_BODY, CDATA_COM_BODY);
    registerTransition("-", CDATA_COM_BODY, CDATA_COM_DASH);
    registerTransition("[:default:]", CDATA_COM_START_DASH, CDATA_TEXT);
    registerTransition("-", CDATA_COM_START_DASH, CDATA_COM_BODY);
    registerTransition("[:default:]", CDATA_COM_START, CDATA_TEXT);
    registerTransition("-", CDATA_COM_START, CDATA_COM_START_DASH);

    // Attribute values: double-quoted, single-quoted and unquoted.
    registerTransition("[:default:]", VALUE_DQ, VALUE_DQ);
    registerTransition("\"", VALUE_DQ, TAG_SPACE);
    registerTransition("[:default:]", VALUE_DQ_START, VALUE_DQ);
    registerTransition("\"", VALUE_DQ_START, TAG_SPACE);
    registerTransition("[:default:]", VALUE_Q, VALUE_Q);
    registerTransition("\'", VALUE_Q, TAG_SPACE);
    registerTransition("[:default:]", VALUE_Q_START, VALUE_Q);
    registerTransition("\'", VALUE_Q_START, TAG_SPACE);
    registerTransition("[:default:]", VALUE_TEXT, VALUE_TEXT);
    registerTransition(" \t\n\r", VALUE_TEXT, TAG_SPACE);
    registerTransition(">", VALUE_TEXT, TAG_CLOSE);
    registerTransition("[:default:]", VALUE, VALUE_TEXT);
    registerTransition(">", VALUE, TAG_CLOSE);
    registerTransition(" \t\n\r", VALUE, VALUE);
    registerTransition("\"", VALUE, VALUE_DQ_START);
    registerTransition("\'", VALUE, VALUE_Q_START);

    // Attribute names and the whitespace following them.
    registerTransition("=", ATTR_SPACE, VALUE);
    registerTransition("/", ATTR_SPACE, TAG_SPACE);
    registerTransition("A-Za-z0-9_:-", ATTR_SPACE, ATTR);
    registerTransition(" \t\n\r", ATTR_SPACE, ATTR_SPACE);
    registerTransition(">", ATTR_SPACE, TAG_CLOSE);
    registerTransition(" \t\n\r", ATTR, ATTR_SPACE);
    registerTransition("=", ATTR, VALUE);
    registerTransition("/", ATTR, TAG_SPACE);
    registerTransition(">", ATTR, TAG_CLOSE);
    registerTransition("A-Za-z0-9_:.-", ATTR, ATTR);

    // End of a tag and whitespace inside a tag.
    registerTransition("[:default:]", TAG_CLOSE, TEXT);
    registerTransition("<", TAG_CLOSE, TAG_START);
    registerTransition("/", TAG_SPACE, TAG_SPACE);
    registerTransition("A-Za-z0-9_:-", TAG_SPACE, ATTR);
    registerTransition(" \t\n\r", TAG_SPACE, TAG_SPACE);
    registerTransition(">", TAG_SPACE, TAG_CLOSE);

    // Processing instructions.
    registerTransition("[:default:]", PI_MAY_END, PI);
    registerTransition(">", PI_MAY_END, TEXT);
    registerTransition("[:default:]", PI, PI);
    registerTransition("?", PI, PI_MAY_END);

    // HTML comments.
    registerTransition("[:default:]", COM_DASH_DASH, COM_BODY);
    registerTransition(">", COM_DASH_DASH, TEXT);
    registerTransition("-", COM_DASH_DASH, COM_DASH_DASH);
    registerTransition("[:default:]", COM_DASH, COM_BODY);
    registerTransition("-", COM_DASH, COM_DASH_DASH);
    registerTransition("[:default:]", COM_BODY, COM_BODY);
    registerTransition("-", COM_BODY, COM_DASH);
    registerTransition("[:default:]", COM_OPEN, TEXT);
    registerTransition("-", COM_OPEN, COM_BODY);

    // Declarations.
    registerTransition("[:default:]", DECL_BODY, DECL_BODY);
    registerTransition(">", DECL_BODY, TEXT);
    registerTransition("[:default:]", DECL_START, DECL_BODY);
    registerTransition(">", DECL_START, TEXT);
    registerTransition("-", DECL_START, COM_OPEN);

    // Tag names.
    registerTransition(">", TAG_NAME, TAG_CLOSE);
    registerTransition(" \t\n\r", TAG_NAME, TAG_SPACE);
    registerTransition("A-Za-z0-9/_:-", TAG_NAME, TAG_NAME);

    // Manual change to remain in-sync with CL 10597850 in C HtmlParser.
    registerTransition("[:default:]", TAG_START, TEXT);
    registerTransition("<", TAG_START, TAG_START);
    // End of manual change.

    registerTransition("!", TAG_START, DECL_START);
    registerTransition("?", TAG_START, PI);
    registerTransition("A-Za-z0-9/_:-", TAG_START, TAG_NAME);

    // Plain text.
    registerTransition("[:default:]", TEXT, TEXT);
    registerTransition("<", TEXT, TAG_START);
  }
}
TOP

Related Classes of com.google.streamhtmlparser.impl.HtmlParserImpl

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.