Package com.googlecode.html

Source Code of com.googlecode.html.HTMLScanner$ContentScanner

/*
* Copyright 2002-2009 Andy Clark, Marc Guillemot
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/

package com.googlecode.html;

import java.io.EOFException;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.BitSet;
import java.util.Stack;

import org.apache.xerces.util.EncodingMap;
import org.apache.xerces.util.NamespaceSupport;
import org.apache.xerces.util.URI;
import org.apache.xerces.util.XMLAttributesImpl;
import org.apache.xerces.util.XMLResourceIdentifierImpl;
import org.apache.xerces.util.XMLStringBuffer;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.NamespaceContext;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLDocumentHandler;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLResourceIdentifier;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLComponentManager;
import org.apache.xerces.xni.parser.XMLConfigurationException;
import org.apache.xerces.xni.parser.XMLDocumentScanner;
import org.apache.xerces.xni.parser.XMLInputSource;

import com.googlecode.html.xercesbridge.XercesBridge;

/**
* A simple HTML scanner. This scanner makes no attempt to balance tags or fix other problems in the
* source document — it just scans what it can and generates XNI document "events", ignoring
* errors of all kinds.
* <p>
* This component recognizes the following features:
* <ul>
* <li>http://cyberneko.org/html/features/augmentations
* <li>http://cyberneko.org/html/features/report-errors
* <li>http://apache.org/xml/features/scanner/notify-char-refs
* <li>http://apache.org/xml/features/scanner/notify-builtin-refs
* <li>http://cyberneko.org/html/features/scanner/notify-builtin-refs
* <li>http://cyberneko.org/html/features/scanner/fix-mswindows-refs
* <li>http://cyberneko.org/html/features/scanner/script/strip-cdata-delims
* <li>http://cyberneko.org/html/features/scanner/script/strip-comment-delims
* <li>http://cyberneko.org/html/features/scanner/style/strip-cdata-delims
* <li>http://cyberneko.org/html/features/scanner/style/strip-comment-delims
* <li>http://cyberneko.org/html/features/scanner/ignore-specified-charset
* <li>http://cyberneko.org/html/features/scanner/cdata-sections
* <li>http://cyberneko.org/html/features/override-doctype
* <li>http://cyberneko.org/html/features/insert-doctype
* <li>http://cyberneko.org/html/features/parse-noscript-content
* <li>http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe
* </ul>
* <p>
* This component recognizes the following properties:
* <ul>
* <li>http://cyberneko.org/html/properties/names/elems
* <li>http://cyberneko.org/html/properties/names/attrs
* <li>http://cyberneko.org/html/properties/default-encoding
* <li>http://cyberneko.org/html/properties/error-reporter
* <li>http://cyberneko.org/html/properties/doctype/pubid
* <li>http://cyberneko.org/html/properties/doctype/sysid
* </ul>
*
* @see HTMLElements
* @see HTMLEntities
*
* @author Andy Clark
* @author Marc Guillemot
* @author Ahmed Ashour
*
* @version $Id: HTMLScanner.java,v 1.19 2005/06/14 05:52:37 andyc Exp $
*/
public class HTMLScanner implements XMLDocumentScanner, XMLLocator, HTMLComponent {

   //
   // Constants
   //

   // doctype info: HTML 4.01 strict

   /**
    * The primary HTML document scanner.
    *
    * @author Andy Clark
    */
   public class ContentScanner implements Scanner {

      //
      // Data
      //

      // temp vars

      /** Attributes. */
      private final XMLAttributesImpl fAttributes = new XMLAttributesImpl();

      /** A qualified name. */
      private final QName fQName = new QName();

      //
      // Scanner methods
      //

      /** Scan. */
      public boolean scan(boolean complete) throws IOException {
         boolean next;
         do {
            try {
               next = false;
               switch (fScannerState) {
                  case STATE_CONTENT: {
                     fBeginLineNumber = fCurrentEntity.getLineNumber();
                     fBeginColumnNumber = fCurrentEntity.getColumnNumber();
                     fBeginCharacterOffset = fCurrentEntity.getCharacterOffset();
                     int c = fCurrentEntity.read();
                     if (c == '<') {
                        setScannerState(STATE_MARKUP_BRACKET);
                        next = true;
                     } else if (c == '&') {
                        scanEntityRef(fStringBuffer, true);
                     } else if (c == -1) {
                        throw new EOFException();
                     } else {
                        fCurrentEntity.rewind();
                        scanCharacters();
                     }
                     break;
                  }
                  case STATE_MARKUP_BRACKET: {
                     int c = fCurrentEntity.read();
                     if (c == '!') {
                        if (skip("--", false)) {
                           scanComment();
                        } else if (skip("[CDATA[", false)) {
                           scanCDATA();
                        } else if (skip("DOCTYPE", false)) {
                           scanDoctype();
                        } else {
                           if (fReportErrors) {
                              fErrorReporter.reportError("HTML1002", null);
                           }
                           skipMarkup(true);
                        }
                     } else if (c == '?') {
                        scanPI();
                     } else if (c == '/') {
                        scanEndElement();
                     } else if (c == -1) {
                        if (fReportErrors) {
                           fErrorReporter.reportError("HTML1003", null);
                        }
                        if (fDocumentHandler != null && fElementCount >= fElementDepth) {
                           fStringBuffer.clear();
                           fStringBuffer.append('<');
                           fDocumentHandler.characters(fStringBuffer, null);
                        }
                        throw new EOFException();
                     } else {
                        fCurrentEntity.rewind();
                        fElementCount++;
                        fSingleBoolean[0] = false;
                        final String ename = scanStartElement(fSingleBoolean);
                        final String enameLC = ename == null ? null : ename.toLowerCase();
                        fBeginLineNumber = fCurrentEntity.getLineNumber();
                        fBeginColumnNumber = fCurrentEntity.getColumnNumber();
                        fBeginCharacterOffset = fCurrentEntity.getCharacterOffset();
                        if ("script".equals(enameLC)) {
                           scanScriptContent();
                        } else if (!fAllowSelfclosingIframe && "iframe".equals(enameLC)) {
                           scanUntilEndTag("iframe");
                        } else if (!fParseNoScriptContent && "noscript".equals(enameLC)) {
                           scanUntilEndTag("noscript");
                        } else if (!fParseNoFramesContent && "noframes".equals(enameLC)) {
                           scanUntilEndTag("noframes");
                        } else if (ename != null && !fSingleBoolean[0]
                                 && HTMLElements.getElement(enameLC).isSpecial()
                                 && (!ename.equalsIgnoreCase("TITLE") || isEnded(enameLC))) {
                           setScanner(fSpecialScanner.setElementName(ename));
                           setScannerState(STATE_CONTENT);
                           return true;
                        }
                     }
                     setScannerState(STATE_CONTENT);
                     break;
                  }
                  case STATE_START_DOCUMENT: {
                     if (fDocumentHandler != null && fElementCount >= fElementDepth) {
                        if (DEBUG_CALLBACKS) {
                           System.out.println("startDocument()");
                        }
                        XMLLocator locator = HTMLScanner.this;
                        String encoding = fIANAEncoding;
                        Augmentations augs = locationAugs();
                        NamespaceContext nscontext = new NamespaceSupport();
                        XercesBridge.getInstance().XMLDocumentHandler_startDocument(
                                 fDocumentHandler, locator, encoding, nscontext, augs);
                     }
                     if (fInsertDoctype && fDocumentHandler != null) {
                        String root = HTMLElements.getElement(HTMLElements.HTML).name;
                        root = modifyName(root, fNamesElems);
                        String pubid = fDoctypePubid;
                        String sysid = fDoctypeSysid;
                        fDocumentHandler.doctypeDecl(root, pubid, sysid, synthesizedAugs());
                     }
                     setScannerState(STATE_CONTENT);
                     break;
                  }
                  case STATE_END_DOCUMENT: {
                     if (fDocumentHandler != null && fElementCount >= fElementDepth && complete) {
                        if (DEBUG_CALLBACKS) {
                           System.out.println("endDocument()");
                        }
                        fEndLineNumber = fCurrentEntity.getLineNumber();
                        fEndColumnNumber = fCurrentEntity.getColumnNumber();
                        fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
                        fDocumentHandler.endDocument(locationAugs());
                     }
                     return false;
                  }
                  default: {
                     throw new RuntimeException("unknown scanner state: " + fScannerState);
                  }
               }
            } catch (EOFException e) {
               if (fCurrentEntityStack.empty()) {
                  setScannerState(STATE_END_DOCUMENT);
               } else {
                  fCurrentEntity = (CurrentEntity) fCurrentEntityStack.pop();
               }
               next = true;
            }
         } while (next || complete);
         return true;
      } // scan(boolean):boolean

      /** Adds location augmentations to the specified attribute. */
      protected void addLocationItem(XMLAttributes attributes, int index) {
         fEndLineNumber = fCurrentEntity.getLineNumber();
         fEndColumnNumber = fCurrentEntity.getColumnNumber();
         fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
         LocationItem locationItem = new LocationItem();
         locationItem.setValues(fBeginLineNumber, fBeginColumnNumber, fBeginCharacterOffset,
                  fEndLineNumber, fEndColumnNumber, fEndCharacterOffset);
         Augmentations augs = attributes.getAugmentations(index);
         augs.putItem(AUGMENTATIONS, locationItem);
      } // addLocationItem(XMLAttributes,int)

      /**
       * Reads the next characters WITHOUT impacting the buffer content up to current offset.
       *
       * @param len the number of characters to read
       * @return the read string (length may be smaller if EOF is encountered)
       */
      protected String nextContent(int len) throws IOException {
         final int originalOffset = fCurrentEntity.offset;
         final int originalColumnNumber = fCurrentEntity.getColumnNumber();
         final int originalCharacterOffset = fCurrentEntity.getCharacterOffset();

         char[] buff = new char[len];
         int nbRead = 0;
         for (nbRead = 0; nbRead < len; ++nbRead) {
            // read() should not clear the buffer
            if (fCurrentEntity.offset == fCurrentEntity.length) {
               if (fCurrentEntity.length == fCurrentEntity.buffer.length) {
                  fCurrentEntity.load(fCurrentEntity.buffer.length);
               } else { // everything was already loaded
                  break;
               }
            }

            int c = fCurrentEntity.read();
            if (c == -1) {
               break;
            } else {
               buff[nbRead] = (char) c;
            }
         }
         fCurrentEntity.restorePosition(originalOffset, originalColumnNumber,
                  originalCharacterOffset);
         return new String(buff, 0, nbRead);
      }

      /**
       * Scans a real attribute.
       *
       * @param attributes The list of attributes.
       * @param empty Is used for a second return value to indicate whether the start element tag is
       *           empty (e.g. "/&gt;").
       */
      protected boolean scanAttribute(XMLAttributesImpl attributes, boolean[] empty)
               throws IOException {
         return scanAttribute(attributes, empty, '/');
      } // scanAttribute(XMLAttributesImpl,boolean[]):boolean

      //
      // Protected methods
      //

      /**
       * Scans an attribute, pseudo or real.
       *
       * @param attributes The list of attributes.
       * @param empty Is used for a second return value to indicate whether the start element tag is
       *           empty (e.g. "/&gt;").
       * @param endc The end character that appears before the closing angle bracket ('>').
       */
      protected boolean scanAttribute(XMLAttributesImpl attributes, boolean[] empty, char endc)
               throws IOException {
         boolean skippedSpaces = skipSpaces();
         fBeginLineNumber = fCurrentEntity.getLineNumber();
         fBeginColumnNumber = fCurrentEntity.getColumnNumber();
         fBeginCharacterOffset = fCurrentEntity.getCharacterOffset();
         int c = fCurrentEntity.read();
         if (c == -1) {
            if (fReportErrors) {
               fErrorReporter.reportError("HTML1007", null);
            }
            return false;
         } else if (c == '>') {
            return false;
         } else if (c == '<') {
            fCurrentEntity.rewind();
            return false;
         }
         fCurrentEntity.rewind();
         String aname = scanName();
         if (aname == null) {
            if (fReportErrors) {
               fErrorReporter.reportError("HTML1011", null);
            }
            empty[0] = skipMarkup(false);
            return false;
         }
         if (!skippedSpaces && fReportErrors) {
            fErrorReporter.reportError("HTML1013", new Object[]{aname});
         }
         aname = modifyName(aname, fNamesAttrs);
         skipSpaces();
         c = fCurrentEntity.read();
         if (c == -1) {
            if (fReportErrors) {
               fErrorReporter.reportError("HTML1007", null);
            }
            throw new EOFException();
         }
         if (c == '/' || c == '>') {
            fQName.setValues(null, aname, aname, null);
            attributes.addAttribute(fQName, "CDATA", "");
            attributes.setSpecified(attributes.getLength() - 1, true);
            if (fAugmentations) {
               addLocationItem(attributes, attributes.getLength() - 1);
            }
            if (c == '/') {
               fCurrentEntity.rewind();
               empty[0] = skipMarkup(false);
            }
            return false;
         }
         /***
          * // REVISIT: [Q] Why is this still here? -Ac if (c == '/' || c == '>') { if (c == '/') {
          * fCurrentEntity.offset--; fCurrentEntity.columnNumber--; empty[0] = skipMarkup(false); }
          * fQName.setValues(null, aname, aname, null); attributes.addAttribute(fQName, "CDATA",
          * ""); attributes.setSpecified(attributes.getLength()-1, true); if (fAugmentations) {
          * addLocationItem(attributes, attributes.getLength() - 1); } return false; } /
          ***/
         if (c == '=') {
            skipSpaces();
            c = fCurrentEntity.read();
            if (c == -1) {
               if (fReportErrors) {
                  fErrorReporter.reportError("HTML1007", null);
               }
               throw new EOFException();
            }
            // Xiaowei/Ac: Fix for <a href=/cgi-bin/myscript>...</a>
            if (c == '>') {
               fQName.setValues(null, aname, aname, null);
               attributes.addAttribute(fQName, "CDATA", "");
               attributes.setSpecified(attributes.getLength() - 1, true);
               if (fAugmentations) {
                  addLocationItem(attributes, attributes.getLength() - 1);
               }
               return false;
            }
            fStringBuffer.clear();
            fNonNormAttr.clear();
            if (c != '\'' && c != '"') {
               fCurrentEntity.rewind();
               while (true) {
                  c = fCurrentEntity.read();
                  // Xiaowei/Ac: Fix for <a href=/broken/>...</a>
                  if (Character.isWhitespace((char) c) || c == '>') {
                     // fCharOffset--;
                     fCurrentEntity.rewind();
                     break;
                  }
                  if (c == -1) {
                     if (fReportErrors) {
                        fErrorReporter.reportError("HTML1007", null);
                     }
                     throw new EOFException();
                  }
                  if (c == '&') {
                     int ce = scanEntityRef(fStringBuffer2, false);
                     if (ce != -1) {
                        fStringBuffer.append((char) ce);
                     } else {
                        fStringBuffer.append(fStringBuffer2);
                     }
                     fNonNormAttr.append(fStringBuffer2);
                  } else {
                     fStringBuffer.append((char) c);
                     fNonNormAttr.append((char) c);
                  }
               }
               fQName.setValues(null, aname, aname, null);
               String avalue = fStringBuffer.toString();
               attributes.addAttribute(fQName, "CDATA", avalue);

               int lastattr = attributes.getLength() - 1;
               attributes.setSpecified(lastattr, true);
               attributes.setNonNormalizedValue(lastattr, fNonNormAttr.toString());
               if (fAugmentations) {
                  addLocationItem(attributes, attributes.getLength() - 1);
               }
               return true;
            }
            char quote = (char) c;
            boolean isStart = true;
            boolean prevSpace = false;
            do {
               boolean acceptSpace = !fNormalizeAttributes || (!isStart && !prevSpace);
               c = fCurrentEntity.read();
               if (c == -1) {
                  if (fReportErrors) {
                     fErrorReporter.reportError("HTML1007", null);
                  }
                  break;
                  // throw new EOFException();
               }
               if (c == '&') {
                  isStart = false;
                  int ce = scanEntityRef(fStringBuffer2, false);
                  if (ce != -1) {
                     fStringBuffer.append((char) ce);
                  } else {
                     fStringBuffer.append(fStringBuffer2);
                  }
                  fNonNormAttr.append(fStringBuffer2);
               } else if (c == ' ' || c == '\t') {
                  if (acceptSpace) {
                     fStringBuffer.append(fNormalizeAttributes ? ' ' : (char) c);
                  }
                  fNonNormAttr.append((char) c);
               } else if (c == '\r' || c == '\n') {
                  if (c == '\r') {
                     int c2 = fCurrentEntity.read();
                     if (c2 != '\n') {
                        fCurrentEntity.rewind();
                     } else {
                        fNonNormAttr.append('\r');
                        c = c2;
                     }
                  }
                  if (acceptSpace) {
                     fStringBuffer.append(fNormalizeAttributes ? ' ' : '\n');
                  }
                  fCurrentEntity.incLine();
                  fNonNormAttr.append((char) c);
               } else if (c != quote) {
                  isStart = false;
                  fStringBuffer.append((char) c);
                  fNonNormAttr.append((char) c);
               }
               prevSpace = c == ' ' || c == '\t' || c == '\r' || c == '\n';
               isStart = isStart && prevSpace;
            } while (c != quote);

            if (fNormalizeAttributes && fStringBuffer.length > 0) {
               // trailing whitespace already normalized to single space
               if (fStringBuffer.ch[fStringBuffer.length - 1] == ' ') {
                  fStringBuffer.length--;
               }
            }

            fQName.setValues(null, aname, aname, null);
            String avalue = fStringBuffer.toString();
            attributes.addAttribute(fQName, "CDATA", avalue);

            int lastattr = attributes.getLength() - 1;
            attributes.setSpecified(lastattr, true);
            attributes.setNonNormalizedValue(lastattr, fNonNormAttr.toString());
            if (fAugmentations) {
               addLocationItem(attributes, attributes.getLength() - 1);
            }
         } else {
            fQName.setValues(null, aname, aname, null);
            attributes.addAttribute(fQName, "CDATA", "");
            attributes.setSpecified(attributes.getLength() - 1, true);
            fCurrentEntity.rewind();
            if (fAugmentations) {
               addLocationItem(attributes, attributes.getLength() - 1);
            }
         }
         return true;
      } // scanAttribute(XMLAttributesImpl):boolean

      /** Scans a CDATA section. */
      protected void scanCDATA() throws IOException {
         fCurrentEntity.debugBufferIfNeeded("(scanCDATA: ");
         fStringBuffer.clear();
         if (fCDATASections) {
            if (fDocumentHandler != null && fElementCount >= fElementDepth) {
               fEndLineNumber = fCurrentEntity.getLineNumber();
               fEndColumnNumber = fCurrentEntity.getColumnNumber();
               fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
               if (DEBUG_CALLBACKS) {
                  System.out.println("startCDATA()");
               }
               fDocumentHandler.startCDATA(locationAugs());
            }
         } else {
            fStringBuffer.append("[CDATA[");
         }
         boolean eof = scanMarkupContent(fStringBuffer, ']');
         if (!fCDATASections) {
            fStringBuffer.append("]]");
         }
         if (fDocumentHandler != null && fElementCount >= fElementDepth) {
            fEndLineNumber = fCurrentEntity.getLineNumber();
            fEndColumnNumber = fCurrentEntity.getColumnNumber();
            fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
            if (fCDATASections) {
               if (DEBUG_CALLBACKS) {
                  System.out.println("characters(" + fStringBuffer + ")");
               }
               fDocumentHandler.characters(fStringBuffer, locationAugs());
               if (DEBUG_CALLBACKS) {
                  System.out.println("endCDATA()");
               }
               fDocumentHandler.endCDATA(locationAugs());
            } else {
               if (DEBUG_CALLBACKS) {
                  System.out.println("comment(" + fStringBuffer + ")");
               }
               fDocumentHandler.comment(fStringBuffer, locationAugs());
            }
         }
         fCurrentEntity.debugBufferIfNeeded(")scanCDATA: ");
         if (eof) {
            throw new EOFException();
         }
      } // scanCDATA()

      /** Scans characters. */
      protected void scanCharacters() throws IOException {
         fCurrentEntity.debugBufferIfNeeded("(scanCharacters: ");
         fStringBuffer.clear();
         while (true) {
            int newlines = skipNewlines();
            if (newlines == 0 && fCurrentEntity.offset == fCurrentEntity.length) {
               fCurrentEntity.debugBufferIfNeeded(")scanCharacters: ");
               break;
            }
            char c;
            int offset = fCurrentEntity.offset - newlines;
            for (int i = offset; i < fCurrentEntity.offset; i++) {
               fCurrentEntity.buffer[i] = '\n';
            }
            while (fCurrentEntity.hasNext()) {
               c = fCurrentEntity.getNextChar();
               if (c == '<' || c == '&' || c == '\n' || c == '\r') {
                  fCurrentEntity.rewind();
                  break;
               }
            }
            if (fCurrentEntity.offset > offset && fDocumentHandler != null
                     && fElementCount >= fElementDepth) {
               if (DEBUG_CALLBACKS) {
                  final XMLString xmlString = new XMLString(fCurrentEntity.buffer, offset,
                           fCurrentEntity.offset - offset);
                  System.out.println("characters(" + xmlString + ")");
               }
               fEndLineNumber = fCurrentEntity.getLineNumber();
               fEndColumnNumber = fCurrentEntity.getColumnNumber();
               fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
               fStringBuffer.append(fCurrentEntity.buffer, offset, fCurrentEntity.offset - offset);
            }
            fCurrentEntity.debugBufferIfNeeded(")scanCharacters: ");

            boolean hasNext = fCurrentEntity.offset < fCurrentEntity.buffer.length;
            int next = hasNext ? fCurrentEntity.getCurrentChar() : -1;

            if (next == '&' || next == '<' || next == -1) {
               break;
            }

         } // end while

         if (fStringBuffer.length != 0) {
            fDocumentHandler.characters(fStringBuffer, locationAugs());
         }

      } // scanCharacters()

      /** Scans a comment. */
      protected void scanComment() throws IOException {
         fCurrentEntity.debugBufferIfNeeded("(scanComment: ");
         fEndLineNumber = fCurrentEntity.getLineNumber();
         fEndColumnNumber = fCurrentEntity.getColumnNumber();
         fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
         XMLStringBuffer buffer = new XMLStringBuffer();
         boolean eof = scanMarkupContent(buffer, '-');
         // no --> found, comment with end only with >
         if (eof) {
            fCurrentEntity.resetBuffer(buffer, fEndLineNumber, fEndColumnNumber,
                     fEndCharacterOffset);
            buffer = new XMLStringBuffer(); // take a new one to avoid
                                            // interactions
            while (true) {
               int c = fCurrentEntity.read();
               if (c == -1) {
                  if (fReportErrors) {
                     fErrorReporter.reportError("HTML1007", null);
                  }
                  eof = true;
                  break;
               } else if (c != '>') {
                  buffer.append((char) c);
                  continue;
               } else if (c == '\n' || c == '\r') {
                  fCurrentEntity.rewind();
                  int newlines = skipNewlines();
                  for (int i = 0; i < newlines; i++) {
                     buffer.append('\n');
                  }
                  continue;
               }
               eof = false;
               break;
            }
         }
         if (fDocumentHandler != null && fElementCount >= fElementDepth) {
            if (DEBUG_CALLBACKS) {
               System.out.println("comment(" + buffer + ")");
            }
            fEndLineNumber = fCurrentEntity.getLineNumber();
            fEndColumnNumber = fCurrentEntity.getColumnNumber();
            fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
            fDocumentHandler.comment(buffer, locationAugs());
         }
         fCurrentEntity.debugBufferIfNeeded(")scanComment: ");
         if (eof) {
            throw new EOFException();
         }
      } // scanComment()

      /** Scans an end element. */
      protected void scanEndElement() throws IOException {
         String ename = scanName();
         if (fReportErrors && ename == null) {
            fErrorReporter.reportError("HTML1012", null);
         }
         skipMarkup(false);
         if (ename != null) {
            ename = modifyName(ename, fNamesElems);
            if (fDocumentHandler != null && fElementCount >= fElementDepth) {
               fQName.setValues(null, ename, ename, null);
               if (DEBUG_CALLBACKS) {
                  System.out.println("endElement(" + fQName + ")");
               }
               fEndLineNumber = fCurrentEntity.getLineNumber();
               fEndColumnNumber = fCurrentEntity.getColumnNumber();
               fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
               fDocumentHandler.endElement(fQName, locationAugs());
            }
         }
      } // scanEndElement()

      /** Scans markup content. */
      protected boolean scanMarkupContent(XMLStringBuffer buffer, char cend) throws IOException {
         int c = -1;
         OUTER : while (true) {
            c = fCurrentEntity.read();
            if (c == cend) {
               int count = 1;
               while (true) {
                  c = fCurrentEntity.read();
                  if (c == cend) {
                     count++;
                     continue;
                  }
                  break;
               }
               if (c == -1) {
                  if (fReportErrors) {
                     fErrorReporter.reportError("HTML1007", null);
                  }
                  break OUTER;
               }
               if (count < 2) {
                  buffer.append(cend);
                  // if (c != -1) {
                  fCurrentEntity.rewind();
                  // }
                  continue;
               }
               if (c != '>') {
                  for (int i = 0; i < count; i++) {
                     buffer.append(cend);
                  }
                  fCurrentEntity.rewind();
                  continue;
               }
               for (int i = 0; i < count - 2; i++) {
                  buffer.append(cend);
               }
               break;
            } else if (c == '\n' || c == '\r') {
               fCurrentEntity.rewind();
               int newlines = skipNewlines();
               for (int i = 0; i < newlines; i++) {
                  buffer.append('\n');
               }
               continue;
            } else if (c == -1) {
               if (fReportErrors) {
                  fErrorReporter.reportError("HTML1007", null);
               }
               break;
            }
            buffer.append((char) c);
         }
         return c == -1;
      } // scanMarkupContent(XMLStringBuffer,char):boolean

      /** Scans a processing instruction. */
      protected void scanPI() throws IOException {
         fCurrentEntity.debugBufferIfNeeded("(scanPI: ");
         if (fReportErrors) {
            fErrorReporter.reportWarning("HTML1008", null);
         }

         // scan processing instruction
         String target = scanName();
         if (target != null && !target.equalsIgnoreCase("xml")) {
            while (true) {
               int c = fCurrentEntity.read();
               if (c == '\r' || c == '\n') {
                  if (c == '\r') {
                     c = fCurrentEntity.read();
                     if (c != '\n') {
                        fCurrentEntity.offset--;
                        fCurrentEntity.characterOffset_--;
                     }
                  }
                  fCurrentEntity.incLine();
                  continue;
               }
               if (c == -1) {
                  break;
               }
               if (c != ' ' && c != '\t') {
                  fCurrentEntity.rewind();
                  break;
               }
            }
            fStringBuffer.clear();
            while (true) {
               int c = fCurrentEntity.read();
               if (c == '?' || c == '/') {
                  char c0 = (char) c;
                  c = fCurrentEntity.read();
                  if (c == '>') {
                     break;
                  } else {
                     fStringBuffer.append(c0);
                     fCurrentEntity.rewind();
                     continue;
                  }
               } else if (c == '\r' || c == '\n') {
                  fStringBuffer.append('\n');
                  if (c == '\r') {
                     c = fCurrentEntity.read();
                     if (c != '\n') {
                        fCurrentEntity.offset--;
                        fCurrentEntity.characterOffset_--;
                     }
                  }
                  fCurrentEntity.incLine();
                  continue;
               } else if (c == -1) {
                  break;
               } else {
                  fStringBuffer.append((char) c);
               }
            }
            XMLString data = fStringBuffer;
            if (fDocumentHandler != null) {
               fEndLineNumber = fCurrentEntity.getLineNumber();
               fEndColumnNumber = fCurrentEntity.getColumnNumber();
               fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
               fDocumentHandler.processingInstruction(target, data, locationAugs());
            }
         }

         // scan xml/text declaration
         else {
            int beginLineNumber = fBeginLineNumber;
            int beginColumnNumber = fBeginColumnNumber;
            int beginCharacterOffset = fBeginCharacterOffset;
            fAttributes.removeAllAttributes();
            int aindex = 0;
            while (scanPseudoAttribute(fAttributes)) {
               // if we haven't scanned a value, remove the entry as values have
               // special signification
               if (fAttributes.getValue(aindex).length() == 0) {
                  fAttributes.removeAttributeAt(aindex);
               } else {
                  fAttributes.getName(aindex, fQName);
                  fQName.rawname = fQName.rawname.toLowerCase();
                  fAttributes.setName(aindex, fQName);
                  aindex++;
               }
            }
            if (fDocumentHandler != null) {
               String version = fAttributes.getValue("version");
               String encoding = fAttributes.getValue("encoding");
               String standalone = fAttributes.getValue("standalone");

               // if the encoding is successfully changed, the stream will be
               // processed again
               // with the right encoding an we will come here again but without
               // need to change the encoding
               final boolean xmlDeclNow = fIgnoreSpecifiedCharset || !changeEncoding(encoding);
               if (xmlDeclNow) {
                  fBeginLineNumber = beginLineNumber;
                  fBeginColumnNumber = beginColumnNumber;
                  fBeginCharacterOffset = beginCharacterOffset;
                  fEndLineNumber = fCurrentEntity.getLineNumber();
                  fEndColumnNumber = fCurrentEntity.getColumnNumber();
                  fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
                  fDocumentHandler.xmlDecl(version, encoding, standalone, locationAugs());
               }
            }
         }

         fCurrentEntity.debugBufferIfNeeded(")scanPI: ");
      } // scanPI()

      /**
       * Scans a pseudo attribute.
       *
       * @param attributes The list of attributes.
       */
      protected boolean scanPseudoAttribute(XMLAttributesImpl attributes) throws IOException {
         return scanAttribute(attributes, fSingleBoolean, '?');
      } // scanPseudoAttribute(XMLAttributesImpl):boolean

      /**
       * Scans a start element.
       *
       * @param empty Is used for a second return value to indicate whether the start element tag is
       *           empty (e.g. "/&gt;").
       */
      protected String scanStartElement(boolean[] empty) throws IOException {
         String ename = scanName();
         int length = ename != null ? ename.length() : 0;
         int c = length > 0 ? ename.charAt(0) : -1;
         if (length == 0 || !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
            if (fReportErrors) {
               fErrorReporter.reportError("HTML1009", null);
            }
            if (fDocumentHandler != null && fElementCount >= fElementDepth) {
               fStringBuffer.clear();
               fStringBuffer.append('<');
               if (length > 0) {
                  fStringBuffer.append(ename);
               }
               fDocumentHandler.characters(fStringBuffer, null);
            }
            return null;
         }
         ename = modifyName(ename, fNamesElems);
         fAttributes.removeAllAttributes();
         int beginLineNumber = fBeginLineNumber;
         int beginColumnNumber = fBeginColumnNumber;
         int beginCharacterOffset = fBeginCharacterOffset;
         while (scanAttribute(fAttributes, empty)) {
            // do nothing
         }
         fBeginLineNumber = beginLineNumber;
         fBeginColumnNumber = beginColumnNumber;
         fBeginCharacterOffset = beginCharacterOffset;
         if (fByteStream != null && fElementDepth == -1) {
            if (ename.equalsIgnoreCase("META")) {
               if (DEBUG_CHARSET) {
                  System.out.println("+++ <META>");
               }
               String httpEquiv = getValue(fAttributes, "http-equiv");
               if (httpEquiv != null && httpEquiv.equalsIgnoreCase("content-type")) {
                  if (DEBUG_CHARSET) {
                     System.out.println("+++ @content-type: \"" + httpEquiv + '"');
                  }
                  String content = getValue(fAttributes, "content");
                  if (content != null) {
                     content = removeSpaces(content);
                     int index1 = content.toLowerCase().indexOf("charset=");
                     if (index1 != -1 && !fIgnoreSpecifiedCharset) {
                        final int index2 = content.indexOf(';', index1);
                        final String charset = index2 != -1 ? content.substring(index1 + 8, index2)
                                 : content.substring(index1 + 8);
                        changeEncoding(charset);
                     }
                  }
               }
            } else if (ename.equalsIgnoreCase("BODY")) {
               fByteStream.clear();
               fByteStream = null;
            } else {
               HTMLElements.Element element = HTMLElements.getElement(ename);
               if (element.parent != null && element.parent.length > 0) {
                  if (element.parent[0].code == HTMLElements.BODY) {
                     fByteStream.clear();
                     fByteStream = null;
                  }
               }
            }
         }
         if (fDocumentHandler != null && fElementCount >= fElementDepth) {
            fQName.setValues(null, ename, ename, null);
            if (DEBUG_CALLBACKS) {
               System.out.println("startElement(" + fQName + ',' + fAttributes + ")");
            }
            fEndLineNumber = fCurrentEntity.getLineNumber();
            fEndColumnNumber = fCurrentEntity.getColumnNumber();
            fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
            if (empty[0]) {
               fDocumentHandler.emptyElement(fQName, fAttributes, locationAugs());
            } else {
               fDocumentHandler.startElement(fQName, fAttributes, locationAugs());
            }
         }
         return ename;
      } // scanStartElement():ename

      /**
       * Tries to change the encoding used to read the input stream to the specified one
       *
       * @param charset the charset that should be used
       * @return <code>true</code> when the encoding has been changed
       */
      private boolean changeEncoding(String charset) {
         if (charset == null || fByteStream == null) {
            return false;
         }
         charset = charset.trim();
         boolean encodingChanged = false;
         try {
            String ianaEncoding = charset;
            String javaEncoding = EncodingMap.getIANA2JavaMapping(ianaEncoding.toUpperCase());
            if (DEBUG_CHARSET) {
               System.out.println("+++ ianaEncoding: " + ianaEncoding);
               System.out.println("+++ javaEncoding: " + javaEncoding);
            }
            if (javaEncoding == null) {
               javaEncoding = ianaEncoding;
               if (fReportErrors) {
                  fErrorReporter.reportError("HTML1001", new Object[]{ianaEncoding});
               }
            }
            // patch: Marc Guillemot
            if (!javaEncoding.equals(fJavaEncoding)) {
               if (!isEncodingCompatible(javaEncoding, fJavaEncoding)) {
                  if (fReportErrors) {
                     fErrorReporter.reportError("HTML1015", new Object[]{
                              javaEncoding, fJavaEncoding});
                  }
               }
               // change the charset
               else {
                  fIso8859Encoding = ianaEncoding == null
                           || ianaEncoding.toUpperCase().startsWith("ISO-8859")
                           || ianaEncoding.equalsIgnoreCase(fDefaultIANAEncoding);
                  fJavaEncoding = javaEncoding;
                  fCurrentEntity.setStream(new InputStreamReader(fByteStream, javaEncoding));
                  fByteStream.playback();
                  fElementDepth = fElementCount;
                  fElementCount = 0;
                  encodingChanged = true;
               }
            }
         } catch (UnsupportedEncodingException e) {
            if (fReportErrors) {
               fErrorReporter.reportError("HTML1010", new Object[]{charset});
            }
            // NOTE: If the encoding change doesn't work,
            // then there's no point in continuing to
            // buffer the input stream.
            fByteStream.clear();
            fByteStream = null;
         }
         return encodingChanged;
      }

      /**
       * Returns true if the given element has an end-tag.
       */
      private boolean isEnded(String ename) {
         String content = new String(fCurrentEntity.buffer, fCurrentEntity.offset,
                  fCurrentEntity.length - fCurrentEntity.offset);
         return content.toLowerCase().indexOf("</" + ename.toLowerCase() + ">") != -1;
      }

      /**
       * Removes all spaces for the string (remember: JDK 1.3!)
       */
      private String removeSpaces(final String content) {
         StringBuffer sb = null;
         for (int i = content.length() - 1; i >= 0; --i) {
            if (Character.isWhitespace(content.charAt(i))) {
               if (sb == null) {
                  sb = new StringBuffer(content);
               }
               sb.deleteCharAt(i);
            }
         }
         return (sb == null) ? content : sb.toString();
      }

      private void scanScriptContent() throws IOException {

         final XMLStringBuffer buffer = new XMLStringBuffer();
         boolean waitForEndComment = false;
         while (true) {
            int c = fCurrentEntity.read();
            if (c == -1) {
               break;
            } else if (c == '-' && endsWith(buffer, "<!-")) {
               waitForEndComment = endCommentAvailable();
            } else if (!waitForEndComment && c == '<') {
               final String next = nextContent(8) + " ";
               if (next.length() >= 8 && "/script".equalsIgnoreCase(next.substring(0, 7))
                        && ('>' == next.charAt(7) || Character.isWhitespace(next.charAt(7)))) {
                  fCurrentEntity.rewind();
                  break;
               }
            } else if (c == '>' && endsWith(buffer, "--")) {
               waitForEndComment = false;
            }

            if (c == '\r' || c == '\n') {
               fCurrentEntity.rewind();
               int newlines = skipNewlines();
               for (int i = 0; i < newlines; i++) {
                  buffer.append('\n');
               }
            } else {
               buffer.append((char) c);
            }
         }

         if (fScriptStripCommentDelims) {
            reduceToContent(buffer, "<!--", "-->");
         }
         if (fScriptStripCDATADelims) {
            reduceToContent(buffer, "<![CDATA[", "]]>");
         }

         if (buffer.length > 0 && fDocumentHandler != null && fElementCount >= fElementDepth) {
            if (DEBUG_CALLBACKS) {
               System.out.println("characters(" + buffer + ")");
            }
            fEndLineNumber = fCurrentEntity.getLineNumber();
            fEndColumnNumber = fCurrentEntity.getColumnNumber();
            fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
            fDocumentHandler.characters(buffer, locationAugs());
         }
      }

      //
      // Private methods
      //

      /**
       * Scans the content of <noscript>: it doesn't get parsed but is considered as plain text when
       * feature {@link HTMLScanner#PARSE_NOSCRIPT_CONTENT} is set to false.
       *
       * @param the tag for which content is scanned (one of "noscript", "noframes", "iframe")
       * @throws IOException
       */
      private void scanUntilEndTag(final String tagName) throws IOException {
         final XMLStringBuffer buffer = new XMLStringBuffer();
         final String end = "/" + tagName;
         final int lengthToScan = tagName.length() + 2;

         while (true) {
            int c = fCurrentEntity.read();
            if (c == -1) {
               break;
            }
            if (c == '<') {
               final String next = nextContent(lengthToScan) + " ";
               if (next.length() >= lengthToScan
                        && end.equalsIgnoreCase(next.substring(0, end.length()))
                        && ('>' == next.charAt(lengthToScan - 1) || Character.isWhitespace(next.charAt(lengthToScan - 1)))) {
                  fCurrentEntity.rewind();
                  break;
               }
            }
            if (c == '\r' || c == '\n') {
               fCurrentEntity.rewind();
               int newlines = skipNewlines();
               for (int i = 0; i < newlines; i++) {
                  buffer.append('\n');
               }
            } else {
               buffer.append((char) c);
            }
         }
         if (buffer.length > 0 && fDocumentHandler != null) {
            fEndLineNumber = fCurrentEntity.getLineNumber();
            fEndColumnNumber = fCurrentEntity.getColumnNumber();
            fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
            fDocumentHandler.characters(buffer, locationAugs());
         }
      }

   } // class ContentScanner

   /**
    * Current entity.
    *
    * @author Andy Clark
    */
   public static class CurrentEntity {

      //
      // Data
      //

      /** Base system identifier. */
      public final String baseSystemId;

      /** Character buffer. */
      public char[] buffer = new char[DEFAULT_BUFFER_SIZE];

      /** Character offset in the file. */
      public int characterOffset_ = 0;

      /** Expanded system identifier. */
      public final String expandedSystemId;

      /** Length of characters read into character buffer. */
      public int length = 0;

      /** Literal system identifier. */
      public final String literalSystemId;

      /** Offset into character buffer. */
      public int offset = 0;

      /** Public identifier. */
      public final String publicId;

      /** XML version. */
      public final String version = "1.0";

      /** Column number. */
      private int columnNumber_ = 1;

      // buffer

      /** Encoding. */
      private String encoding;

      private boolean endReached_ = false;

      /** Line number. */
      private int lineNumber_ = 1;

      /** Character stream. */
      private Reader stream_;

      //
      // Constructors
      //

      /** Constructs an entity from the specified stream. */
      public CurrentEntity(Reader stream, String encoding, String publicId, String baseSystemId,
               String literalSystemId, String expandedSystemId) {
         stream_ = stream;
         this.encoding = encoding;
         this.publicId = publicId;
         this.baseSystemId = baseSystemId;
         this.literalSystemId = literalSystemId;
         this.expandedSystemId = expandedSystemId;
      } // <init>(Reader,String,String,String,String)

      public int getLineNumber() {
         return lineNumber_;
      }

      /**
       * Loads a new chunk of data into the buffer and returns the number of characters loaded or -1
       * if no additional characters were loaded.
       *
       * @param offset The offset at which new characters should be loaded.
       */
      protected int load(int offset) throws IOException {
         debugBufferIfNeeded("(load: ");
         // resize buffer, if needed
         if (offset == buffer.length) {
            int adjust = buffer.length / 4;
            char[] array = new char[buffer.length + adjust];
            System.arraycopy(buffer, 0, array, 0, length);
            buffer = array;
         }
         // read a block of characters
         int count = stream_.read(buffer, offset, buffer.length - offset);
         if (count == -1) {
            endReached_ = true;
         }
         length = count != -1 ? count + offset : offset;
         this.offset = offset;
         debugBufferIfNeeded(")load: ", " -> " + count);
         return count;
      } // load():int

      /** Reads a single character. */
      protected int read() throws IOException {
         debugBufferIfNeeded("(read: ");
         if (offset == length) {
            if (endReached_) {
               return -1;
            }
            if (load(0) == -1) {
               if (DEBUG_BUFFER) {
                  System.out.println(")read: -> -1");
               }
               return -1;
            }
         }
         final char c = buffer[offset++];
         characterOffset_++;
         columnNumber_++;

         debugBufferIfNeeded(")read: ", " -> " + c);
         return c;
      } // read():int

      /**
       * Indicates if there are characters left.
       */
      boolean hasNext() {
         return offset < length;
      }

      private void closeQuietly() {
         try {
            stream_.close();
         } catch (IOException e) {
            // ignore
         }
      }

      /** Prints the contents of the character buffer to standard out. */
      private void debugBufferIfNeeded(final String prefix) {
         debugBufferIfNeeded(prefix, "");
      }

      /** Prints the contents of the character buffer to standard out. */
      private void debugBufferIfNeeded(final String prefix, final String suffix) {
         if (DEBUG_BUFFER) {
            System.out.print(prefix);
            System.out.print('[');
            System.out.print(length);
            System.out.print(' ');
            System.out.print(offset);
            if (length > 0) {
               System.out.print(" \"");
               for (int i = 0; i < length; i++) {
                  if (i == offset) {
                     System.out.print('^');
                  }
                  char c = buffer[i];
                  switch (c) {
                     case '\r': {
                        System.out.print("\\r");
                        break;
                     }
                     case '\n': {
                        System.out.print("\\n");
                        break;
                     }
                     case '\t': {
                        System.out.print("\\t");
                        break;
                     }
                     case '"': {
                        System.out.print("\\\"");
                        break;
                     }
                     default: {
                        System.out.print(c);
                     }
                  }
               }
               if (offset == length) {
                  System.out.print('^');
               }
               System.out.print('"');
            }
            System.out.print(']');
            System.out.print(suffix);
            System.out.println();
         }
      } // printBuffer()

      private int getCharacterOffset() {
         return characterOffset_;
      }

      private int getColumnNumber() {
         return columnNumber_;
      }

      private char getCurrentChar() {
         return buffer[offset];
      }

      /**
       * Gets the current character and moves to next one.
       *
       * @return
       */
      private char getNextChar() {
         characterOffset_++;
         columnNumber_++;
         return buffer[offset++];
      }

      private void incLine() {
         lineNumber_++;
         columnNumber_ = 1;
      }

      private void incLine(int nbLines) {
         lineNumber_ += nbLines;
         columnNumber_ = 1;
      }

      private void resetBuffer(final XMLStringBuffer buffer, final int lineNumber,
               final int columnNumber, final int characterOffset) {
         lineNumber_ = lineNumber;
         columnNumber_ = columnNumber;
         this.characterOffset_ = characterOffset;
         this.buffer = buffer.ch;
         this.offset = buffer.offset;
         this.length = buffer.length;
      }

      private void restorePosition(int originalOffset, int originalColumnNumber,
               int originalCharacterOffset) {
         this.offset = originalOffset;
         this.columnNumber_ = originalColumnNumber;
         this.characterOffset_ = originalCharacterOffset;
      }

      /**
       * Goes back, cancelling the effect of the previous read() call.
       */
      private void rewind() {
         offset--;
         characterOffset_--;
         columnNumber_--;
      }

      private void rewind(int i) {
         offset -= i;
         characterOffset_ -= i;
         columnNumber_ -= i;
      }

      private void setStream(final InputStreamReader inputStreamReader) {
         stream_ = inputStreamReader;
         offset = length = characterOffset_ = 0;
         lineNumber_ = columnNumber_ = 1;
         encoding = inputStreamReader.getEncoding();
      }
   } // class CurrentEntity

   // doctype info: HTML 4.01 loose

   /**
    * A playback input stream. This class has the ability to save the bytes read from the underlying
    * input stream and play the bytes back later. This class is used by the HTML scanner to switch
    * encodings when a &lt;meta&gt; tag is detected that specifies a different encoding.
    * <p>
    * If the encoding is changed, then the scanner calls the <code>playback</code> method and
    * re-scans the beginning of the HTML document again. This should not be too much of a
    * performance problem because the &lt;meta&gt; tag appears at the beginning of the document.
    * <p>
    * If the &lt;body&gt; tag is reached without playing back the bytes, then the buffer can be
    * cleared by calling the <code>clear</code> method. This stops the buffering of bytes and allows
    * the memory used by the buffer to be reclaimed.
    * <p>
    * <strong>Note:</strong> If the buffer is never played back or cleared, this input stream will
    * continue to buffer the entire stream. Therefore, it is very important to use this stream
    * correctly.
    *
    * @author Andy Clark
    */
   public static class PlaybackInputStream extends FilterInputStream {

      //
      // Constants
      //

      /** Set to true to debug playback. */
      private static final boolean DEBUG_PLAYBACK = false;

      //
      // Data
      //

      // state

      /** Pushback length. */
      public int fPushbackLength = 0;

      /** Pushback offset. */
      public int fPushbackOffset = 0;

      /** Byte buffer. */
      protected byte[] fByteBuffer = new byte[1024];

      // buffer info

      /** Length of bytes read into byte buffer. */
      protected int fByteLength = 0;

      /** Offset into byte buffer during playback. */
      protected int fByteOffset = 0;

      /** Buffer cleared. */
      protected boolean fCleared = false;

      /** Encoding detected. */
      protected boolean fDetected = false;

      /** Playback mode. */
      protected boolean fPlayback = false;

      //
      // Constructors
      //

      /** Constructor. */
      public PlaybackInputStream(InputStream in) {
         super(in);
      } // <init>(InputStream)

      //
      // Public methods
      //

      /**
       * Clears the buffer.
       * <p>
       * <strong>Note:</strong> The buffer cannot be cleared during playback. Therefore, calling
       * this method during playback will not do anything. However, the buffer will be cleared
       * automatically at the end of playback.
       */
      public void clear() {
         if (!fPlayback) {
            fCleared = true;
            fByteBuffer = null;
         }
      } // clear()

      /** Detect encoding. */
      public void detectEncoding(String[] encodings) throws IOException {
         if (fDetected) {
            throw new IOException("Should not detect encoding twice.");
         }
         fDetected = true;
         int b1 = read();
         if (b1 == -1) {
            return;
         }
         int b2 = read();
         if (b2 == -1) {
            fPushbackLength = 1;
            return;
         }
         // UTF-8 BOM: 0xEFBBBF
         if (b1 == 0xEF && b2 == 0xBB) {
            int b3 = read();
            if (b3 == 0xBF) {
               fPushbackOffset = 3;
               encodings[0] = "UTF-8";
               encodings[1] = "UTF8";
               return;
            }
            fPushbackLength = 3;
         }
         // UTF-16 LE BOM: 0xFFFE
         if (b1 == 0xFF && b2 == 0xFE) {
            encodings[0] = "UTF-16";
            encodings[1] = "UnicodeLittleUnmarked";
            return;
         }
         // UTF-16 BE BOM: 0xFEFF
         else if (b1 == 0xFE && b2 == 0xFF) {
            encodings[0] = "UTF-16";
            encodings[1] = "UnicodeBigUnmarked";
            return;
         }
         // unknown
         fPushbackLength = 2;
      } // detectEncoding()

      /** Playback buffer contents. */
      public void playback() {
         fPlayback = true;
      } // playback()

      //
      // InputStream methods
      //

      /** Read a byte. */
      public int read() throws IOException {
         if (DEBUG_PLAYBACK) {
            System.out.println("(read");
         }
         if (fPushbackOffset < fPushbackLength) {
            return fByteBuffer[fPushbackOffset++];
         }
         if (fCleared) {
            return in.read();
         }
         if (fPlayback) {
            int c = fByteBuffer[fByteOffset++];
            if (fByteOffset == fByteLength) {
               fCleared = true;
               fByteBuffer = null;
            }
            if (DEBUG_PLAYBACK) {
               System.out.println(")read -> " + (char) c);
            }
            return c;
         }
         int c = in.read();
         if (c != -1) {
            if (fByteLength == fByteBuffer.length) {
               byte[] newarray = new byte[fByteLength + 1024];
               System.arraycopy(fByteBuffer, 0, newarray, 0, fByteLength);
               fByteBuffer = newarray;
            }
            fByteBuffer[fByteLength++] = (byte) c;
         }
         if (DEBUG_PLAYBACK) {
            System.out.println(")read -> " + (char) c);
         }
         return c;
      } // read():int

      /** Read an array of bytes. */
      public int read(byte[] array) throws IOException {
         return read(array, 0, array.length);
      } // read(byte[]):int

      /** Read an array of bytes. */
      public int read(byte[] array, int offset, int length) throws IOException {
         if (DEBUG_PLAYBACK) {
            System.out.println(")read(" + offset + ',' + length + ')');
         }
         if (fPushbackOffset < fPushbackLength) {
            int count = fPushbackLength - fPushbackOffset;
            if (count > length) {
               count = length;
            }
            System.arraycopy(fByteBuffer, fPushbackOffset, array, offset, count);
            fPushbackOffset += count;
            return count;
         }
         if (fCleared) {
            return in.read(array, offset, length);
         }
         if (fPlayback) {
            if (fByteOffset + length > fByteLength) {
               length = fByteLength - fByteOffset;
            }
            System.arraycopy(fByteBuffer, fByteOffset, array, offset, length);
            fByteOffset += length;
            if (fByteOffset == fByteLength) {
               fCleared = true;
               fByteBuffer = null;
            }
            return length;
         }
         int count = in.read(array, offset, length);
         if (count != -1) {
            if (fByteLength + count > fByteBuffer.length) {
               byte[] newarray = new byte[fByteLength + count + 512];
               System.arraycopy(fByteBuffer, 0, newarray, 0, fByteLength);
               fByteBuffer = newarray;
            }
            System.arraycopy(array, offset, fByteBuffer, fByteLength, count);
            fByteLength += count;
         }
         if (DEBUG_PLAYBACK) {
            System.out.println(")read(" + offset + ',' + length + ") -> " + count);
         }
         return count;
      } // read(byte[]):int

   } // class PlaybackInputStream

   /**
    * Basic scanner interface.
    *
    * @author Andy Clark
    */
   public interface Scanner {

      //
      // Scanner methods
      //

      /**
       * Scans part of the document. This interface allows scanning to be performed in a pulling
       * manner.
       *
       * @param complete True if the scanner should not return until scanning is complete.
       *
       * @return True if additional scanning is required.
       *
       * @throws IOException Thrown if I/O error occurs.
       */
      public boolean scan(boolean complete) throws IOException;

   } // interface Scanner

   // doctype info: HTML 4.01 frameset

   /**
    * Special scanner used for elements whose content needs to be scanned as plain text, ignoring
    * markup such as elements and entity references. For example: &lt;SCRIPT&gt; and
    * &lt;COMMENT&gt;.
    *
    * @author Andy Clark
    */
   public class SpecialScanner implements Scanner {

      //
      // Data
      //

      /** Name of element whose content needs to be scanned as text. */
      protected String fElementName;

      /** True if &lt;style&gt; element. */
      protected boolean fStyle;

      /** True if &lt;textarea&gt; element. */
      protected boolean fTextarea;

      /** True if &lt;title&gt; element. */
      protected boolean fTitle;

      // temp vars

      /** A qualified name. */
      private final QName fQName = new QName();

      /** A string buffer. */
      private final XMLStringBuffer fStringBuffer = new XMLStringBuffer();

      //
      // Public methods
      //

      /** Scan. */
      public boolean scan(boolean complete) throws IOException {
         boolean next;
         do {
            try {
               next = false;
               switch (fScannerState) {
                  case STATE_CONTENT: {
                     fBeginLineNumber = fCurrentEntity.getLineNumber();
                     fBeginColumnNumber = fCurrentEntity.getColumnNumber();
                     fBeginCharacterOffset = fCurrentEntity.getCharacterOffset();
                     int c = fCurrentEntity.read();
                     if (c == '<') {
                        setScannerState(STATE_MARKUP_BRACKET);
                        continue;
                     }
                     if (c == '&') {
                        if (fTextarea || fTitle) {
                           scanEntityRef(fStringBuffer, true);
                           continue;
                        }
                        fStringBuffer.clear();
                        fStringBuffer.append('&');
                     } else if (c == -1) {
                        if (fReportErrors) {
                           fErrorReporter.reportError("HTML1007", null);
                        }
                        throw new EOFException();
                     } else {
                        fCurrentEntity.rewind();
                        fStringBuffer.clear();
                     }
                     scanCharacters(fStringBuffer, -1);
                     break;
                  } // case STATE_CONTENT
                  case STATE_MARKUP_BRACKET: {
                     int delimiter = -1;
                     int c = fCurrentEntity.read();
                     if (c == '/') {
                        String ename = scanName();
                        if (ename != null) {
                           if (ename.equalsIgnoreCase(fElementName)) {
                              if (fCurrentEntity.read() == '>') {
                                 ename = modifyName(ename, fNamesElems);
                                 if (fDocumentHandler != null && fElementCount >= fElementDepth) {
                                    fQName.setValues(null, ename, ename, null);
                                    if (DEBUG_CALLBACKS) {
                                       System.out.println("endElement(" + fQName + ")");
                                    }
                                    fEndLineNumber = fCurrentEntity.getLineNumber();
                                    fEndColumnNumber = fCurrentEntity.getColumnNumber();
                                    fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
                                    fDocumentHandler.endElement(fQName, locationAugs());
                                 }
                                 setScanner(fContentScanner);
                                 setScannerState(STATE_CONTENT);
                                 return true;
                              } else {
                                 fCurrentEntity.rewind();
                              }
                           }
                           fStringBuffer.clear();
                           fStringBuffer.append("</");
                           fStringBuffer.append(ename);
                        } else {
                           fStringBuffer.clear();
                           fStringBuffer.append("</");
                        }
                     } else {
                        fStringBuffer.clear();
                        fStringBuffer.append('<');
                        fStringBuffer.append((char) c);
                     }
                     scanCharacters(fStringBuffer, delimiter);
                     setScannerState(STATE_CONTENT);
                     break;
                  } // case STATE_MARKUP_BRACKET
               } // switch
            } // try
            catch (EOFException e) {
               setScanner(fContentScanner);
               if (fCurrentEntityStack.empty()) {
                  setScannerState(STATE_END_DOCUMENT);
               } else {
                  fCurrentEntity = (CurrentEntity) fCurrentEntityStack.pop();
                  setScannerState(STATE_CONTENT);
               }
               return true;
            }
         } // do
         while (next || complete);
         return true;
      } // scan(boolean):boolean

      //
      // Scanner methods
      //

      /** Sets the element name. */
      public Scanner setElementName(String ename) {
         fElementName = ename;
         fStyle = fElementName.equalsIgnoreCase("STYLE");
         fTextarea = fElementName.equalsIgnoreCase("TEXTAREA");
         fTitle = fElementName.equalsIgnoreCase("TITLE");
         return this;
      } // setElementName(String):Scanner

      //
      // Protected methods
      //

      /** Scan characters. */
      protected void scanCharacters(XMLStringBuffer buffer, int delimiter) throws IOException {
         fCurrentEntity.debugBufferIfNeeded("(scanCharacters, delimiter=" + delimiter + ": ");

         while (true) {
            int c = fCurrentEntity.read();

            if (c == -1 || (c == '<' || c == '&')) {
               if (c != -1) {
                  fCurrentEntity.rewind();
               }
               break;
            }
            // Patch supplied by Jonathan Baxter
            else if (c == '\r' || c == '\n') {
               fCurrentEntity.rewind();
               int newlines = skipNewlines();
               for (int i = 0; i < newlines; i++) {
                  buffer.append('\n');
               }
            } else {
               buffer.append((char) c);
               if (c == '\n') {
                  fCurrentEntity.incLine();
               }
            }
         }

         if (fStyle) {
            if (fStyleStripCommentDelims) {
               reduceToContent(buffer, "<!--", "-->");
            }
            if (fStyleStripCDATADelims) {
               reduceToContent(buffer, "<![CDATA[", "]]>");
            }
         }

         if (buffer.length > 0 && fDocumentHandler != null && fElementCount >= fElementDepth) {
            if (DEBUG_CALLBACKS) {
               System.out.println("characters(" + buffer + ")");
            }
            fEndLineNumber = fCurrentEntity.getLineNumber();
            fEndColumnNumber = fCurrentEntity.getColumnNumber();
            fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
            fDocumentHandler.characters(buffer, locationAugs());
         }
         fCurrentEntity.debugBufferIfNeeded(")scanCharacters: ");
      } // scanCharacters(StringBuffer)
   } // class SpecialScanner

   /**
    * Location infoset item.
    *
    * @author Andy Clark
    */
   protected static class LocationItem implements HTMLEventInfo, Cloneable {

      //
      // Data
      //

      /** Beginning character offset. */
      protected int fBeginCharacterOffset;

      /** Beginning column number. */
      protected int fBeginColumnNumber;

      /** Beginning line number. */
      protected int fBeginLineNumber;

      /** Ending character offset. */
      protected int fEndCharacterOffset;

      /** Ending column number. */
      protected int fEndColumnNumber;

      /** Ending line number. */
      protected int fEndLineNumber;

      //
      // Public methods
      //
      public LocationItem() {
         // nothing
      }

      LocationItem(final LocationItem other) {
         setValues(other.fBeginLineNumber, other.fBeginColumnNumber, other.fBeginCharacterOffset,
                  other.fEndLineNumber, other.fEndColumnNumber, other.fEndCharacterOffset);
      }

      /** Returns the character offset of the beginning of this event. */
      public int getBeginCharacterOffset() {
         return fBeginCharacterOffset;
      } // getBeginCharacterOffset():int

      //
      // HTMLEventInfo methods
      //

      // location information

      /** Returns the column number of the beginning of this event. */
      public int getBeginColumnNumber() {
         return fBeginColumnNumber;
      } // getBeginColumnNumber():int

      /** Returns the line number of the beginning of this event. */
      public int getBeginLineNumber() {
         return fBeginLineNumber;
      } // getBeginLineNumber():int

      /** Returns the character offset of the end of this event. */
      public int getEndCharacterOffset() {
         return fEndCharacterOffset;
      } // getEndCharacterOffset():int

      /** Returns the column number of the end of this event. */
      public int getEndColumnNumber() {
         return fEndColumnNumber;
      } // getEndColumnNumber():int

      /** Returns the line number of the end of this event. */
      public int getEndLineNumber() {
         return fEndLineNumber;
      } // getEndLineNumber():int

      /** Returns true if this corresponding event was synthesized. */
      public boolean isSynthesized() {
         return false;
      } // isSynthesize():boolean

      // other information

      /** Sets the values of this item. */
      public void setValues(int beginLine, int beginColumn, int beginOffset, int endLine,
               int endColumn, int endOffset) {
         fBeginLineNumber = beginLine;
         fBeginColumnNumber = beginColumn;
         fBeginCharacterOffset = beginOffset;
         fEndLineNumber = endLine;
         fEndColumnNumber = endColumn;
         fEndCharacterOffset = endOffset;
      } // setValues(int,int,int,int)

      //
      // Object methods
      //

      /** Returns a string representation of this object. */
      public String toString() {
         StringBuffer str = new StringBuffer();
         str.append(fBeginLineNumber);
         str.append(':');
         str.append(fBeginColumnNumber);
         str.append(':');
         str.append(fBeginCharacterOffset);
         str.append(':');
         str.append(fEndLineNumber);
         str.append(':');
         str.append(fEndColumnNumber);
         str.append(':');
         str.append(fEndCharacterOffset);
         return str.toString();
      } // toString():String

   } // class LocationItem

   // features

   /** Allows self closing &lt;iframe/&gt; tag */
   public static final String ALLOW_SELFCLOSING_IFRAME = "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe";

   /** Scan CDATA sections. */
   public static final String CDATA_SECTIONS = "http://cyberneko.org/html/features/scanner/cdata-sections";

   /** Fix Microsoft Windows&reg; character entity references. */
   public static final String FIX_MSWINDOWS_REFS = "http://cyberneko.org/html/features/scanner/fix-mswindows-refs";

   /**
    * HTML 4.01 frameset public identifier ("-//W3C//DTD HTML 4.01 Frameset//EN").
    */
   public static final String HTML_4_01_FRAMESET_PUBID = "-//W3C//DTD HTML 4.01 Frameset//EN";

   /**
    * HTML 4.01 frameset system identifier ("http://www.w3.org/TR/html4/frameset.dtd").
    */
   public static final String HTML_4_01_FRAMESET_SYSID = "http://www.w3.org/TR/html4/frameset.dtd";

   /** HTML 4.01 strict public identifier ("-//W3C//DTD HTML 4.01//EN"). */
   public static final String HTML_4_01_STRICT_PUBID = "-//W3C//DTD HTML 4.01//EN";

   /**
    * HTML 4.01 strict system identifier ("http://www.w3.org/TR/html4/strict.dtd").
    */
   public static final String HTML_4_01_STRICT_SYSID = "http://www.w3.org/TR/html4/strict.dtd";

   /**
    * HTML 4.01 transitional public identifier ("-//W3C//DTD HTML 4.01 Transitional//EN").
    */
   public static final String HTML_4_01_TRANSITIONAL_PUBID = "-//W3C//DTD HTML 4.01 Transitional//EN";

   /**
    * HTML 4.01 transitional system identifier ("http://www.w3.org/TR/html4/loose.dtd").
    */
   public static final String HTML_4_01_TRANSITIONAL_SYSID = "http://www.w3.org/TR/html4/loose.dtd";

   /**
    * Ignore specified charset found in the &lt;meta equiv='Content-Type'
    * content='text/html;charset=&hellip;'&gt; tag or in the &lt;?xml &hellip;
    * encoding='&hellip;'&gt; processing instruction
    */
   public static final String IGNORE_SPECIFIED_CHARSET = "http://cyberneko.org/html/features/scanner/ignore-specified-charset";

   /** Insert document type declaration. */
   public static final String INSERT_DOCTYPE = "http://cyberneko.org/html/features/insert-doctype";

   /** Notify character entity references (e.g. &amp;#32;, &amp;#x20;, etc). */
   public static final String NOTIFY_CHAR_REFS = "http://apache.org/xml/features/scanner/notify-char-refs";

   /**
    * Notify handler of built-in entity references (e.g. &amp;nobr;, &amp;copy;, etc).
    * <p>
    * <strong>Note:</strong> This <em>includes</em> the five pre-defined XML general entities.
    */
   public static final String NOTIFY_HTML_BUILTIN_REFS = "http://cyberneko.org/html/features/scanner/notify-builtin-refs";

   /**
    * Notify handler of built-in entity references (e.g. &amp;amp;, &amp;lt;, etc).
    * <p>
    * <strong>Note:</strong> This only applies to the five pre-defined XML general entities.
    * Specifically, "amp", "lt", "gt", "quot", and "apos". This is done for compatibility with the
    * Xerces feature.
    * <p>
    * To be notified of the built-in entity references in HTML, set the
    * <code>http://cyberneko.org/html/features/scanner/notify-builtin-refs</code> feature to
    * <code>true</code>.
    */
   public static final String NOTIFY_XML_BUILTIN_REFS = "http://apache.org/xml/features/scanner/notify-builtin-refs";

   /** Override doctype declaration public and system identifiers. */
   public static final String OVERRIDE_DOCTYPE = "http://cyberneko.org/html/features/override-doctype";

   /** Parse &lt;noscript&gt;...&lt;/noscript&gt; content */
   public static final String PARSE_NOSCRIPT_CONTENT = "http://cyberneko.org/html/features/parse-noscript-content";

   /**
    * Strip XHTML CDATA delimiters ("&lt;![CDATA[" and "]]&gt;") from SCRIPT tag contents.
    */
   public static final String SCRIPT_STRIP_CDATA_DELIMS = "http://cyberneko.org/html/features/scanner/script/strip-cdata-delims";

   /**
    * Strip HTML comment delimiters ("&lt;!&minus;&minus;" and "&minus;&minus;&gt;") from SCRIPT tag
    * contents.
    */
   public static final String SCRIPT_STRIP_COMMENT_DELIMS = "http://cyberneko.org/html/features/scanner/script/strip-comment-delims";

   /**
    * Strip XHTML CDATA delimiters ("&lt;![CDATA[" and "]]&gt;") from STYLE tag contents.
    */
   public static final String STYLE_STRIP_CDATA_DELIMS = "http://cyberneko.org/html/features/scanner/style/strip-cdata-delims";

   // properties

   /**
    * Strip HTML comment delimiters ("&lt;!&minus;&minus;" and "&minus;&minus;&gt;") from STYLE tag
    * contents.
    */
   public static final String STYLE_STRIP_COMMENT_DELIMS = "http://cyberneko.org/html/features/scanner/style/strip-comment-delims";

   /** Include infoset augmentations. */
   protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";

   /** Set to true to debug callbacks. */
   protected static final boolean DEBUG_CALLBACKS = false;

   /** Default buffer size. */
   protected static final int DEFAULT_BUFFER_SIZE = 2048;

   /** Default encoding. */
   protected static final String DEFAULT_ENCODING = "http://cyberneko.org/html/properties/default-encoding";

   /** Doctype declaration public identifier. */
   protected static final String DOCTYPE_PUBID = "http://cyberneko.org/html/properties/doctype/pubid";

   /** Doctype declaration system identifier. */
   protected static final String DOCTYPE_SYSID = "http://cyberneko.org/html/properties/doctype/sysid";

   /** Error reporter. */
   protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";

   // states

   /** Modify HTML attribute names: { "upper", "lower", "default" }. */
   protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";

   /** Modify HTML element names: { "upper", "lower", "default" }. */
   protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";

   /** Lowercase HTML names. */
   protected static final short NAMES_LOWERCASE = 2;

   /** Don't modify HTML names. */
   protected static final short NAMES_NO_CHANGE = 0;

   // modify HTML names

   /** Uppercase HTML names. */
   protected static final short NAMES_UPPERCASE = 1;

   /** Normalize attribute values. */
   protected static final String NORMALIZE_ATTRIBUTES = "http://cyberneko.org/html/features/scanner/normalize-attrs";

   /** Report errors. */
   protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";

   // defaults

   /** State: content. */
   protected static final short STATE_CONTENT = 0;

   // debugging

   /** State: end document. */
   protected static final short STATE_END_DOCUMENT = 11;

   /** State: markup bracket. */
   protected static final short STATE_MARKUP_BRACKET = 1;

   /** State: start document. */
   protected static final short STATE_START_DOCUMENT = 10;

   /** Synthesized event info item. */
   protected static final HTMLEventInfo SYNTHESIZED_ITEM = new HTMLEventInfo.SynthesizedItem();

   /** Set to true to debug the buffer. */
   private static final boolean DEBUG_BUFFER = false;

   // static vars

   /** Set to true to debug character encoding handling. */
   private static final boolean DEBUG_CHARSET = false;

   /** Set to true to debug changes in the scanner. */
   private static final boolean DEBUG_SCANNER = false;
   /** Set to true to debug changes in the scanner state. */
   private static final boolean DEBUG_SCANNER_STATE = false;

   // features

   private final static BitSet ENTITY_CHARS = new BitSet();

   /** Recognized features. */
   private static final String[] RECOGNIZED_FEATURES = {
            AUGMENTATIONS, REPORT_ERRORS, NOTIFY_CHAR_REFS, NOTIFY_XML_BUILTIN_REFS,
            NOTIFY_HTML_BUILTIN_REFS, FIX_MSWINDOWS_REFS, SCRIPT_STRIP_CDATA_DELIMS,
            SCRIPT_STRIP_COMMENT_DELIMS, STYLE_STRIP_CDATA_DELIMS, STYLE_STRIP_COMMENT_DELIMS,
            IGNORE_SPECIFIED_CHARSET, CDATA_SECTIONS, OVERRIDE_DOCTYPE, INSERT_DOCTYPE,
            NORMALIZE_ATTRIBUTES, PARSE_NOSCRIPT_CONTENT, ALLOW_SELFCLOSING_IFRAME,};

   /** Recognized features defaults. */
   private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = {
            null, null, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE,
            Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE,
            Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.TRUE, Boolean.FALSE,};

   /** Recognized properties. */
   private static final String[] RECOGNIZED_PROPERTIES = {
            NAMES_ELEMS, NAMES_ATTRS, DEFAULT_ENCODING, ERROR_REPORTER, DOCTYPE_PUBID,
            DOCTYPE_SYSID,};

   /** Recognized properties defaults. */
   private static final Object[] RECOGNIZED_PROPERTIES_DEFAULTS = {
            null, null, "Windows-1252", null, HTML_4_01_TRANSITIONAL_PUBID,
            HTML_4_01_TRANSITIONAL_SYSID,};

   static {
      final String str = "-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz";
      for (int i = 0; i < str.length(); ++i) {
         char c = str.charAt(i);
         ENTITY_CHARS.set(c);
      }
   }

   //
   // Data
   //

   /**
    * Expands a system id and returns the system id as a URI, if it can be expanded. A return value
    * of null means that the identifier is already expanded. An exception thrown indicates a failure
    * to expand the id.
    *
    * @param systemId The systemId to be expanded.
    *
    * @return Returns the URI string representing the expanded system identifier. A null value
    *         indicates that the given system identifier is already expanded.
    *
    */
   public static String expandSystemId(String systemId, String baseSystemId) {

      // check for bad parameters id
      if (systemId == null || systemId.length() == 0) {
         return systemId;
      }
      // if id already expanded, return
      try {
         URI uri = new URI(systemId);
         if (uri != null) {
            return systemId;
         }
      } catch (URI.MalformedURIException e) {
         // continue on...
      }
      // normalize id
      String id = fixURI(systemId);

      // normalize base
      URI base = null;
      URI uri = null;
      try {
         if (baseSystemId == null || baseSystemId.length() == 0 || baseSystemId.equals(systemId)) {
            String dir;
            try {
               dir = fixURI(System.getProperty("user.dir"));
            } catch (SecurityException se) {
               dir = "";
            }
            if (!dir.endsWith("/")) {
               dir = dir + "/";
            }
            base = new URI("file", "", dir, null, null);
         } else {
            try {
               base = new URI(fixURI(baseSystemId));
            } catch (URI.MalformedURIException e) {
               String dir;
               try {
                  dir = fixURI(System.getProperty("user.dir"));
               } catch (SecurityException se) {
                  dir = "";
               }
               if (baseSystemId.indexOf(':') != -1) {
                  // for xml schemas we might have baseURI with
                  // a specified drive
                  base = new URI("file", "", fixURI(baseSystemId), null, null);
               } else {
                  if (!dir.endsWith("/")) {
                     dir = dir + "/";
                  }
                  dir = dir + fixURI(baseSystemId);
                  base = new URI("file", "", dir, null, null);
               }
            }
         }
         // expand id
         uri = new URI(base, id);
      } catch (URI.MalformedURIException e) {
         // let it go through
      }

      if (uri == null) {
         return systemId;
      }
      return uri.toString();

   } // expandSystemId(String,String):String

   /** Returns true if the name is a built-in XML general entity reference. */
   protected static boolean builtinXmlRef(String name) {
      return name.equals("amp") || name.equals("lt") || name.equals("gt") || name.equals("quot")
               || name.equals("apos");
   } // builtinXmlRef(String):boolean

   /**
    * Fixes a platform dependent filename to standard URI form.
    *
    * @param str The string to fix.
    *
    * @return Returns the fixed URI string.
    */
   protected static String fixURI(String str) {

      // handle platform dependent strings
      str = str.replace(java.io.File.separatorChar, '/');

      // Windows fix
      if (str.length() >= 2) {
         char ch1 = str.charAt(1);
         // change "C:blah" to "/C:blah"
         if (ch1 == ':') {
            char ch0 = Character.toUpperCase(str.charAt(0));
            if (ch0 >= 'A' && ch0 <= 'Z') {
               str = "/" + str;
            }
         }
         // change "//blah" to "file://blah"
         else if (ch1 == '/' && str.charAt(0) == '/') {
            str = "file:" + str;
         }
      }

      // done
      return str;

   } // fixURI(String):String

   /**
    * Converts HTML names string value to constant value.
    *
    * @see #NAMES_NO_CHANGE
    * @see #NAMES_LOWERCASE
    * @see #NAMES_UPPERCASE
    */
   protected static final short getNamesValue(String value) {
      if (value.equals("lower")) {
         return NAMES_LOWERCASE;
      }
      if (value.equals("upper")) {
         return NAMES_UPPERCASE;
      }
      return NAMES_NO_CHANGE;
   } // getNamesValue(String):short

   /** Returns the value of the specified attribute, ignoring case. */
   protected static String getValue(XMLAttributes attrs, String aname) {
      int length = attrs != null ? attrs.getLength() : 0;
      for (int i = 0; i < length; i++) {
         if (attrs.getQName(i).equalsIgnoreCase(aname)) {
            return attrs.getValue(i);
         }
      }
      return null;
   } // getValue(XMLAttributes,String):String

   /** Modifies the given name based on the specified mode. */
   protected static final String modifyName(String name, short mode) {
      switch (mode) {
         case NAMES_UPPERCASE:
            return name.toUpperCase();
         case NAMES_LOWERCASE:
            return name.toLowerCase();
      }
      return name;
   } // modifyName(String,short):String

   /**
    * Reduces the buffer to the content between start and end marker when only whitespaces are found
    * before the startMarker as well as after the end marker
    */
   static void reduceToContent(final XMLStringBuffer buffer, final String startMarker,
            final String endMarker) {
      int i = 0;
      int startContent = -1;
      final int l1 = startMarker.length();
      final int l2 = endMarker.length();
      while (i < buffer.length - l1 - l2) {
         final char c = buffer.ch[buffer.offset + i];
         if (Character.isWhitespace(c)) {
            ++i;
         } else if (c == startMarker.charAt(0)
                  && startMarker.equals(new String(buffer.ch, buffer.offset + i, l1))) {
            startContent = buffer.offset + i + l1;
            break;
         } else {
            return; // start marker not found
         }
      }
      if (startContent == -1) { // start marker not found
         return;
      }

      i = buffer.length - 1;
      while (i > startContent + l2) {
         final char c = buffer.ch[buffer.offset + i];
         if (Character.isWhitespace(c)) {
            --i;
         } else if (c == endMarker.charAt(l2 - 1)
                  && endMarker.equals(new String(buffer.ch, buffer.offset + i - l2 + 1, l2))) {

            buffer.length = buffer.offset + i - startContent - 2;
            buffer.offset = startContent;
            return;
         } else {
            return; // start marker not found
         }
      }
   }

   /** Allows self closing iframe tags. */
   protected boolean fAllowSelfclosingIframe;

   /** Augmentations. */
   protected boolean fAugmentations;

   /** Beginning character offset in the file. */
   protected int fBeginCharacterOffset;

   /** Beginning column number. */
   protected int fBeginColumnNumber;

   /** Beginning line number. */
   protected int fBeginLineNumber;

   // properties

   /** The playback byte stream. */
   protected PlaybackInputStream fByteStream;

   /** CDATA sections. */
   protected boolean fCDATASections;

   /** Content scanner. */
   protected Scanner fContentScanner = new ContentScanner();

   /** Current entity. */
   protected CurrentEntity fCurrentEntity;

   /** The current entity stack. */
   protected final Stack fCurrentEntityStack = new Stack();

   /** Default encoding. */
   protected String fDefaultIANAEncoding;

   // boundary locator information

   /** Doctype declaration public identifier. */
   protected String fDoctypePubid;

   /** Doctype declaration system identifier. */
   protected String fDoctypeSysid;

   /** The document handler. */
   protected XMLDocumentHandler fDocumentHandler;

   /** Element count. */
   protected int fElementCount;

   /** Element depth. */
   protected int fElementDepth;

   /** Ending character offset in the file. */
   protected int fEndCharacterOffset;

   // state

   /** Ending column number. */
   protected int fEndColumnNumber;

   /** Ending line number. */
   protected int fEndLineNumber;

   /** Error reporter. */
   protected HTMLErrorReporter fErrorReporter;

   /** Fix Microsoft Windows&reg; character entity references. */
   protected boolean fFixWindowsCharRefs;

   /** Auto-detected IANA encoding. */
   protected String fIANAEncoding;

   /** Ignore specified character set. */
   protected boolean fIgnoreSpecifiedCharset;

   /** Insert document type declaration. */
   protected boolean fInsertDoctype;

   /** True if the encoding matches "ISO-8859-*". */
   protected boolean fIso8859Encoding;

   /** Auto-detected Java encoding. */
   protected String fJavaEncoding;

   /** Modify HTML attribute names. */
   protected short fNamesAttrs;

   /** Modify HTML element names. */
   protected short fNamesElems;

   // scanners

   /** Normalize attribute values. */
   protected boolean fNormalizeAttributes;

   /** Notify character entity references. */
   protected boolean fNotifyCharRefs;

   // temp vars

   /** Notify HTML built-in general entity references. */
   protected boolean fNotifyHtmlBuiltinRefs;

   /** Notify XML built-in general entity references. */
   protected boolean fNotifyXmlBuiltinRefs;

   /** Override doctype declaration public and system identifiers. */
   protected boolean fOverrideDoctype;

   /** Parse noframes content. */
   protected boolean fParseNoFramesContent;

   /** Parse noscript content. */
   protected boolean fParseNoScriptContent;

   /** Report errors. */
   protected boolean fReportErrors;

   /** The current scanner. */
   protected Scanner fScanner;

   //
   // Public methods
   //

   /** The current scanner state. */
   protected short fScannerState;

   /** Strip CDATA delimiters from SCRIPT tags. */
   protected boolean fScriptStripCDATADelims;

   /** Strip comment delimiters from SCRIPT tags. */
   protected boolean fScriptStripCommentDelims;

   /**
    * Special scanner used for elements whose content needs to be scanned as plain text, ignoring
    * markup such as elements and entity references. For example: &lt;SCRIPT&gt; and
    * &lt;COMMENT&gt;.
    */
   protected SpecialScanner fSpecialScanner = new SpecialScanner();

   //
   // XMLLocator methods
   //

   /** String buffer. */
   protected final XMLStringBuffer fStringBuffer = new XMLStringBuffer(1024);

   /** Strip CDATA delimiters from STYLE tags. */
   protected boolean fStyleStripCDATADelims;

   /** Strip comment delimiters from STYLE tags. */
   protected boolean fStyleStripCommentDelims;

   /** Augmentations. */
   private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations();

   /** Location infoset item. */
   private final LocationItem fLocationItem = new LocationItem();

   /** Non-normalized attribute string buffer. */
   private final XMLStringBuffer fNonNormAttr = new XMLStringBuffer(128);

   /** Resource identifier. */
   private final XMLResourceIdentifierImpl fResourceId = new XMLResourceIdentifierImpl();

   /** Single boolean array. */
   private final boolean[] fSingleBoolean = {false};

   /** String buffer. */
   private final XMLStringBuffer fStringBuffer2 = new XMLStringBuffer(1024);

   //
   // HTMLComponent methods
   //

   /**
    * Cleans up used resources. For example, if scanning is terminated early, then this method
    * ensures all remaining open streams are closed.
    *
    * @param closeall Close all streams, including the original. This is used in cases when the
    *           application has opened the original document stream and should be responsible for
    *           closing it.
    */
   public void cleanup(boolean closeall) {
      int size = fCurrentEntityStack.size();
      if (size > 0) {
         // current entity is not the original, so close it
         if (fCurrentEntity != null) {
            fCurrentEntity.closeQuietly();
         }
         // close remaining streams
         for (int i = closeall ? 0 : 1; i < size; i++) {
            fCurrentEntity = (CurrentEntity) fCurrentEntityStack.pop();
            fCurrentEntity.closeQuietly();
         }
      } else if (closeall && fCurrentEntity != null) {
         fCurrentEntity.closeQuietly();
      }
   } // cleanup(boolean)

   /**
    * Immediately evaluates an input source and add the new content (e.g. the output written by an
    * embedded script).
    *
    * @param inputSource The new input source to start evaluating.
    * @see #pushInputSource(XMLInputSource)
    */
   public void evaluateInputSource(XMLInputSource inputSource) {
      final Scanner previousScanner = fScanner;
      final short previousScannerState = fScannerState;
      final CurrentEntity previousEntity = fCurrentEntity;
      final Reader reader = getReader(inputSource);

      String encoding = inputSource.getEncoding();
      String publicId = inputSource.getPublicId();
      String baseSystemId = inputSource.getBaseSystemId();
      String literalSystemId = inputSource.getSystemId();
      String expandedSystemId = expandSystemId(literalSystemId, baseSystemId);
      fCurrentEntity = new CurrentEntity(reader, encoding, publicId, baseSystemId, literalSystemId,
               expandedSystemId);
      setScanner(fContentScanner);
      setScannerState(STATE_CONTENT);
      try {
         do {
            fScanner.scan(false);
         } while (fScannerState != STATE_END_DOCUMENT);
      } catch (final IOException e) {
         // ignore
      }
      setScanner(previousScanner);
      setScannerState(previousScannerState);
      fCurrentEntity = previousEntity;
   } // evaluateInputSource(XMLInputSource)

   //
   // XMLComponent methods
   //

   /** Returns the base system identifier. */
   public String getBaseSystemId() {
      return fCurrentEntity != null ? fCurrentEntity.baseSystemId : null;
   } // getBaseSystemId():String

   /** Returns the character offset. */
   public int getCharacterOffset() {
      return fCurrentEntity != null ? fCurrentEntity.getCharacterOffset() : -1;
   } // getCharacterOffset():int

   /** Returns the current column number. */
   public int getColumnNumber() {
      return fCurrentEntity != null ? fCurrentEntity.getColumnNumber() : -1;
   } // getColumnNumber():int

   /** Returns the document handler. */
   public XMLDocumentHandler getDocumentHandler() {
      return fDocumentHandler;
   } // getDocumentHandler():XMLDocumentHandler

   /** Returns the encoding. */
   public String getEncoding() {
      return fCurrentEntity != null ? fCurrentEntity.encoding : null;
   } // getEncoding():String

   //
   // XMLDocumentScanner methods
   //

   /** Returns the expanded system identifier. */
   public String getExpandedSystemId() {
      return fCurrentEntity != null ? fCurrentEntity.expandedSystemId : null;
   } // getExpandedSystemId():String

   /** Returns the default state for a feature. */
   public Boolean getFeatureDefault(String featureId) {
      int length = RECOGNIZED_FEATURES != null ? RECOGNIZED_FEATURES.length : 0;
      for (int i = 0; i < length; i++) {
         if (RECOGNIZED_FEATURES[i].equals(featureId)) {
            return RECOGNIZED_FEATURES_DEFAULTS[i];
         }
      }
      return null;
   } // getFeatureDefault(String):Boolean

   /** Returns the current line number. */
   public int getLineNumber() {
      return fCurrentEntity != null ? fCurrentEntity.getLineNumber() : -1;
   } // getLineNumber():int

   // @since Xerces 2.1.0

   /** Returns the literal system identifier. */
   public String getLiteralSystemId() {
      return fCurrentEntity != null ? fCurrentEntity.literalSystemId : null;
   } // getLiteralSystemId():String

   //
   // Protected static methods
   //

   /** Returns the default state for a property. */
   public Object getPropertyDefault(String propertyId) {
      int length = RECOGNIZED_PROPERTIES != null ? RECOGNIZED_PROPERTIES.length : 0;
      for (int i = 0; i < length; i++) {
         if (RECOGNIZED_PROPERTIES[i].equals(propertyId)) {
            return RECOGNIZED_PROPERTIES_DEFAULTS[i];
         }
      }
      return null;
   } // getPropertyDefault(String):Object

   /** Returns the public identifier. */
   public String getPublicId() {
      return fCurrentEntity != null ? fCurrentEntity.publicId : null;
   } // getPublicId():String

   /** Returns recognized features. */
   public String[] getRecognizedFeatures() {
      return RECOGNIZED_FEATURES;
   } // getRecognizedFeatures():String[]

   /** Returns recognized properties. */
   public String[] getRecognizedProperties() {
      return RECOGNIZED_PROPERTIES;
   } // getRecognizedProperties():String[]

   /** Returns the XML version. */
   public String getXMLVersion() {
      return fCurrentEntity != null ? fCurrentEntity.version : null;
   } // getXMLVersion():String

   /**
    * Pushes an input source onto the current entity stack. This enables the scanner to
    * transparently scan new content (e.g. the output written by an embedded script). At the end of
    * the current entity, the scanner returns where it left off at the time this entity source was
    * pushed.
    * <p>
    * <strong>Note:</strong> This functionality is experimental at this time and is subject to
    * change in future releases of NekoHTML.
    *
    * @param inputSource The new input source to start scanning.
    * @see #evaluateInputSource(XMLInputSource)
    */
   public void pushInputSource(XMLInputSource inputSource) {
      final Reader reader = getReader(inputSource);

      fCurrentEntityStack.push(fCurrentEntity);
      String encoding = inputSource.getEncoding();
      String publicId = inputSource.getPublicId();
      String baseSystemId = inputSource.getBaseSystemId();
      String literalSystemId = inputSource.getSystemId();
      String expandedSystemId = expandSystemId(literalSystemId, baseSystemId);
      fCurrentEntity = new CurrentEntity(reader, encoding, publicId, baseSystemId, literalSystemId,
               expandedSystemId);
   } // pushInputSource(XMLInputSource)

   //
   // Protected methods
   //

   /** Resets the component. */
   public void reset(XMLComponentManager manager) throws XMLConfigurationException {

      // get features
      fAugmentations = manager.getFeature(AUGMENTATIONS);
      fReportErrors = manager.getFeature(REPORT_ERRORS);
      fNotifyCharRefs = manager.getFeature(NOTIFY_CHAR_REFS);
      fNotifyXmlBuiltinRefs = manager.getFeature(NOTIFY_XML_BUILTIN_REFS);
      fNotifyHtmlBuiltinRefs = manager.getFeature(NOTIFY_HTML_BUILTIN_REFS);
      fFixWindowsCharRefs = manager.getFeature(FIX_MSWINDOWS_REFS);
      fScriptStripCDATADelims = manager.getFeature(SCRIPT_STRIP_CDATA_DELIMS);
      fScriptStripCommentDelims = manager.getFeature(SCRIPT_STRIP_COMMENT_DELIMS);
      fStyleStripCDATADelims = manager.getFeature(STYLE_STRIP_CDATA_DELIMS);
      fStyleStripCommentDelims = manager.getFeature(STYLE_STRIP_COMMENT_DELIMS);
      fIgnoreSpecifiedCharset = manager.getFeature(IGNORE_SPECIFIED_CHARSET);
      fCDATASections = manager.getFeature(CDATA_SECTIONS);
      fOverrideDoctype = manager.getFeature(OVERRIDE_DOCTYPE);
      fInsertDoctype = manager.getFeature(INSERT_DOCTYPE);
      fNormalizeAttributes = manager.getFeature(NORMALIZE_ATTRIBUTES);
      fParseNoScriptContent = manager.getFeature(PARSE_NOSCRIPT_CONTENT);
      fAllowSelfclosingIframe = manager.getFeature(ALLOW_SELFCLOSING_IFRAME);

      // get properties
      fNamesElems = getNamesValue(String.valueOf(manager.getProperty(NAMES_ELEMS)));
      fNamesAttrs = getNamesValue(String.valueOf(manager.getProperty(NAMES_ATTRS)));
      fDefaultIANAEncoding = String.valueOf(manager.getProperty(DEFAULT_ENCODING));
      fErrorReporter = (HTMLErrorReporter) manager.getProperty(ERROR_REPORTER);
      fDoctypePubid = String.valueOf(manager.getProperty(DOCTYPE_PUBID));
      fDoctypeSysid = String.valueOf(manager.getProperty(DOCTYPE_SYSID));

   } // reset(XMLComponentManager)

   // debugging

   /** Scans the document. */
   public boolean scanDocument(boolean complete) throws XNIException, IOException {
      do {
         if (!fScanner.scan(complete)) {
            return false;
         }
      } while (complete);
      return true;
   } // scanDocument(boolean):boolean

   /** Sets the document handler. */
   public void setDocumentHandler(XMLDocumentHandler handler) {
      fDocumentHandler = handler;
   } // setDocumentHandler(XMLDocumentHandler)

   // scanning

   /** Sets a feature. */
   public void setFeature(String featureId, boolean state) throws XMLConfigurationException {

      if (featureId.equals(AUGMENTATIONS)) {
         fAugmentations = state;
      } else if (featureId.equals(IGNORE_SPECIFIED_CHARSET)) {
         fIgnoreSpecifiedCharset = state;
      } else if (featureId.equals(NOTIFY_CHAR_REFS)) {
         fNotifyCharRefs = state;
      } else if (featureId.equals(NOTIFY_XML_BUILTIN_REFS)) {
         fNotifyXmlBuiltinRefs = state;
      } else if (featureId.equals(NOTIFY_HTML_BUILTIN_REFS)) {
         fNotifyHtmlBuiltinRefs = state;
      } else if (featureId.equals(FIX_MSWINDOWS_REFS)) {
         fFixWindowsCharRefs = state;
      } else if (featureId.equals(SCRIPT_STRIP_CDATA_DELIMS)) {
         fScriptStripCDATADelims = state;
      } else if (featureId.equals(SCRIPT_STRIP_COMMENT_DELIMS)) {
         fScriptStripCommentDelims = state;
      } else if (featureId.equals(STYLE_STRIP_CDATA_DELIMS)) {
         fStyleStripCDATADelims = state;
      } else if (featureId.equals(STYLE_STRIP_COMMENT_DELIMS)) {
         fStyleStripCommentDelims = state;
      } else if (featureId.equals(IGNORE_SPECIFIED_CHARSET)) {
         fIgnoreSpecifiedCharset = state;
      } else if (featureId.equals(PARSE_NOSCRIPT_CONTENT)) {
         fParseNoScriptContent = state;
      } else if (featureId.equals(ALLOW_SELFCLOSING_IFRAME)) {
         fAllowSelfclosingIframe = state;
      }

   } // setFeature(String,boolean)

   /** Sets the input source. */
   public void setInputSource(XMLInputSource source) throws IOException {

      // reset state
      fElementCount = 0;
      fElementDepth = -1;
      fByteStream = null;
      fCurrentEntityStack.removeAllElements();

      fBeginLineNumber = 1;
      fBeginColumnNumber = 1;
      fBeginCharacterOffset = 0;
      fEndLineNumber = fBeginLineNumber;
      fEndColumnNumber = fBeginColumnNumber;
      fEndCharacterOffset = fBeginCharacterOffset;

      // reset encoding information
      fIANAEncoding = fDefaultIANAEncoding;
      fJavaEncoding = fIANAEncoding;

      // get location information
      String encoding = source.getEncoding();
      String publicId = source.getPublicId();
      String baseSystemId = source.getBaseSystemId();
      String literalSystemId = source.getSystemId();
      String expandedSystemId = expandSystemId(literalSystemId, baseSystemId);

      // open stream
      Reader reader = source.getCharacterStream();
      if (reader == null) {
         InputStream inputStream = source.getByteStream();
         if (inputStream == null) {
            URL url = new URL(expandedSystemId);
            inputStream = url.openStream();
         }
         fByteStream = new PlaybackInputStream(inputStream);
         String[] encodings = new String[2];
         if (encoding == null) {
            fByteStream.detectEncoding(encodings);
         } else {
            encodings[0] = encoding;
         }
         if (encodings[0] == null) {
            encodings[0] = fDefaultIANAEncoding;
            if (fReportErrors) {
               fErrorReporter.reportWarning("HTML1000", null);
            }
         }
         if (encodings[1] == null) {
            encodings[1] = EncodingMap.getIANA2JavaMapping(encodings[0].toUpperCase());
            if (encodings[1] == null) {
               encodings[1] = encodings[0];
               if (fReportErrors) {
                  fErrorReporter.reportWarning("HTML1001", new Object[]{encodings[0]});
               }
            }
         }
         fIANAEncoding = encodings[0];
         fJavaEncoding = encodings[1];
         /* PATCH: Asgeir Asgeirsson */
         fIso8859Encoding = fIANAEncoding == null
                  || fIANAEncoding.toUpperCase().startsWith("ISO-8859")
                  || fIANAEncoding.equalsIgnoreCase(fDefaultIANAEncoding);
         encoding = fIANAEncoding;
         reader = new InputStreamReader(fByteStream, fJavaEncoding);
      }
      fCurrentEntity = new CurrentEntity(reader, encoding, publicId, baseSystemId, literalSystemId,
               expandedSystemId);

      // set scanner and state
      setScanner(fContentScanner);
      setScannerState(STATE_START_DOCUMENT);

   } // setInputSource(XMLInputSource)

   /** Sets a property. */
   public void setProperty(String propertyId, Object value) throws XMLConfigurationException {

      if (propertyId.equals(NAMES_ELEMS)) {
         fNamesElems = getNamesValue(String.valueOf(value));
         return;
      }

      if (propertyId.equals(NAMES_ATTRS)) {
         fNamesAttrs = getNamesValue(String.valueOf(value));
         return;
      }

      if (propertyId.equals(DEFAULT_ENCODING)) {
         fDefaultIANAEncoding = String.valueOf(value);
         return;
      }

   } // setProperty(String,Object)

   /**
    * Fixes Microsoft Windows&reg; specific characters.
    * <p>
    * Details about this common problem can be found at <a href=
    * 'http://www.cs.tut.fi/~jkorpela/www/windows-chars.html'>http://www.cs.tut.fi/~jkorpela/www/windows-chars.h
    * t m l < / a >
    */
   protected int fixWindowsCharacter(int origChar) {
      /* PATCH: Asgeir Asgeirsson */
      switch (origChar) {
         case 130:
            return 8218;
         case 131:
            return 402;
         case 132:
            return 8222;
         case 133:
            return 8230;
         case 134:
            return 8224;
         case 135:
            return 8225;
         case 136:
            return 710;
         case 137:
            return 8240;
         case 138:
            return 352;
         case 139:
            return 8249;
         case 140:
            return 338;
         case 145:
            return 8216;
         case 146:
            return 8217;
         case 147:
            return 8220;
         case 148:
            return 8221;
         case 149:
            return 8226;
         case 150:
            return 8211;
         case 151:
            return 8212;
         case 152:
            return 732;
         case 153:
            return 8482;
         case 154:
            return 353;
         case 155:
            return 8250;
         case 156:
            return 339;
         case 159:
            return 376;
      }
      return origChar;
   } // fixWindowsCharacter(int):int

   /** Returns an augmentations object with a location item added. */
   protected final Augmentations locationAugs() {
      HTMLAugmentations augs = null;
      if (fAugmentations) {
         fLocationItem.setValues(fBeginLineNumber, fBeginColumnNumber, fBeginCharacterOffset,
                  fEndLineNumber, fEndColumnNumber, fEndCharacterOffset);
         augs = fInfosetAugs;
         augs.removeAllItems();
         augs.putItem(AUGMENTATIONS, fLocationItem);
      }
      return augs;
   } // locationAugs():Augmentations

   // i/o
   /** Reads a single character. */
   protected int read() throws IOException {
      return fCurrentEntity.read();
   }

   /** Reads a single character, preserving the old buffer content */
   protected int readPreservingBufferContent() throws IOException {
      fCurrentEntity.debugBufferIfNeeded("(read: ");
      if (fCurrentEntity.offset == fCurrentEntity.length) {
         if (fCurrentEntity.load(fCurrentEntity.length) < 1) {
            if (DEBUG_BUFFER) {
               System.out.println(")read: -> -1");
            }
            return -1;
         }
      }
      final char c = fCurrentEntity.getNextChar();
      fCurrentEntity.debugBufferIfNeeded(")read: ", " -> " + c);
      return c;
   } // readPreservingBufferContent():int

   /** Returns an empty resource identifier. */
   protected final XMLResourceIdentifier resourceId() {
      /***/
      fResourceId.clear();
      return fResourceId;
      /***
       * // NOTE: Unfortunately, the Xerces DOM parser classes expect a // non-null resource
       * identifier object to be passed to // startGeneralEntity. -Ac return null; /
       ***/
   } // resourceId():XMLResourceIdentifier

   // infoset utility methods

   /** Scans a DOCTYPE line. */
   protected void scanDoctype() throws IOException {
      String root = null;
      String pubid = null;
      String sysid = null;

      if (skipSpaces()) {
         root = scanName();
         if (root == null) {
            if (fReportErrors) {
               fErrorReporter.reportError("HTML1014", null);
            }
         } else {
            root = modifyName(root, fNamesElems);
         }
         if (skipSpaces()) {
            if (skip("PUBLIC", false)) {
               skipSpaces();
               pubid = scanLiteral();
               if (skipSpaces()) {
                  sysid = scanLiteral();
               }
            } else if (skip("SYSTEM", false)) {
               skipSpaces();
               sysid = scanLiteral();
            }
         }
      }
      int c;
      while ((c = fCurrentEntity.read()) != -1) {
         if (c == '<') {
            fCurrentEntity.rewind();
            break;
         }
         if (c == '>') {
            break;
         }
         if (c == '[') {
            skipMarkup(true);
            break;
         }
      }

      if (fDocumentHandler != null) {
         if (fOverrideDoctype) {
            pubid = fDoctypePubid;
            sysid = fDoctypeSysid;
         }
         fEndLineNumber = fCurrentEntity.getLineNumber();
         fEndColumnNumber = fCurrentEntity.getColumnNumber();
         fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
         fDocumentHandler.doctypeDecl(root, pubid, sysid, locationAugs());
      }

   } // scanDoctype()

   /** Scans an entity reference. */
   protected int scanEntityRef(final XMLStringBuffer str, final boolean content) throws IOException {
      str.clear();
      str.append('&');
      boolean endsWithSemicolon = false;
      while (true) {
         int c = fCurrentEntity.read();
         if (c == ';') {
            str.append(';');
            endsWithSemicolon = true;
            break;
         } else if (c == -1) {
            break;
         } else if (!ENTITY_CHARS.get(c) && c != '#') {
            fCurrentEntity.rewind();
            break;
         }
         str.append((char) c);
      }

      if (!endsWithSemicolon) {
         if (fReportErrors) {
            fErrorReporter.reportWarning("HTML1004", null);
         }
      }
      if (str.length == 1) {
         if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
            fEndLineNumber = fCurrentEntity.getLineNumber();
            fEndColumnNumber = fCurrentEntity.getColumnNumber();
            fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
            fDocumentHandler.characters(str, locationAugs());
         }
         return -1;
      }

      final String name;
      if (endsWithSemicolon)
         name = str.toString().substring(1, str.length - 1);
      else
         name = str.toString().substring(1);

      if (name.startsWith("#")) {
         int value = -1;
         try {
            if (name.startsWith("#x") || name.startsWith("#X")) {
               value = Integer.parseInt(name.substring(2), 16);
            } else {
               value = Integer.parseInt(name.substring(1));
            }
            /* PATCH: Asgeir Asgeirsson */
            if (fFixWindowsCharRefs && fIso8859Encoding) {
               value = fixWindowsCharacter(value);
            }
            if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
               fEndLineNumber = fCurrentEntity.getLineNumber();
               fEndColumnNumber = fCurrentEntity.getColumnNumber();
               fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
               if (fNotifyCharRefs) {
                  XMLResourceIdentifier id = resourceId();
                  String encoding = null;
                  fDocumentHandler.startGeneralEntity(name, id, encoding, locationAugs());
               }
               str.clear();
               str.append((char) value);
               fDocumentHandler.characters(str, locationAugs());
               if (fNotifyCharRefs) {
                  fDocumentHandler.endGeneralEntity(name, locationAugs());
               }
            }
         } catch (NumberFormatException e) {
            if (fReportErrors) {
               fErrorReporter.reportError("HTML1005", new Object[]{name});
            }
            if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
               fEndLineNumber = fCurrentEntity.getLineNumber();
               fEndColumnNumber = fCurrentEntity.getColumnNumber();
               fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
               fDocumentHandler.characters(str, locationAugs());
            }
         }
         return value;
      }

      int c = HTMLEntities.get(name);
      // in attributes, some incomplete entities should be recognized, not all
      // TODO: investigate to find which ones (there are differences between
      // browsers)
      // in a first time, consider only those that behave the same in FF and IE
      final boolean invalidEntityInAttribute = !content && !endsWithSemicolon && c > 256;
      if (c == -1 || invalidEntityInAttribute) {
         if (fReportErrors) {
            fErrorReporter.reportWarning("HTML1006", new Object[]{name});
         }
         if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
            fEndLineNumber = fCurrentEntity.getLineNumber();
            fEndColumnNumber = fCurrentEntity.getColumnNumber();
            fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
            fDocumentHandler.characters(str, locationAugs());
         }
         return -1;
      }
      if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
         fEndLineNumber = fCurrentEntity.getLineNumber();
         fEndColumnNumber = fCurrentEntity.getColumnNumber();
         fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
         boolean notify = fNotifyHtmlBuiltinRefs || (fNotifyXmlBuiltinRefs && builtinXmlRef(name));
         if (notify) {
            XMLResourceIdentifier id = resourceId();
            String encoding = null;
            fDocumentHandler.startGeneralEntity(name, id, encoding, locationAugs());
         }
         str.clear();
         str.append((char) c);
         fDocumentHandler.characters(str, locationAugs());
         if (notify) {
            fDocumentHandler.endGeneralEntity(name, locationAugs());
         }
      }
      return c;

   } // scanEntityRef(XMLStringBuffer,boolean):int

   /** Scans a quoted literal. */
   protected String scanLiteral() throws IOException {
      int quote = fCurrentEntity.read();
      if (quote == '\'' || quote == '"') {
         StringBuffer str = new StringBuffer();
         int c;
         while ((c = fCurrentEntity.read()) != -1) {
            if (c == quote) {
               break;
            }
            if (c == '\r' || c == '\n') {
               fCurrentEntity.rewind();
               // NOTE: This collapses newlines to a single space.
               // [Q] Is this the right thing to do here? -Ac
               skipNewlines();
               str.append(' ');
            } else if (c == '<') {
               fCurrentEntity.rewind();
               break;
            } else {
               str.append((char) c);
            }
         }
         if (c == -1) {
            if (fReportErrors) {
               fErrorReporter.reportError("HTML1007", null);
            }
            throw new EOFException();
         }
         return str.toString();
      } else {
         fCurrentEntity.rewind();
      }
      return null;
   } // scanLiteral():String

   //
   // Protected static methods
   //

   /** Scans a name. */
   protected String scanName() throws IOException {
      fCurrentEntity.debugBufferIfNeeded("(scanName: ");
      if (fCurrentEntity.offset == fCurrentEntity.length) {
         if (fCurrentEntity.load(0) == -1) {
            fCurrentEntity.debugBufferIfNeeded(")scanName: ");
            return null;
         }
      }
      int offset = fCurrentEntity.offset;
      while (true) {
         while (fCurrentEntity.hasNext()) {
            char c = fCurrentEntity.getNextChar();
            if (!Character.isLetterOrDigit(c) && !(c == '-' || c == '.' || c == ':' || c == '_')) {
               fCurrentEntity.rewind();
               break;
            }
         }
         if (fCurrentEntity.offset == fCurrentEntity.length) {
            int length = fCurrentEntity.length - offset;
            System.arraycopy(fCurrentEntity.buffer, offset, fCurrentEntity.buffer, 0, length);
            int count = fCurrentEntity.load(length);
            offset = 0;
            if (count == -1) {
               break;
            }
         } else {
            break;
         }
      }
      int length = fCurrentEntity.offset - offset;
      String name = length > 0 ? new String(fCurrentEntity.buffer, offset, length) : null;
      fCurrentEntity.debugBufferIfNeeded(")scanName: ", " -> \"" + name + '"');
      return name;
   } // scanName():String

   //
   // Private methods
   //

   //
   // Interfaces
   //

   /** Sets the scanner. */
   protected void setScanner(Scanner scanner) {
      fScanner = scanner;
      if (DEBUG_SCANNER) {
         System.out.print("$$$ setScanner(");
         System.out.print(scanner != null ? scanner.getClass().getName() : "null");
         System.out.println(");");
      }
   } // setScanner(Scanner)

   //
   // Classes
   //

   /** Sets the scanner state. */
   protected void setScannerState(short state) {
      fScannerState = state;
      if (DEBUG_SCANNER_STATE) {
         System.out.print("$$$ setScannerState(");
         switch (fScannerState) {
            case STATE_CONTENT: {
               System.out.print("STATE_CONTENT");
               break;
            }
            case STATE_MARKUP_BRACKET: {
               System.out.print("STATE_MARKUP_BRACKET");
               break;
            }
            case STATE_START_DOCUMENT: {
               System.out.print("STATE_START_DOCUMENT");
               break;
            }
            case STATE_END_DOCUMENT: {
               System.out.print("STATE_END_DOCUMENT");
               break;
            }
         }
         System.out.println(");");
      }
   } // setScannerState(short)

   /** Returns true if the specified text is present and is skipped. */
   protected boolean skip(String s, boolean caseSensitive) throws IOException {
      int length = s != null ? s.length() : 0;
      for (int i = 0; i < length; i++) {
         if (fCurrentEntity.offset == fCurrentEntity.length) {
            System.arraycopy(fCurrentEntity.buffer, fCurrentEntity.offset - i,
                     fCurrentEntity.buffer, 0, i);
            if (fCurrentEntity.load(i) == -1) {
               fCurrentEntity.offset = 0;
               return false;
            }
         }
         char c0 = s.charAt(i);
         char c1 = fCurrentEntity.getNextChar();
         if (!caseSensitive) {
            c0 = Character.toUpperCase(c0);
            c1 = Character.toUpperCase(c1);
         }
         if (c0 != c1) {
            fCurrentEntity.rewind(i + 1);
            return false;
         }
      }
      return true;
   } // skip(String):boolean

   /** Skips markup. */
   protected boolean skipMarkup(boolean balance) throws IOException {
      fCurrentEntity.debugBufferIfNeeded("(skipMarkup: ");
      int depth = 1;
      boolean slashgt = false;
      OUTER : while (true) {
         if (fCurrentEntity.offset == fCurrentEntity.length) {
            if (fCurrentEntity.load(0) == -1) {
               break OUTER;
            }
         }
         while (fCurrentEntity.hasNext()) {
            char c = fCurrentEntity.getNextChar();
            if (balance && c == '<') {
               depth++;
            } else if (c == '>') {
               depth--;
               if (depth == 0) {
                  break OUTER;
               }
            } else if (c == '/') {
               if (fCurrentEntity.offset == fCurrentEntity.length) {
                  if (fCurrentEntity.load(0) == -1) {
                     break OUTER;
                  }
               }
               c = fCurrentEntity.getNextChar();
               if (c == '>') {
                  slashgt = true;
                  depth--;
                  if (depth == 0) {
                     break OUTER;
                  }
               } else {
                  fCurrentEntity.rewind();
               }
            } else if (c == '\r' || c == '\n') {
               fCurrentEntity.rewind();
               skipNewlines();
            }
         }
      }
      fCurrentEntity.debugBufferIfNeeded(")skipMarkup: ", " -> " + slashgt);
      return slashgt;
   } // skipMarkup():boolean

   /** Skips newlines and returns the number of newlines skipped. */
   protected int skipNewlines() throws IOException {
      fCurrentEntity.debugBufferIfNeeded("(skipNewlines: ");

      if (!fCurrentEntity.hasNext()) {
         if (fCurrentEntity.load(0) == -1) {
            fCurrentEntity.debugBufferIfNeeded(")skipNewlines: ");
            return 0;
         }
      }
      char c = fCurrentEntity.getCurrentChar();
      int newlines = 0;
      int offset = fCurrentEntity.offset;
      if (c == '\n' || c == '\r') {
         do {
            c = fCurrentEntity.getNextChar();
            if (c == '\r') {
               newlines++;
               if (fCurrentEntity.offset == fCurrentEntity.length) {
                  offset = 0;
                  fCurrentEntity.offset = newlines;
                  if (fCurrentEntity.load(newlines) == -1) {
                     break;
                  }
               }
               if (fCurrentEntity.getCurrentChar() == '\n') {
                  fCurrentEntity.offset++;
                  fCurrentEntity.characterOffset_++;
                  offset++;
               }
            } else if (c == '\n') {
               newlines++;
               if (fCurrentEntity.offset == fCurrentEntity.length) {
                  offset = 0;
                  fCurrentEntity.offset = newlines;
                  if (fCurrentEntity.load(newlines) == -1) {
                     break;
                  }
               }
            } else {
               fCurrentEntity.rewind();
               break;
            }
         } while (fCurrentEntity.offset < fCurrentEntity.length - 1);
         fCurrentEntity.incLine(newlines);
      }
      fCurrentEntity.debugBufferIfNeeded(")skipNewlines: ", " -> " + newlines);
      return newlines;
   } // skipNewlines(int):int

   /** Skips whitespace. */
   protected boolean skipSpaces() throws IOException {
      fCurrentEntity.debugBufferIfNeeded("(skipSpaces: ");
      boolean spaces = false;
      while (true) {
         if (fCurrentEntity.offset == fCurrentEntity.length) {
            if (fCurrentEntity.load(0) == -1) {
               break;
            }
         }
         char c = fCurrentEntity.getNextChar();
         if (!Character.isWhitespace(c)) {
            fCurrentEntity.rewind();
            break;
         }
         spaces = true;
         if (c == '\r' || c == '\n') {
            fCurrentEntity.rewind();
            skipNewlines();
            continue;
         }
      }
      fCurrentEntity.debugBufferIfNeeded(")skipSpaces: ", " -> " + spaces);
      return spaces;
   } // skipSpaces()

   /** Returns an augmentations object with a synthesized item added. */
   protected final Augmentations synthesizedAugs() {
      HTMLAugmentations augs = null;
      if (fAugmentations) {
         augs = fInfosetAugs;
         augs.removeAllItems();
         augs.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM);
      }
      return augs;
   } // synthesizedAugs():Augmentations

   /**
    * To detect if 2 encoding are compatible, both must be able to read the meta tag specifying the
    * new encoding. This means that the byte representation of some minimal html markup must be the
    * same in both encodings
    */
   boolean isEncodingCompatible(final String encoding1, final String encoding2) {
      final String reference = "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=";
      try {
         final byte[] bytesEncoding1 = reference.getBytes(encoding1);
         final String referenceWithEncoding2 = new String(bytesEncoding1, encoding2);
         return reference.equals(referenceWithEncoding2);
      } catch (final UnsupportedEncodingException e) {
         return false;
      }
   }

   /**
    * Indicates if the end comment --> is available, loading further data if needed, without to
    * reset the buffer
    */
   private boolean endCommentAvailable() throws IOException {
      int nbCaret = 0;
      final int originalOffset = fCurrentEntity.offset;
      final int originalColumnNumber = fCurrentEntity.getColumnNumber();
      final int originalCharacterOffset = fCurrentEntity.getCharacterOffset();

      while (true) {
         int c = readPreservingBufferContent();
         if (c == -1) {
            fCurrentEntity.restorePosition(originalOffset, originalColumnNumber,
                     originalCharacterOffset);
            return false;
         } else if (c == '>' && nbCaret >= 2) {
            fCurrentEntity.restorePosition(originalOffset, originalColumnNumber,
                     originalCharacterOffset);
            return true;
         } else if (c == '-') {
            nbCaret++;
         } else {
            nbCaret = 0;
         }
      }
   }

   private boolean endsWith(final XMLStringBuffer buffer, final String string) {
      final int l = string.length();
      if (buffer.length < l) {
         return false;
      } else {
         final String s = new String(buffer.ch, buffer.length - l, l);
         return string.equals(s);
      }
   }

   private Reader getReader(final XMLInputSource inputSource) {
      Reader reader = inputSource.getCharacterStream();
      if (reader == null) {
         try {
            return new InputStreamReader(inputSource.getByteStream(), fJavaEncoding);
         } catch (final UnsupportedEncodingException e) {
            // should not happen as this encoding is already used to parse the
            // "main" source
         }
      }
      return reader;
   }
} // class HTMLScanner
TOP

Related Classes of com.googlecode.html.HTMLScanner$ContentScanner

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.