Package org.cyberneko.html

Source Code of org.cyberneko.html.HTMLTagBalancer$ElementEntry

/*
* Copyright 2002-2009 Andy Clark, Marc Guillemot
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.cyberneko.html;

import java.util.ArrayList;
import java.util.List;

import org.apache.xerces.util.XMLAttributesImpl;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.NamespaceContext;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLDocumentHandler;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLResourceIdentifier;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLComponentManager;
import org.apache.xerces.xni.parser.XMLConfigurationException;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.apache.xerces.xni.parser.XMLDocumentSource;
import org.cyberneko.html.HTMLElements.Element;
import org.cyberneko.html.filters.NamespaceBinder;
import org.cyberneko.html.xercesbridge.XercesBridge;
                     
/**
* Balances tags in an HTML document. This component receives document events
* and tries to correct many common mistakes that human (and computer) HTML
* document authors make. This tag balancer can:
* <ul>
* <li>add missing parent elements;
* <li>automatically close elements with optional end tags; and
* <li>handle mis-matched inline element tags.
* </ul>
* <p>
* This component recognizes the following features:
* <ul>
* <li>http://cyberneko.org/html/features/augmentations
* <li>http://cyberneko.org/html/features/report-errors
* <li>http://cyberneko.org/html/features/balance-tags/document-fragment
* <li>http://cyberneko.org/html/features/balance-tags/ignore-outside-content
* </ul>
* <p>
* This component recognizes the following properties:
* <ul>
* <li>http://cyberneko.org/html/properties/names/elems
* <li>http://cyberneko.org/html/properties/names/attrs
* <li>http://cyberneko.org/html/properties/error-reporter
* <li>http://cyberneko.org/html/properties/balance-tags/fragment-context-stack
* </ul>
*
* @see HTMLElements
*
* @author Andy Clark
* @author Marc Guillemot
*
* @version $Id: HTMLTagBalancer.java,v 1.20 2005/02/14 04:06:22 andyc Exp $
*/
public class HTMLTagBalancer
    implements XMLDocumentFilter, HTMLComponent {

    //
    // Constants
    //

    // features

    /** Namespaces. */
    protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";

    /** Include infoset augmentations. */
    protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";

    /** Report errors. */
    protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";

    /** Document fragment balancing only (deprecated). */
    protected static final String DOCUMENT_FRAGMENT_DEPRECATED = "http://cyberneko.org/html/features/document-fragment";

    /** Document fragment balancing only. */
    protected static final String DOCUMENT_FRAGMENT = "http://cyberneko.org/html/features/balance-tags/document-fragment";

    /** Ignore outside content. */
    protected static final String IGNORE_OUTSIDE_CONTENT = "http://cyberneko.org/html/features/balance-tags/ignore-outside-content";

    /** Recognized features. */
    private static final String[] RECOGNIZED_FEATURES = {
        NAMESPACES,
        AUGMENTATIONS,
        REPORT_ERRORS,
        DOCUMENT_FRAGMENT_DEPRECATED,
        DOCUMENT_FRAGMENT,
        IGNORE_OUTSIDE_CONTENT,
    };

    /** Recognized features defaults. */
    private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = {
        null,
        null,
        null,
        null,
        Boolean.FALSE,
        Boolean.FALSE,
    };

    // properties

    /** Modify HTML element names: { "upper", "lower", "default" }. */
    protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";

    /** Modify HTML attribute names: { "upper", "lower", "default" }. */
    protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";
   
    /** Error reporter. */
    protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";

    /**
     * <font color="red">EXPERIMENTAL: may change in next release</font><br/>
     * Name of the property holding the stack of elements in which context a document fragment should be parsed.
     **/
    public static final String FRAGMENT_CONTEXT_STACK = "http://cyberneko.org/html/properties/balance-tags/fragment-context-stack";

    /** Recognized properties. */
    private static final String[] RECOGNIZED_PROPERTIES = {
        NAMES_ELEMS,
        NAMES_ATTRS,
        ERROR_REPORTER,
        FRAGMENT_CONTEXT_STACK,
    };

    /** Recognized properties defaults. */
    private static final Object[] RECOGNIZED_PROPERTIES_DEFAULTS = {
        null,
        null,
        null,
        null,
    };

    // modify HTML names

    /** Don't modify HTML names. */
    protected static final short NAMES_NO_CHANGE = 0;

    /** Match HTML element names. */
    protected static final short NAMES_MATCH = 0;

    /** Uppercase HTML names. */
    protected static final short NAMES_UPPERCASE = 1;

    /** Lowercase HTML names. */
    protected static final short NAMES_LOWERCASE = 2;

    // static vars

    /** Synthesized event info item. */
    protected static final HTMLEventInfo SYNTHESIZED_ITEM =
        new HTMLEventInfo.SynthesizedItem();

    //
    // Data
    //

    // features

    /** Namespaces. */
    protected boolean fNamespaces;

    /** Include infoset augmentations. */
    protected boolean fAugmentations;
   
    /** Report errors. */
    protected boolean fReportErrors;

    /** Document fragment balancing only. */
    protected boolean fDocumentFragment;

    /** Ignore outside content. */
    protected boolean fIgnoreOutsideContent;

    /** Allows self closing iframe tags. */
    protected boolean fAllowSelfclosingIframe;

    // properties

    /** Modify HTML element names. */
    protected short fNamesElems;

    /** Modify HTML attribute names. */
    protected short fNamesAttrs;

    /** Error reporter. */
    protected HTMLErrorReporter fErrorReporter;

    // connections

    /** The document source. */
    protected XMLDocumentSource fDocumentSource;

    /** The document handler. */
    protected XMLDocumentHandler fDocumentHandler;

    // state

    /** The element stack. */
    protected final InfoStack fElementStack = new InfoStack();

    /** The inline stack. */
    protected final InfoStack fInlineStack = new InfoStack();

    /** True if seen anything. Important for xml declaration. */
    protected boolean fSeenAnything;

    /** True if a doctype declaration has been seen. */
    protected boolean fSeenDoctype;

    /** True if root element has been seen. */
    protected boolean fSeenRootElement;

    /**
     * True if seen the end of the document element. In other words,
     * this variable is set to false <em>until</em> the end &lt;/HTML&gt;
     * tag is seen (or synthesized). This is used to ensure that
     * extraneous events after the end of the document element do not
     * make the document stream ill-formed.
     */
    protected boolean fSeenRootElementEnd;

    /** True if seen &lt;head&gt; element. */
    protected boolean fSeenHeadElement;

    /** True if seen &lt;body&gt; element. */
    protected boolean fSeenBodyElement;

    /** True if a form is in the stack (allow to discard opening of nested forms) */
    protected boolean fOpenedForm;

    // temp vars

    /** A qualified name. */
    private final QName fQName = new QName();

    /** Empty attributes. */
    private final XMLAttributes fEmptyAttrs = new XMLAttributesImpl();

    /** Augmentations. */
    private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations();

    protected HTMLTagBalancingListener tagBalancingListener;
    private LostText lostText_ = new LostText();

    private boolean forcedStartElement_ = false;
    private boolean forcedEndElement_ = false;

    /**
     * Stack of elements determining the context in which a document fragment should be parsed
     */
  private QName[] fragmentContextStack_ = null;
  private int fragmentContextStackSize_ = 0; // not 0 only when a fragment is parsed and fragmentContextStack_ is set

    private List/*ElementEntry*/ endElementsBuffer_ = new ArrayList();

    //
    // HTMLComponent methods
    //

    /** Returns the default state for a feature. */
    public Boolean getFeatureDefault(String featureId) {
        int length = RECOGNIZED_FEATURES != null ? RECOGNIZED_FEATURES.length : 0;
        for (int i = 0; i < length; i++) {
            if (RECOGNIZED_FEATURES[i].equals(featureId)) {
                return RECOGNIZED_FEATURES_DEFAULTS[i];
            }
        }
        return null;
    } // getFeatureDefault(String):Boolean

    /** Returns the default state for a property. */
    public Object getPropertyDefault(String propertyId) {
        int length = RECOGNIZED_PROPERTIES != null ? RECOGNIZED_PROPERTIES.length : 0;
        for (int i = 0; i < length; i++) {
            if (RECOGNIZED_PROPERTIES[i].equals(propertyId)) {
                return RECOGNIZED_PROPERTIES_DEFAULTS[i];
            }
        }
        return null;
    } // getPropertyDefault(String):Object

    //
    // XMLComponent methods
    //

    /** Returns recognized features. */
    public String[] getRecognizedFeatures() {
        return RECOGNIZED_FEATURES;
    } // getRecognizedFeatures():String[]

    /** Returns recognized properties. */
    public String[] getRecognizedProperties() {
        return RECOGNIZED_PROPERTIES;
    } // getRecognizedProperties():String[]

    /** Resets the component. */
    public void reset(final XMLComponentManager manager)
        throws XMLConfigurationException {

        // get features
        fNamespaces = manager.getFeature(NAMESPACES);
        fAugmentations = manager.getFeature(AUGMENTATIONS);
        fReportErrors = manager.getFeature(REPORT_ERRORS);
        fDocumentFragment = manager.getFeature(DOCUMENT_FRAGMENT) ||
                            manager.getFeature(DOCUMENT_FRAGMENT_DEPRECATED);
        fIgnoreOutsideContent = manager.getFeature(IGNORE_OUTSIDE_CONTENT);
        fAllowSelfclosingIframe = manager.getFeature(HTMLScanner.ALLOW_SELFCLOSING_IFRAME);

        // get properties
        fNamesElems = getNamesValue(String.valueOf(manager.getProperty(NAMES_ELEMS)));
        fNamesAttrs = getNamesValue(String.valueOf(manager.getProperty(NAMES_ATTRS)));
        fErrorReporter = (HTMLErrorReporter)manager.getProperty(ERROR_REPORTER);
       
        fragmentContextStack_ = (QName[]) manager.getProperty(FRAGMENT_CONTEXT_STACK);

    } // reset(XMLComponentManager)

    /** Sets a feature. */
    public void setFeature(String featureId, boolean state)
        throws XMLConfigurationException {

        if (featureId.equals(AUGMENTATIONS)) {
            fAugmentations = state;
            return;
        }
        if (featureId.equals(REPORT_ERRORS)) {
            fReportErrors = state;
            return;
        }
        if (featureId.equals(IGNORE_OUTSIDE_CONTENT)) {
            fIgnoreOutsideContent = state;
            return;
        }

    } // setFeature(String,boolean)

    /** Sets a property. */
    public void setProperty(String propertyId, Object value)
        throws XMLConfigurationException {
   
        if (propertyId.equals(NAMES_ELEMS)) {
            fNamesElems = getNamesValue(String.valueOf(value));
            return;
        }

        if (propertyId.equals(NAMES_ATTRS)) {
            fNamesAttrs = getNamesValue(String.valueOf(value));
            return;
        }

    } // setProperty(String,Object)

    //
    // XMLDocumentSource methods
    //

    /** Sets the document handler. */
    public void setDocumentHandler(XMLDocumentHandler handler) {
        fDocumentHandler = handler;
    } // setDocumentHandler(XMLDocumentHandler)

    // @since Xerces 2.1.0

    /** Returns the document handler. */
    public XMLDocumentHandler getDocumentHandler() {
        return fDocumentHandler;
    } // getDocumentHandler():XMLDocumentHandler

    //
    // XMLDocumentHandler methods
    //

    // since Xerces-J 2.2.0

    /** Start document. */
    public void startDocument(XMLLocator locator, String encoding,
                              NamespaceContext nscontext, Augmentations augs)
        throws XNIException {

        // reset state
        fElementStack.top = 0;
        if (fragmentContextStack_ != null) {
          fragmentContextStackSize_ = fragmentContextStack_.length;
          for (int i=0; i<fragmentContextStack_.length; ++i) {
            final QName name = fragmentContextStack_[i];
              final Element elt = HTMLElements.getElement(name.localpart);
              fElementStack.push(new Info(elt, name));
          }
         
        }
        else {
          fragmentContextStackSize_ = 0;
        }
        fSeenAnything = false;
        fSeenDoctype = false;
        fSeenRootElement = false;
        fSeenRootElementEnd = false;
        fSeenHeadElement = false;
        fSeenBodyElement = false;
       

        // pass on event
        if (fDocumentHandler != null) {
          XercesBridge.getInstance().XMLDocumentHandler_startDocument(fDocumentHandler, locator, encoding, nscontext, augs);
        }
   
    } // startDocument(XMLLocator,String,Augmentations)

    // old methods

    /** XML declaration. */
    public void xmlDecl(String version, String encoding, String standalone,
                        Augmentations augs) throws XNIException {
        if (!fSeenAnything && fDocumentHandler != null) {
            fDocumentHandler.xmlDecl(version, encoding, standalone, augs);
        }
    } // xmlDecl(String,String,String,Augmentations)

    /** Doctype declaration. */
    public void doctypeDecl(String rootElementName, String publicId, String systemId,
                            Augmentations augs) throws XNIException {
        fSeenAnything = true;
        if (fReportErrors) {
            if (fSeenRootElement) {
                fErrorReporter.reportError("HTML2010", null);
            }
            else if (fSeenDoctype) {
                fErrorReporter.reportError("HTML2011", null);
            }
        }
        if (!fSeenRootElement && !fSeenDoctype) {
            fSeenDoctype = true;
            if (fDocumentHandler != null) {
                fDocumentHandler.doctypeDecl(rootElementName, publicId, systemId, augs);
            }
        }
    } // doctypeDecl(String,String,String,Augmentations)

    /** End document. */
    public void endDocument(Augmentations augs) throws XNIException {

      // </body> and </html> have been buffered to consider outside content
      fIgnoreOutsideContent = true; // endElement should not ignore the elements passed from buffer
      consumeBufferedEndElements();
     
        // handle empty document
        if (!fSeenRootElement && !fDocumentFragment) {
            if (fReportErrors) {
                fErrorReporter.reportError("HTML2000", null);
            }
            if (fDocumentHandler != null) {
              fSeenRootElementEnd = false;
                forceStartBody(); // will force <html> and <head></head>
                final String body = modifyName("body", fNamesElems);
                fQName.setValues(null, body, body, null);
                callEndElement(fQName, synthesizedAugs());

                final String ename = modifyName("html", fNamesElems);
                fQName.setValues(null, ename, ename, null);
                callEndElement(fQName, synthesizedAugs());
            }
        }

        // pop all remaining elements
        else {
            int length = fElementStack.top - fragmentContextStackSize_;
            for (int i = 0; i < length; i++) {
                Info info = fElementStack.pop();
                if (fReportErrors) {
                    String ename = info.qname.rawname;
                    fErrorReporter.reportWarning("HTML2001", new Object[]{ename});
                }
                if (fDocumentHandler != null) {
                    callEndElement(info.qname, synthesizedAugs());
                }
            }
        }

        // call handler
        if (fDocumentHandler != null) {
            fDocumentHandler.endDocument(augs);
        }

    } // endDocument(Augmentations)

    /**
     * Consume elements that have been buffered, like </body></html> that are first consumed
     * at the end of document
     */
  private void consumeBufferedEndElements() {
    final List toConsume = new ArrayList(endElementsBuffer_);
    endElementsBuffer_.clear();
    for (int i=0; i<toConsume.size(); ++i) {
        final ElementEntry entry = (ElementEntry) toConsume.get(i);
        forcedEndElement_ = true;
          endElement(entry.name_, entry.augs_);
      }
    endElementsBuffer_.clear();
  }

    /** Comment. */
    public void comment(XMLString text, Augmentations augs) throws XNIException {
        fSeenAnything = true;
        consumeEarlyTextIfNeeded();
        if (fDocumentHandler != null) {
            fDocumentHandler.comment(text, augs);
        }
    } // comment(XMLString,Augmentations)

  private void consumeEarlyTextIfNeeded() {
    if (!lostText_.isEmpty()) {
          if (!fSeenBodyElement) {
            forceStartBody();
          }
            lostText_.refeed(this);
        }
  }

    /** Processing instruction. */
    public void processingInstruction(String target, XMLString data,
                                      Augmentations augs) throws XNIException {
        fSeenAnything = true;
        consumeEarlyTextIfNeeded();
        if (fDocumentHandler != null) {
            fDocumentHandler.processingInstruction(target, data, augs);
        }
    } // processingInstruction(String,XMLString,Augmentations)

    /**
     * Start element.
     * <p>
     * Balances the incoming start tag before forwarding it. Start tags arriving after
     * the end of the root element, an html element once a root element has been seen,
     * repeated head or body elements and nested forms are passed to
     * <code>notifyDiscardedStartElement</code> and dropped. A missing parent is
     * synthesized through <code>forceStartElement</code>, open elements that the new
     * element closes are ended, and inline elements that were ended to make room for
     * the new element are re-opened after it has been started.
     * <p>
     * This method is re-entered through <code>forceStartElement</code>; the
     * <code>forcedStartElement_</code> flag tells such a nested call that the element
     * was synthesized.
     *
     * @param elem the name of the element being started
     * @param attrs the attributes of the element; when <code>null</code>, the result of
     *        <code>emptyAttributes()</code> is passed to the document handler instead
     * @param augs the augmentations forwarded with the event
     * @throws XNIException propagated from the error reporter or the document handler
     */
    public void startElement(final QName elem, XMLAttributes attrs, final Augmentations augs)
        throws XNIException {
        fSeenAnything = true;
       
        // remember whether this call was triggered by forceStartElement and clear the
        // flag right away so that nested calls start from a clean state
        final boolean isForcedCreation = forcedStartElement_;
        forcedStartElement_ = false;

        // check for end of document
        if (fSeenRootElementEnd) {
          notifyDiscardedStartElement(elem, attrs, augs);
            return;
        }

        // get element information
        final HTMLElements.Element element = getElement(elem);
        final short elementCode = element.code;

        // the creation of some elements like TABLE or SELECT can't be forced. Any others?
        if (isForcedCreation && (elementCode == HTMLElements.TABLE || elementCode == HTMLElements.SELECT)) {
          return; // don't accept creation
        }

        // ignore multiple html, head, body elements
    if (fSeenRootElement && elementCode == HTMLElements.HTML) {
          notifyDiscardedStartElement(elem, attrs, augs);
            return;
        }
        if (elementCode == HTMLElements.HEAD) {
            if (fSeenHeadElement) {
              notifyDiscardedStartElement(elem, attrs, augs);
                return;
            }
            fSeenHeadElement = true;
        }
        else if (elementCode == HTMLElements.FRAMESET) {
          consumeBufferedEndElements(); // </head> (if any) has been buffered
        }
        else if (elementCode == HTMLElements.BODY) {
        // create <head></head> if none was present
        if (!fSeenHeadElement) {
          final QName head = createQName("head");
          forceStartElement(head, null, synthesizedAugs());
          endElement(head, synthesizedAugs());
        }
          consumeBufferedEndElements(); // </head> (if any) has been buffered
       
            if (fSeenBodyElement) {
              notifyDiscardedStartElement(elem, attrs, augs);
                return;
            }
            fSeenBodyElement = true;
        }
        else if (elementCode == HTMLElements.FORM) {
          // nested forms are discarded
          if (fOpenedForm) {
              notifyDiscardedStartElement(elem, attrs, augs);
            return;
          }
          fOpenedForm = true;
        }
        else if (elementCode == HTMLElements.UNKNOWN) {
          consumeBufferedEndElements();
        }

        // check proper parent
        if (element.parent != null) {
            // the first declared parent is the one that gets synthesized when needed
            final HTMLElements.Element preferedParent = element.parent[0];
          if (fDocumentFragment && (preferedParent.code == HTMLElements.HEAD || preferedParent.code == HTMLElements.BODY)) {
            // nothing, don't force HEAD or BODY creation for a document fragment
          }
          else if (!fSeenRootElement && !fDocumentFragment) {
                // nothing has been started yet: open the preferred parent first
                String pname = preferedParent.name;
                pname = modifyName(pname, fNamesElems);
                if (fReportErrors) {
                    String ename = elem.rawname;
                    fErrorReporter.reportWarning("HTML2002", new Object[]{ename,pname});
                }
                final QName qname = new QName(null, pname, pname, null);
                final boolean parentCreated = forceStartElement(qname, null, synthesizedAugs());
                if (!parentCreated) {
                  // without its parent the element cannot be started; only notify for
                  // elements that really came from the input
                  if (!isForcedCreation) {
                    notifyDiscardedStartElement(elem, attrs, augs);
                  }
                return;
                }
            }
          else {
                if (preferedParent.code != HTMLElements.HEAD || (!fSeenBodyElement && !fDocumentFragment)) {
                    int depth = getParentDepth(element.parent, element.bounds);
                    if (depth == -1) { // no parent found
                        final String pname = modifyName(preferedParent.name, fNamesElems);
                        final QName qname = new QName(null, pname, pname, null);
                        if (fReportErrors) {
                            String ename = elem.rawname;
                            fErrorReporter.reportWarning("HTML2004", new Object[]{ename,pname});
                        }
                        final boolean parentCreated = forceStartElement(qname, null, synthesizedAugs());
                        if (!parentCreated) {
                          if (!isForcedCreation) {
                            notifyDiscardedStartElement(elem, attrs, augs);
                          }
                        return;
                        }
                    }
                }
            }
        }

        // if block element, save immediate parent inline elements
        // (they are ended here and re-opened at the end of this method)
        int depth = 0;
        if (element.flags == 0) {
            int length = fElementStack.top;
            fInlineStack.top = 0;
            for (int i = length - 1; i >= 0; i--) {
                Info info = fElementStack.data[i];
                if (!info.element.isInline()) {
                    break;
                }
                fInlineStack.push(info);
                endElement(info.qname, synthesizedAugs());
            }
            depth = fInlineStack.top;
        }

        // close previous elements
        // all elements close a <script>
        // in head, no element has children
        if ((fElementStack.top > 1
            && (fElementStack.peek().element.code == HTMLElements.SCRIPT))
            || fElementStack.top > 2 && fElementStack.data[fElementStack.top-2].element.code == HTMLElements.HEAD) {
            final Info info = fElementStack.pop();
            if (fDocumentHandler != null) {
                callEndElement(info.qname, synthesizedAugs());
            }
        }
        if (element.closes != null) {
            // walk the stack from the top, ending everything down to and including each
            // element that the new element closes
            int length = fElementStack.top;
            for (int i = length - 1; i >= 0; i--) {
                Info info = fElementStack.data[i];

                // does it close the element we're looking at?
                if (element.closes(info.element.code)) {
                    if (fReportErrors) {
                        String ename = elem.rawname;
                        String iname = info.qname.rawname;
                        fErrorReporter.reportWarning("HTML2005", new Object[]{ename,iname});
                    }
                    for (int j = length - 1; j >= i; j--) {
                        info = fElementStack.pop();
                        if (fDocumentHandler != null) {
                            // PATCH: Marc-André Morissette
                            callEndElement(info.qname, synthesizedAugs());
                        }
                    }
                    // the stack now ends just below the closed element
                    length = i;
                    continue;
                }
               
                // should we stop searching?
                if (info.element.isBlock() || element.isParent(info.element)) {
                  break;
                }
            }
        }
        // TODO: investigate if only table is special here
        // table closes all opened inline elements
        else if (elementCode == HTMLElements.TABLE) {
            for (int i=fElementStack.top-1; i >= 0; i--) {
                final Info info = fElementStack.data[i];
                if (!info.element.isInline()) {
                    break;
                }
                endElement(info.qname, synthesizedAugs());
            }
        }

        // call handler
        fSeenRootElement = true;
        if (element != null && element.isEmpty()) {
            // empty elements are forwarded as a single event and never pushed on the stack
            if (attrs == null) {
                attrs = emptyAttributes();
            }
            if (fDocumentHandler != null) {
                fDocumentHandler.emptyElement(elem, attrs, augs);
            }
        }
        else {
            // attributes are only kept for inline elements, which may have to be re-opened
            boolean inline = element != null && element.isInline();
            fElementStack.push(new Info(element, elem, inline ? attrs : null));
            if (attrs == null) {
                attrs = emptyAttributes();
            }
            if (fDocumentHandler != null) {
                callStartElement(elem, attrs, augs);
            }
        }

        // re-open inline elements
        for (int i = 0; i < depth; i++) {
            Info info = fInlineStack.pop();
            forceStartElement(info.qname, info.attributes, synthesizedAugs());
        }

        // text stored in lostText_ is fed back once the body has been started
        if (elementCode == HTMLElements.BODY) {
          lostText_.refeed(this);
        }
    } // startElement(QName,XMLAttributes,Augmentations)

    /**
     * Forces an element start, taking care to set the information to allow startElement to "see" that's
     * the element has been forced.
     * @return <code>true</code> if creation could be done (TABLE's creation for instance can't be forced)
     */
    private boolean forceStartElement(final QName elem, XMLAttributes attrs, final Augmentations augs)
    throws XNIException {
     
      forcedStartElement_ = true;
      startElement(elem, attrs, augs);
     
      return fElementStack.top > 0 && elem.equals(fElementStack.peek().qname);
    }

    private QName createQName(String tagName) {
    tagName = modifyName(tagName, fNamesElems);
    return new QName(null, tagName, tagName, NamespaceBinder.XHTML_1_0_URI);
  }

  /** Empty element. */
    public void emptyElement(final QName element, XMLAttributes attrs, Augmentations augs)
        throws XNIException {
      startElement(element, attrs, augs);
        // browser ignore the closing indication for non empty tags like <form .../> but not for unknown element
        final HTMLElements.Element elem = getElement(element);
        if (elem.isEmpty()
            || elem.code == HTMLElements.UNKNOWN
            || (elem.code == HTMLElements.IFRAME && fAllowSelfclosingIframe)) {
          endElement(element, augs);
        }
    } // emptyElement(QName,XMLAttributes,Augmentations)

  /** Start entity. */
    public void startGeneralEntity(String name,
                                   XMLResourceIdentifier id,
                                   String encoding,
                                   Augmentations augs) throws XNIException {
        fSeenAnything = true;

        // check for end of document
        if (fSeenRootElementEnd) {
            return;
        }

        // insert body, if needed
        if (!fDocumentFragment) {
            boolean insertBody = !fSeenRootElement;
            if (!insertBody) {
                Info info = fElementStack.peek();
                if (info.element.code == HTMLElements.HEAD ||
                    info.element.code == HTMLElements.HTML) {
                    String hname = modifyName("head", fNamesElems);
                    String bname = modifyName("body", fNamesElems);
                    if (fReportErrors) {
                        fErrorReporter.reportWarning("HTML2009", new Object[]{hname,bname});
                    }
                    fQName.setValues(null, hname, hname, null);
                    endElement(fQName, synthesizedAugs());
                    insertBody = true;
                }
            }
            if (insertBody) {
                forceStartBody();
            }
        }
       
        // call handler
        if (fDocumentHandler != null) {
            fDocumentHandler.startGeneralEntity(name, id, encoding, augs);
        }

    } // startGeneralEntity(String,XMLResourceIdentifier,String,Augmentations)

    /**
     * Generates a missing <body> (which creates missing <head> when needed)
     */
  private void forceStartBody() {
    final QName body = createQName("body");
    if (fReportErrors) {
        fErrorReporter.reportWarning("HTML2006", new Object[]{body.localpart});
    }
    forceStartElement(body, null, synthesizedAugs());
  }

    /** Text declaration. */
    public void textDecl(String version, String encoding, Augmentations augs)
        throws XNIException {
        fSeenAnything = true;
       
        // check for end of document
        if (fSeenRootElementEnd) {
            return;
        }

        // call handler
        if (fDocumentHandler != null) {
            fDocumentHandler.textDecl(version, encoding, augs);
        }

    } // textDecl(String,String,Augmentations)

    /** End entity. */
    public void endGeneralEntity(String name, Augmentations augs) throws XNIException {
       
        // check for end of document
        if (fSeenRootElementEnd) {
            return;
        }

        // call handler
        if (fDocumentHandler != null) {
            fDocumentHandler.endGeneralEntity(name, augs);
        }

    } // endGeneralEntity(String,Augmentations)

    /** Start CDATA section. */
    public void startCDATA(Augmentations augs) throws XNIException {
        fSeenAnything = true;
       
        consumeEarlyTextIfNeeded();

        // check for end of document
        if (fSeenRootElementEnd) {
            return;
        }

        // call handler
        if (fDocumentHandler != null) {
            fDocumentHandler.startCDATA(augs);
        }

    } // startCDATA(Augmentations)

    /** End CDATA section. */
    public void endCDATA(Augmentations augs) throws XNIException {

        // check for end of document
        if (fSeenRootElementEnd) {
            return;
        }

        // call handler
        if (fDocumentHandler != null) {
            fDocumentHandler.endCDATA(augs);
        }

    } // endCDATA(Augmentations)

    /** Characters. */
    public void characters(final XMLString text, final Augmentations augs) throws XNIException {
        // check for end of document
        if (fSeenRootElementEnd) {
            return;
        }

      if (fElementStack.top == 0 && !fDocumentFragment) {
        // character before first opening tag
        lostText_.add(text, augs);
        return;
      }

        // is this text whitespace?
        boolean whitespace = true;
        for (int i = 0; i < text.length; i++) {
            if (!Character.isWhitespace(text.ch[text.offset + i])) {
                whitespace = false;
                break;
            }
        }

        if (!fDocumentFragment) {
            // handle bare characters
            if (!fSeenRootElement) {
                if (whitespace) {
                    return;
                }
                forceStartBody();
            }
           
            if (whitespace && (fElementStack.top < 2 || endElementsBuffer_.size() == 1)) {
              // ignore spaces directly within <html>
              return;
            }

            // handle character content in head
            // NOTE: This frequently happens when the document looks like:
            //       <title>Title</title>
            //       And here's some text.
            else if (!whitespace) {
                Info info = fElementStack.peek();
                if (info.element.code == HTMLElements.HEAD ||
                    info.element.code == HTMLElements.HTML) {
                    String hname = modifyName("head", fNamesElems);
                    String bname = modifyName("body", fNamesElems);
                    if (fReportErrors) {
                        fErrorReporter.reportWarning("HTML2009", new Object[]{hname,bname});
                    }
                    forceStartBody();
                }
            }
        }

        // call handler
        if (fDocumentHandler != null) {
            fDocumentHandler.characters(text, augs);
        }

    } // characters(XMLString,Augmentations)

    /** Ignorable whitespace. */
    public void ignorableWhitespace(XMLString text, Augmentations augs)
        throws XNIException {
        characters(text, augs);
    } // ignorableWhitespace(XMLString,Augmentations)
   
    /**
     * End element.
     * <p>
     * Processes an end tag and keeps the element stack balanced. In order:
     * <ol>
     * <li>If the root element has already ended, the end tag is reported as
     *     discarded and nothing else happens.
     * <li>When outside content is not ignored, end tags for BODY and HTML are
     *     stored in {@code endElementsBuffer_} instead of being processed.
     * <li>An HTML end tag marks the end of the root element; a FORM end tag
     *     clears {@code fOpenedForm}; a HEAD end tag is buffered unless the
     *     forced-end flag was set when this method was entered.
     * <li>If no matching open element is found, a P end tag is turned into an
     *     empty paragraph (synthesized start followed by this end), any other
     *     non-empty element is reported as discarded, and the method returns.
     * <li>Otherwise every element above the match is closed (warning HTML2007
     *     for each intermediate one), the match itself is closed, and the
     *     inline elements that were closed on the way are re-opened (warning
     *     HTML2008 for each).
     * </ol>
     *
     * @param element The qualified name of the element being closed.
     * @param augs    Additional information passed along with the event.
     */
    public void endElement(final QName element, final Augmentations augs) throws XNIException {
      // snapshot of the flag taken on entry, before any nested call can run
      final boolean forcedEndElement = forcedEndElement_;
        // is there anything to do?
        if (fSeenRootElementEnd) {
          notifyDiscardedEndElement(element, augs);
            return;
        }

        // get element information
        HTMLElements.Element elem = getElement(element);

        // if we consider outside content, just buffer </body> and </html> to consider them at the very end
        if (!fIgnoreOutsideContent &&
            (elem.code == HTMLElements.BODY || elem.code == HTMLElements.HTML)) {
          endElementsBuffer_.add(new ElementEntry(element, augs));
            return;
        }

        // check for end of document
        if (elem.code == HTMLElements.HTML) {
            fSeenRootElementEnd = true;
        }
        else if (elem.code == HTMLElements.FORM) {
          fOpenedForm = false;
        }
        else if (elem.code == HTMLElements.HEAD && !forcedEndElement) {
          // consume </head> first when <body> is reached to retrieve content lost between </head> and <body>
          endElementsBuffer_.add(new ElementEntry(element, augs));
          return;
        }

        // no matching open element (getElementDepth returned -1)
        int depth = getElementDepth(elem);
        if (depth == -1) {
          if (elem.code == HTMLElements.P) {
            // a stray </p> becomes an empty paragraph: open one, then close it
            forceStartElement(element, emptyAttributes(), synthesizedAugs());
              endElement(element, augs);
          }
          else if (!elem.isEmpty()) {
              notifyDiscardedEndElement(element, augs);
          }
            return;
        }

        // find unbalanced inline elements
        if (depth > 1 && elem.isInline()) {
            final int size = fElementStack.top;
            // reset the inline stack before collecting the elements to re-open
            fInlineStack.top = 0;
            // visit the open elements above the match, innermost first
            for (int i = 0; i < depth - 1; i++) {
                final Info info = fElementStack.data[size - i - 1];
                final HTMLElements.Element pelem = info.element;
                if (pelem.isInline() || pelem.code == HTMLElements.FONT) { // TODO: investigate if only FONT
                    // NOTE: I don't have to make a copy of the info because
                    //       it will just be popped off of the element stack
                    //       as soon as we close it, anyway.
                    fInlineStack.push(info);
                }
            }
        }

        // close children up to appropriate element
        for (int i = 0; i < depth; i++) {
            Info info = fElementStack.pop();
            // every pop except the last one closes an element that was left open
            if (fReportErrors && i < depth - 1) {
                String ename = modifyName(element.rawname, fNamesElems);
                String iname = info.qname.rawname;
                fErrorReporter.reportWarning("HTML2007", new Object[]{ename,iname});
            }
            if (fDocumentHandler != null) {
                // PATCH: Marc-André Morissette
                // only the final (matching) end element carries the caller's
                // augmentations; the intermediate ones are marked synthesized
                callEndElement(info.qname, i < depth - 1 ? synthesizedAugs() : augs);
            }
        }

        // re-open inline elements
        if (depth > 1) {
            int size = fInlineStack.top;
            for (int i = 0; i < size; i++) {
                Info info = (Info)fInlineStack.pop();
                XMLAttributes attributes = info.attributes;
                if (fReportErrors) {
                    String iname = info.qname.rawname;
                    fErrorReporter.reportWarning("HTML2008", new Object[]{iname});
                }
                forceStartElement(info.qname, attributes, synthesizedAugs());
            }
        }

    } // endElement(QName,Augmentations)

    // @since Xerces 2.1.0

  /** Sets the document source. */
    public void setDocumentSource(XMLDocumentSource source) {
        fDocumentSource = source;
    } // setDocumentSource(XMLDocumentSource)

    /** Returns the document source. */
    public XMLDocumentSource getDocumentSource() {
        return fDocumentSource;
    } // getDocumentSource():XMLDocumentSource

    // removed since Xerces-J 2.3.0

    /** Start document. */
    public void startDocument(XMLLocator locator, String encoding, Augmentations augs)
        throws XNIException {
        startDocument(locator, encoding, null, augs);
    } // startDocument(XMLLocator,String,Augmentations)

    /** Start prefix mapping. */
    public void startPrefixMapping(String prefix, String uri, Augmentations augs)
        throws XNIException {
       
        // check for end of document
        if (fSeenRootElementEnd) {
            return;
        }

        // call handler
        if (fDocumentHandler != null) {
          XercesBridge.getInstance().XMLDocumentHandler_startPrefixMapping(fDocumentHandler, prefix, uri, augs);
        }
   
    } // startPrefixMapping(String,String,Augmentations)

    /** End prefix mapping. */
    public void endPrefixMapping(String prefix, Augmentations augs)
        throws XNIException {
       
        // check for end of document
        if (fSeenRootElementEnd) {
            return;
        }

        // call handler
        if (fDocumentHandler != null) {
          XercesBridge.getInstance().XMLDocumentHandler_endPrefixMapping(fDocumentHandler, prefix, augs);
        }
   
    } // endPrefixMapping(String,Augmentations)

    //
    // Protected methods
    //

    /** Returns an HTML element. */
    protected HTMLElements.Element getElement(final QName elementName) {
      String name = elementName.rawname;
        if (fNamespaces && NamespaceBinder.XHTML_1_0_URI.equals(elementName.uri)) {
            int index = name.indexOf(':');
            if (index != -1) {
                name = name.substring(index+1);
            }
        }
        return HTMLElements.getElement(name);
    } // getElement(String):HTMLElements.Element

    /** Call document handler start element. */
    protected final void callStartElement(QName element, XMLAttributes attrs,
                                          Augmentations augs)
        throws XNIException {
        fDocumentHandler.startElement(element, attrs, augs);
    } // callStartElement(QName,XMLAttributes,Augmentations)

    /** Call document handler end element. */
    protected final void callEndElement(QName element, Augmentations augs)
        throws XNIException {
        fDocumentHandler.endElement(element, augs);
    } // callEndElement(QName,Augmentations)

    /**
     * Returns the depth of the open tag associated with the specified
     * element name or -1 if no matching element is found.
     *
     * @param element The element.
     */
    protected final int getElementDepth(HTMLElements.Element element) {
        final boolean container = element.isContainer();
        final short elementCode = element.code;
        final boolean tableBodyOrHtml = (elementCode == HTMLElements.TABLE)
      || (elementCode == HTMLElements.BODY) || (elementCode == HTMLElements.HTML);
        int depth = -1;
        for (int i = fElementStack.top - 1; i >=fragmentContextStackSize_; i--) {
            Info info = fElementStack.data[i];
            if (info.element.code == element.code) {
                depth = fElementStack.top - i;
                break;
            }
            if (!container && info.element.isBlock()) {
                break;
            }
            if (info.element.code == HTMLElements.TABLE && !tableBodyOrHtml) {
              return -1; // current element not allowed to close a table
            }
        }
        return depth;
    } // getElementDepth(HTMLElements.Element)

    /**
     * Returns the depth of the open tag associated with the specified
     * element parent names or -1 if no matching element is found.
     *
     * @param parents The parent elements.
     */
    protected int getParentDepth(HTMLElements.Element[] parents, short bounds) {
        if (parents != null) {
            for (int i = fElementStack.top - 1; i >= 0; i--) {
                Info info = fElementStack.data[i];
                if (info.element.code == bounds) {
                    break;
                }
                for (int j = 0; j < parents.length; j++) {
                    if (info.element.code == parents[j].code) {
                        return fElementStack.top - i;
                    }
                }
            }
        }
        return -1;
    } // getParentDepth(HTMLElements.Element[],short):int

    /** Returns a set of empty attributes. */
    protected final XMLAttributes emptyAttributes() {
        fEmptyAttrs.removeAllAttributes();
        return fEmptyAttrs;
    } // emptyAttributes():XMLAttributes

    /** Returns an augmentations object with a synthesized item added. */
    protected final Augmentations synthesizedAugs() {
        HTMLAugmentations augs = null;
        if (fAugmentations) {
            augs = fInfosetAugs;
            augs.removeAllItems();
            augs.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM);
        }
        return augs;
    } // synthesizedAugs():Augmentations

    //
    // Protected static methods
    //

    /** Modifies the given name based on the specified mode. */
    protected static final String modifyName(String name, short mode) {
        switch (mode) {
            case NAMES_UPPERCASE: return name.toUpperCase();
            case NAMES_LOWERCASE: return name.toLowerCase();
        }
        return name;
    } // modifyName(String,short):String

    /**
     * Converts HTML names string value to constant value.
     *
     * @see #NAMES_NO_CHANGE
     * @see #NAMES_LOWERCASE
     * @see #NAMES_UPPERCASE
     */
    protected static final short getNamesValue(String value) {
        if (value.equals("lower")) {
            return NAMES_LOWERCASE;
        }
        if (value.equals("upper")) {
            return NAMES_UPPERCASE;
        }
        return NAMES_NO_CHANGE;
    } // getNamesValue(String):short

    //
    // Classes
    //

    /**
     * Element info for each start element. This information is used when
     * closing unbalanced inline elements. For example:
     * <pre>
     * &lt;i>unbalanced &lt;b>HTML&lt;/i> content&lt;/b>
     * </pre>
     * <p>
     * It seems that it is a waste of processing and memory to copy the
     * attributes for every start element even if there are no unbalanced
     * inline elements in the document. However, if the attributes are
     * <em>not</em> saved, then important attributes such as style
     * information would be lost.
     *
     * @author Andy Clark
     */
    public static class Info {

        //
        // Data
        //

        /** The element. */
        public HTMLElements.Element element;

        /** The element qualified name. */
        public QName qname;

        /** The element attributes. */
        public XMLAttributes attributes;

        //
        // Constructors
        //

        /**
         * Creates an element information object.
         * <p>
         * <strong>Note:</strong>
         * This constructor makes a copy of the element information.
         *
         * @param element The element qualified name.
         */
        public Info(HTMLElements.Element element, QName qname) {
            this(element, qname, null);
        } // <init>(HTMLElements.Element,QName)

        /**
         * Creates an element information object.
         * <p>
         * <strong>Note:</strong>
         * This constructor makes a copy of the element information.
         *
         * @param element The element qualified name.
         * @param attributes The element attributes.
         */
        public Info(HTMLElements.Element element,
                    QName qname, XMLAttributes attributes) {
            this.element = element;
            this.qname = new QName(qname);
            if (attributes != null) {
                int length = attributes.getLength();
                if (length > 0) {
                    QName aqname = new QName();
                    XMLAttributes newattrs = new XMLAttributesImpl();
                    for (int i = 0; i < length; i++) {
                        attributes.getName(i, aqname);
                        String type = attributes.getType(i);
                        String value = attributes.getValue(i);
                        String nonNormalizedValue = attributes.getNonNormalizedValue(i);
                        boolean specified = attributes.isSpecified(i);
                        newattrs.addAttribute(aqname, type, value);
                        newattrs.setNonNormalizedValue(i, nonNormalizedValue);
                        newattrs.setSpecified(i, specified);
                    }
                    this.attributes = newattrs;
                }
            }
        } // <init>(HTMLElements.Element,QName,XMLAttributes)

        /**
         * Simple representation to make debugging easier
         */
        public String toString() {
          return super.toString() + qname;
        }
    } // class Info

    /** Unsynchronized stack of element information. */
    public static class InfoStack {

        //
        // Data
        //

        /** The top of the stack. */
        public int top;

        /** The stack data. */
        public Info[] data = new Info[10];

        //
        // Public methods
        //

        /** Pushes element information onto the stack. */
        public void push(Info info) {
            if (top == data.length) {
                Info[] newarray = new Info[top + 10];
                System.arraycopy(data, 0, newarray, 0, top);
                data = newarray;
            }
            data[top++] = info;
        } // push(Info)

        /** Peeks at the top of the stack. */
        public Info peek() {
            return data[top-1];
        } // peek():Info

        /** Pops the top item off of the stack. */
        public Info pop() {
            return data[--top];
        } // pop():Info
       
        /**
         * Simple representation to make debugging easier
         */
        public String toString() {
          final StringBuffer sb = new StringBuffer("InfoStack(");
          for (int i=top-1; i>=0; --i) {
            sb.append(data[i]);
            if (i != 0)
              sb.append(", ");
          }
          sb.append(")");
          return sb.toString();
        }


    } // class InfoStack

  void setTagBalancingListener(final HTMLTagBalancingListener tagBalancingListener) {
    this.tagBalancingListener = tagBalancingListener;
  }

  /**
   * Notifies the tagBalancingListener (if any) of an ignored start element
   */
    private void notifyDiscardedStartElement(final QName elem, final XMLAttributes attrs,
        final Augmentations augs) {
      if (tagBalancingListener != null)
        tagBalancingListener.ignoredStartElement(elem, attrs, augs);
  }

  /**
   * Notifies the tagBalancingListener (if any) of an ignored end element
   */
    private void notifyDiscardedEndElement(final QName element, final Augmentations augs) {
      if (tagBalancingListener != null)
        tagBalancingListener.ignoredEndElement(element, augs);
  }

    /**
     * Structure to hold information about an element placed in buffer to be comsumed later
     */
    static class ElementEntry {
      private final QName name_;
      private final Augmentations augs_;
      ElementEntry(final QName element, final Augmentations augs) {
        name_ = new QName(element);
        augs_ = (augs == null) ? null : new HTMLAugmentations(augs);
      }
    }
} // class HTMLTagBalancer
TOP

Related Classes of org.cyberneko.html.HTMLTagBalancer$ElementEntry

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.