// Package org.ccil.cowan.tagsoup
//
// Source Code of org.ccil.cowan.tagsoup.Parser

// This file is part of TagSoup.
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.  You may also distribute
// and/or modify it under version 3.0 of the Academic Free License.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
//
// The TagSoup parser

package org.ccil.cowan.tagsoup;
import ch.randelshofer.scorm.TagSoupAutoDetector;
import java.util.HashMap;
import java.util.ArrayList;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.ext.LexicalHandler;


/**
* The SAX parser class.
**/
public class Parser extends DefaultHandler implements ScanHandler, XMLReader, LexicalHandler {
   
    // XMLReader implementation
   
    // Handlers installed through the XMLReader setters.  Each one starts out
    // as this Parser itself (it extends DefaultHandler); the matching getters
    // report that self-reference to callers as null.
    private ContentHandler theContentHandler = this;
    private LexicalHandler theLexicalHandler = this;
    private DTDHandler theDTDHandler = this;
    private ErrorHandler theErrorHandler = this;
    private EntityResolver theEntityResolver = this;
    // Pluggable collaborators, replaceable through setProperty().  Any that
    // are still null when a parse starts are given defaults by setup().
    private Schema theSchema;
    private Scanner theScanner;
    private AutoDetector theAutoDetector;
    // Feature flags: fast-access copies of the corresponding entries in the
    // feature table, updated by setFeature() whenever one of them changes.
    private boolean namespaces = true;
    private boolean ignoreBogons = false;
    private boolean bogonsEmpty = true;
    private boolean defaultAttributes = true;
    private boolean translateColons = false;
    private boolean restartElements = true;
    private boolean ignorableWhitespace = false;
    private boolean CDATAElements = true;
   
    /**
     * A value of "true" indicates namespace URIs and unprefixed local
     * names for element and attribute names will be available.
     * Initially true.
     **/
    public final static String namespacesFeature =
            "http://xml.org/sax/features/namespaces";

    /**
     * A value of "true" indicates that XML qualified names (with prefixes)
     * and attributes (including xmlns* attributes) will be available.
     * We don't support this value; the feature is initially false.
     **/
    public final static String namespacePrefixesFeature =
            "http://xml.org/sax/features/namespace-prefixes";

    /**
     * Reports whether this parser processes external general entities
     * (it doesn't).  Initially false.
     **/
    public final static String externalGeneralEntitiesFeature =
            "http://xml.org/sax/features/external-general-entities";

    /**
     * Reports whether this parser processes external parameter entities
     * (it doesn't).  Initially false.
     **/
    public final static String externalParameterEntitiesFeature =
            "http://xml.org/sax/features/external-parameter-entities";

    /**
     * May be examined only during a parse, after the startDocument()
     * callback has been completed; read-only. The value is true if
     * the document specified standalone="yes" in its XML declaration,
     * and otherwise is false.  (It's always false.)
     **/
    public final static String isStandaloneFeature =
            "http://xml.org/sax/features/is-standalone";

    /**
     * A value of "true" indicates that the LexicalHandler will report
     * the beginning and end of parameter entities (it won't).
     * Initially false.
     **/
    public final static String lexicalHandlerParameterEntitiesFeature =
            "http://xml.org/sax/features/lexical-handler/parameter-entities";

    /**
     * A value of "true" indicates that system IDs in declarations will
     * be absolutized (relative to their base URIs) before reporting.
     * (This returns true but doesn't actually do anything.)
     **/
    public final static String resolveDTDURIsFeature =
            "http://xml.org/sax/features/resolve-dtd-uris";

    /**
     * Has a value of "true" if all XML names (for elements,
     * prefixes, attributes, entities, notations, and local
     * names), as well as Namespace URIs, will have been interned
     * using java.lang.String.intern. This supports fast testing of
     * equality/inequality against string constants, rather than forcing
     * slower calls to String.equals().  (We always intern.)
     **/
    public final static String stringInterningFeature =
            "http://xml.org/sax/features/string-interning";

    /**
     * Has a value of "true" if the Attributes objects passed by this
     * parser in ContentHandler.startElement() implement the
     * org.xml.sax.ext.Attributes2 interface.  (They don't.)
     **/
    public final static String useAttributes2Feature =
            "http://xml.org/sax/features/use-attributes2";

    /**
     * Has a value of "true" if the Locator objects passed by this parser
     * in ContentHandler.setDocumentLocator() implement the
     * org.xml.sax.ext.Locator2 interface.  (They don't.)
     **/
    public final static String useLocator2Feature =
            "http://xml.org/sax/features/use-locator2";

    /**
     * Has a value of "true" if, when setEntityResolver is given an object
     * implementing the org.xml.sax.ext.EntityResolver2 interface,
     * those new methods will be used.  (They won't be.)
     **/
    public final static String useEntityResolver2Feature =
            "http://xml.org/sax/features/use-entity-resolver2";

    /**
     * Controls whether the parser is reporting all validity errors.
     * (We don't report any validity errors.)  Initially false.
     **/
    public final static String validationFeature =
            "http://xml.org/sax/features/validation";

    /**
     * Controls whether the parser reports Unicode normalization
     * errors as described in section 2.13 and Appendix B of the XML
     * 1.1 Recommendation.  (We don't normalize.)
     **/
    public final static String unicodeNormalizationCheckingFeature =
            "http://xml.org/sax/features/unicode-normalization-checking";

    /**
     * Controls whether, when the namespace-prefixes feature is set,
     * the parser treats namespace declaration attributes as being in
     * the http://www.w3.org/2000/xmlns/ namespace.  (It doesn't.)
     * Initially false.
     **/
    public final static String xmlnsURIsFeature =
            "http://xml.org/sax/features/xmlns-uris";

    /**
     * Has a value of "true" if the parser supports both XML 1.1 and
     * XML 1.0.  (Always false.)
     **/
    public final static String XML11Feature =
            "http://xml.org/sax/features/xml-1.1";
   
    /**
     * A value of "true" indicates that the parser will ignore
     * unknown elements.  Initially false.
     **/
    public final static String ignoreBogonsFeature =
            "http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons";

    /**
     * A value of "true" indicates that the parser will give unknown
     * elements a content model of EMPTY; a value of "false", a
     * content model of ANY.  Initially true.
     **/
    public final static String bogonsEmptyFeature =
            "http://www.ccil.org/~cowan/tagsoup/features/bogons-empty";

    /**
     * A value of "true" indicates that the parser will return default
     * attribute values for missing attributes that have default values.
     * Initially true.
     **/
    public final static String defaultAttributesFeature =
            "http://www.ccil.org/~cowan/tagsoup/features/default-attributes";

    /**
     * A value of "true" indicates that the parser will
     * translate colons into underscores in names.  Initially false.
     **/
    public final static String translateColonsFeature =
            "http://www.ccil.org/~cowan/tagsoup/features/translate-colons";

    /**
     * A value of "true" indicates that the parser will
     * attempt to restart the restartable elements.  Initially true.
     **/
    public final static String restartElementsFeature =
            "http://www.ccil.org/~cowan/tagsoup/features/restart-elements";

    /**
     * A value of "true" indicates that the parser will
     * transmit whitespace in element-only content via the SAX
     * ignorableWhitespace callback.  Normally this is not done,
     * because HTML is an SGML application and SGML suppresses
     * such whitespace.  Initially false.
     **/
    public final static String ignorableWhitespaceFeature =
            "http://www.ccil.org/~cowan/tagsoup/features/ignorable-whitespace";

    /**
     * A value of "true" indicates that the parser will treat CDATA
     * elements specially.  Normally true, since the input is by
     * default HTML.
     **/
    public final static String CDATAElementsFeature =
            "http://www.ccil.org/~cowan/tagsoup/features/cdata-elements";
   
    /**
     * Used to see some syntax events that are essential in some
     * applications: comments, CDATA delimiters, selected general
     * entity inclusions, and the start and end of the DTD (and
     * declaration of document element name). The Object must implement
     * org.xml.sax.ext.LexicalHandler.  Setting the property to null
     * makes the parser act as its own lexical handler again, in which
     * case reading the property yields null.
     **/
    public final static String lexicalHandlerProperty =
            "http://xml.org/sax/properties/lexical-handler";

    /**
     * Specifies the Scanner object this Parser uses.  The value must be
     * a Scanner; if none has been supplied by the time a parse starts,
     * an HTMLScanner is created.
     **/
    public final static String scannerProperty =
            "http://www.ccil.org/~cowan/tagsoup/properties/scanner";

    /**
     * Specifies the Schema object this Parser uses.  The value must be
     * a Schema; if none has been supplied by the time a parse starts,
     * an HTMLSchema is created.
     **/
    public final static String schemaProperty =
            "http://www.ccil.org/~cowan/tagsoup/properties/schema";

    /**
     * Specifies the AutoDetector (for encoding detection) this Parser uses.
     * The value must be an AutoDetector; it is consulted only when the
     * input is a byte stream with no declared encoding.
     **/
    public final static String autoDetectorProperty =
            "http://www.ccil.org/~cowan/tagsoup/properties/auto-detector";
   
    private HashMap theFeatures = new HashMap();
    {
        theFeatures.put(namespacesFeature, Boolean.TRUE);
        theFeatures.put(namespacePrefixesFeature, Boolean.FALSE);
        theFeatures.put(externalGeneralEntitiesFeature, Boolean.FALSE);
        theFeatures.put(externalParameterEntitiesFeature, Boolean.FALSE);
        theFeatures.put(isStandaloneFeature, Boolean.FALSE);
        theFeatures.put(lexicalHandlerParameterEntitiesFeature,
                Boolean.FALSE);
        theFeatures.put(resolveDTDURIsFeature, Boolean.TRUE);
        theFeatures.put(stringInterningFeature, Boolean.TRUE);
        theFeatures.put(useAttributes2Feature, Boolean.FALSE);
        theFeatures.put(useLocator2Feature, Boolean.FALSE);
        theFeatures.put(useEntityResolver2Feature, Boolean.FALSE);
        theFeatures.put(validationFeature, Boolean.FALSE);
        theFeatures.put(xmlnsURIsFeature, Boolean.FALSE);
        theFeatures.put(xmlnsURIsFeature, Boolean.FALSE);
        theFeatures.put(XML11Feature, Boolean.FALSE);
        theFeatures.put(ignoreBogonsFeature, Boolean.FALSE);
        theFeatures.put(bogonsEmptyFeature, Boolean.TRUE);
        theFeatures.put(defaultAttributesFeature, Boolean.TRUE);
        theFeatures.put(translateColonsFeature, Boolean.FALSE);
        theFeatures.put(restartElementsFeature, Boolean.TRUE);
        theFeatures.put(ignorableWhitespaceFeature, Boolean.FALSE);
        theFeatures.put(CDATAElementsFeature, Boolean.TRUE);
    }
   
   
    public boolean getFeature(String name)
    throws SAXNotRecognizedException, SAXNotSupportedException {
        Boolean b = (Boolean)theFeatures.get(name);
        if (b == null) {
            throw new SAXNotRecognizedException("Unknown feature " + name);
        }
        return b.booleanValue();
    }
   
    public void setFeature(String name, boolean value)
    throws SAXNotRecognizedException, SAXNotSupportedException {
        Boolean b = (Boolean)theFeatures.get(name);
        if (b == null) {
            throw new SAXNotRecognizedException("Unknown feature " + name);
        }
        if (value) theFeatures.put(name, Boolean.TRUE);
        else theFeatures.put(name, Boolean.FALSE);
       
        if (name.equals(namespacesFeature)) namespaces = value;
        else if (name.equals(ignoreBogonsFeature)) ignoreBogons = value;
        else if (name.equals(bogonsEmptyFeature)) bogonsEmpty = value;
        else if (name.equals(defaultAttributesFeature)) defaultAttributes = value;
        else if (name.equals(translateColonsFeature)) translateColons = value;
        else if (name.equals(restartElementsFeature)) restartElements = value;
        else if (name.equals(ignorableWhitespaceFeature)) ignorableWhitespace = value;
        else if (name.equals(CDATAElementsFeature)) CDATAElements = value;
    }
   
    public Object getProperty(String name)
    throws SAXNotRecognizedException, SAXNotSupportedException {
        if (name.equals(lexicalHandlerProperty)) {
            return theLexicalHandler == this ? null : theLexicalHandler;
        } else if (name.equals(scannerProperty)) {
            return theScanner;
        } else if (name.equals(schemaProperty)) {
            return theSchema;
        } else if (name.equals(autoDetectorProperty)) {
            return theAutoDetector;
        } else {
            throw new SAXNotRecognizedException("Unknown property " + name);
        }
    }
   
    public void setProperty(String name, Object value)
    throws SAXNotRecognizedException, SAXNotSupportedException {
        if (name.equals(lexicalHandlerProperty)) {
            if (value == null) {
                theLexicalHandler = this;
            } else if (value instanceof LexicalHandler) {
                theLexicalHandler = (LexicalHandler)value;
            } else {
                throw new SAXNotSupportedException("Your lexical handler is not a LexicalHandler");
            }
        } else if (name.equals(scannerProperty)) {
            if (value instanceof Scanner) {
                theScanner = (Scanner)value;
            } else {
                throw new SAXNotSupportedException("Your scanner is not a Scanner");
            }
        } else if (name.equals(schemaProperty)) {
            if (value instanceof Schema) {
                theSchema = (Schema)value;
            } else {
                throw new SAXNotSupportedException("Your schema is not a Schema");
            }
        } else if (name.equals(autoDetectorProperty)) {
            if (value instanceof AutoDetector) {
                theAutoDetector = (AutoDetector)value;
            } else {
                throw new SAXNotSupportedException("Your auto-detector is not an AutoDetector");
            }
        } else {
            throw new SAXNotRecognizedException("Unknown property " + name);
        }
    }
   
    public void setEntityResolver(EntityResolver resolver) {
        theEntityResolver = (resolver == null) ? this : resolver;
    }
   
    public EntityResolver getEntityResolver() {
        return (theEntityResolver == this) ? null : theEntityResolver;
    }
   
    public void setDTDHandler(DTDHandler handler) {
        theDTDHandler = (handler == null) ? this : handler;
    }
   
    public DTDHandler getDTDHandler() {
        return (theDTDHandler == this) ? null : theDTDHandler;
    }
   
    public void setContentHandler(ContentHandler handler) {
        theContentHandler = (handler == null) ? this : handler;
    }
   
    public ContentHandler getContentHandler() {
        return (theContentHandler == this) ? null : theContentHandler;
    }
   
    public void setErrorHandler(ErrorHandler handler) {
        theErrorHandler = (handler == null) ? this : handler;
    }
   
    public ErrorHandler getErrorHandler() {
        return (theErrorHandler == this) ? null : theErrorHandler;
    }
   
    public void parse(InputSource input) throws IOException, SAXException {
        setup();
        Reader r = getReader(input);
        theContentHandler.startDocument();
        theScanner.resetDocumentLocator(input.getPublicId(), input.getSystemId());
        if (theScanner instanceof Locator) {
            theContentHandler.setDocumentLocator((Locator)theScanner);
        }
        if (!(theSchema.getURI().equals("")))
            theContentHandler.startPrefixMapping(theSchema.getPrefix(),
                    theSchema.getURI());
        theScanner.scan(r, this);
    }
   
    public void parse(String systemid) throws IOException, SAXException {
        parse(new InputSource(systemid));
    }
   
    // Sets up instance variables that haven't been set by setFeature
    private void setup() {
        if (theSchema == null) theSchema = new HTMLSchema();
        if (theScanner == null) theScanner = new HTMLScanner();
        if (theAutoDetector == null) {
            // BEGIN PATCH Setup an AutoDetector which actually does something
                        /*
                        theAutoDetector = new AutoDetector() {
                                public Reader autoDetectingReader(InputStream i) {
                                        return new InputStreamReader(i);
                                        }
                                };
                         */
            theAutoDetector = new TagSoupAutoDetector();
            // END PATCH Setup an AutoDetector which actually does something
        }
        theStack = new Element(theSchema.getElementType("<root>"), defaultAttributes);
        thePCDATA = new Element(theSchema.getElementType("<pcdata>"), defaultAttributes);
        theNewElement = null;
        theAttributeName = null;
        thePITarget = null;
        theSaved = null;
        theEntity = 0;
        virginStack = true;
        doctypename = doctypepublicid = doctypesystemid = null;
    }
   
    // Return a Reader based on the contents of an InputSource
    // Buffer both the InputStream and the Reader
    private Reader getReader(InputSource s) throws SAXException, IOException {
        Reader r = s.getCharacterStream();
        InputStream i = s.getByteStream();
        String encoding = s.getEncoding();
        String publicid = s.getPublicId();
        String systemid = s.getSystemId();
        if (r == null) {
            if (i == null) i = getInputStream(publicid, systemid);
//      i = new BufferedInputStream(i);
            if (encoding == null) {
                r = theAutoDetector.autoDetectingReader(i);
            } else {
                try {
                    r = new InputStreamReader(i, encoding);
                } catch (UnsupportedEncodingException e) {
                    r = new InputStreamReader(i);
                }
            }
        }
//    r = new BufferedReader(r);
        return r;
    }
   
    // Get an InputStream based on a publicid and a systemid
    private InputStream getInputStream(String publicid, String systemid) throws IOException, SAXException {
        URL basis = new URL("file", "", System.getProperty("user.dir") + "/.");
        URL url = new URL(basis, systemid);
        URLConnection c = url.openConnection();
        return c.getInputStream();
    }
    // We don't process publicids (who uses them anyhow?)
   
    // ScanHandler implementation
    //
    // State shared between the scanner callbacks below; setup() resets all
    // of it at the start of each parse.

    // Element whose start-tag is being assembled: created by gi(), cleared
    // by etag_basic().
    private Element theNewElement = null;
    // Name of the attribute awaiting its value: set by aname(), consumed by
    // aval() or adup().
    private String theAttributeName = null;
    // Details of the DOCTYPE declaration, recorded by decl().
    private String doctypepublicid = null;
    private String doctypesystemid = null;
    private String doctypename = null;
    // Target of the processing instruction in progress: set by pitarget(),
    // cleared by pi().
    private String thePITarget = null;
    // Top of the stack of open elements, linked through Element.next().
    private Element theStack = null;
    // Chain of elements popped by restartablyPop() that restart() may reopen.
    private Element theSaved = null;
    // Pseudo-element of the schema's "<pcdata>" type, used when placing text.
    private Element thePCDATA = null;
    // Result of the most recent entity() lookup; 0 when the name was empty.
    private char theEntity = 0;
   
    public void adup(char[] buff, int offset, int length) throws SAXException {
        if (theNewElement == null || theAttributeName == null) return;
        theNewElement.setAttribute(theAttributeName, null, theAttributeName);
        theAttributeName = null;
    }
   
    public void aname(char[] buff, int offset, int length) throws SAXException {
        if (theNewElement == null) return;
        theAttributeName = makeName(buff, offset, length);
//    System.err.println("%% Attribute name " + theAttributeName);
    }
   
    public void aval(char[] buff, int offset, int length) throws SAXException {
        if (theNewElement == null || theAttributeName == null) return;
        String value = new String(buff, offset, length);
//    System.err.println("%% Attribute value [" + value + "]");
        theNewElement.setAttribute(theAttributeName, null, value);
        theAttributeName = null;
//    System.err.println("%% Aval done");
    }
   
    public void entity(char[] buff, int offset, int length) throws SAXException {
        if (length < 1) {
            theEntity = 0;
            return;
        }
//    System.err.println("%% Entity at " + offset + " " + length);
        String name = new String(buff, offset, length);
//    System.err.println("%% Got entity [" + name + "]");
        theEntity = theSchema.getEntity(name);
    }
   
    public void eof(char[] buff, int offset, int length) throws SAXException {
        if (virginStack) rectify(thePCDATA);
        while (theStack.next() != null) {
            pop();
        }
        if (!(theSchema.getURI().equals("")))
            theContentHandler.endPrefixMapping(theSchema.getPrefix());
        theContentHandler.endDocument();
    }
   
    public void etag(char[] buff, int offset, int length) throws SAXException {
        if (etag_cdata(buff, offset, length)) return;
        etag_basic(buff, offset, length);
    }
   
    private static char[] etagchars = {'<', '/', '>'};
    public boolean etag_cdata(char[] buff, int offset, int length) throws SAXException {
        String currentName = theStack.name();
        // If this is a CDATA element and the tag doesn't match,
        // or isn't properly formed (junk after the name),
        // restart CDATA mode and process the tag as characters.
        if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) {
            boolean realTag = (length == currentName.length());
            if (realTag) {
                for (int i = 0; i < length; i++) {
                    if (Character.toLowerCase(buff[offset + i]) != Character.toLowerCase(currentName.charAt(i))) {
                        realTag = false;
                        break;
                    }
                }
            }
            if (!realTag) {
                theContentHandler.characters(etagchars, 0, 2);
                theContentHandler.characters(buff, offset, length);
                theContentHandler.characters(etagchars, 2, 1);
                theScanner.startCDATA();
                return true;
            }
        }
        return false;
    }
   
    public void etag_basic(char[] buff, int offset, int length) throws SAXException {
        theNewElement = null;
        String name;
        if (length != 0) name = makeName(buff, offset, length);
        else name = theStack.name();
//    System.err.println("%% Got end of " + name);
       
        Element sp;
        boolean inNoforce = false;
        for (sp = theStack; sp != null; sp = sp.next()) {
            if (sp.name().equals(name)) break;
            if ((sp.flags() & Schema.F_NOFORCE) != 0) inNoforce = true;
        }
       
        if (sp == null) return;    // Ignore unknown etags
        if (sp.next() == null || sp.next().next() == null) return;
        if (inNoforce) {    // inside an F_NOFORCE element?
            sp.preclose();    // preclose the matching element
        } else {      // restartably pop everything above us
            while (theStack != sp) {
                restartablyPop();
            }
            pop();
        }
        // pop any preclosed elements now at the top
        while (theStack.isPreclosed()) {
            pop();
        }
        restart(null);
    }
   
    // Push restartables on the stack if possible
    // e is the next element to be started, if we know what it is
    private void restart(Element e) throws SAXException {
        while (theSaved != null && theStack.canContain(theSaved) &&
                (e == null || theSaved.canContain(e))) {
            Element next = theSaved.next();
            push(theSaved);
            theSaved = next;
        }
    }
   
    // Pop the stack irrevocably
    private void pop() throws SAXException {
        if (theStack == null) return;    // empty stack
        String name = theStack.name();
        String localName = theStack.localName();
        String namespace = theStack.namespace();
//    System.err.println("%% Popping " + name);
        if (!namespaces) namespace = localName = "";
        theContentHandler.endElement(namespace, localName, name);
        theStack = theStack.next();
    }
   
    // Pop the stack restartably
    private void restartablyPop() throws SAXException {
        Element popped = theStack;
        pop();
        if (restartElements && (popped.flags() & Schema.F_RESTART) != 0) {
            popped.anonymize();
            popped.setNext(theSaved);
            theSaved = popped;
        }
    }
   
    // Push element onto stack
    private boolean virginStack = true;
    private void push(Element e) throws SAXException {
        String name = e.name();
        String localName = e.localName();
        String namespace = e.namespace();
//    System.err.println("%% Pushing " + name);
        e.clean();
        if (!namespaces) namespace = localName = "";
        if (virginStack && localName.equalsIgnoreCase(doctypename)) {
            try {
                theEntityResolver.resolveEntity(doctypepublicid, doctypesystemid);
            } catch (IOException ew) { }   // Can't be thrown for root I believe.
        }
        theContentHandler.startElement(namespace, localName, name, e.atts());
        e.setNext(theStack);
        theStack = e;
        virginStack = false;
        if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) {
            theScanner.startCDATA();
        }
    }
   
    /**
     * Parsing the complete XML Document Type Definition is way too complex,
     * but for many simple cases we can extract something useful from it.
     *
     * doctypedecl  ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
     *  DeclSep     ::= PEReference | S
     *  intSubset   ::= (markupdecl | DeclSep)*
     *  markupdecl  ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
     *  ExternalID  ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
     */
    public void decl(char[] buff, int offset, int length) throws SAXException {
        String s = new String(buff, offset, length);
        String name = null;
        String systemid = null;
        String publicid = null;
        String[] v = split(s);
        if (v.length > 0 && "DOCTYPE".equals(v[0])) {
            if (v.length > 1) {
                name = v[1];
                if (v.length>3 && "SYSTEM".equals(v[2])) {
                    systemid = v[3];
                } else if (v.length > 3 && "PUBLIC".equals(v[2])) {
                    publicid = v[3];
                    if (v.length > 4) {
                        systemid = v[4];
                    } else {
                        systemid = "";
                    }
                }
            }
        }
        publicid = trimquotes(publicid);
        systemid = trimquotes(systemid);
        if (name != null) {
            publicid = cleanPublicid(publicid);
            theLexicalHandler.startDTD(name, publicid, systemid);
            theLexicalHandler.endDTD();
            doctypename = name;
            doctypepublicid = publicid;
            if (theScanner instanceof Locator) {    // Must resolve systemid
                doctypesystemid  = ((Locator)theScanner).getSystemId();
                try {
                    doctypesystemid = new URL(new URL(doctypesystemid), systemid).toString();
                } catch (Exception e) {}
            }
        }
    }
   
    // If the String is quoted, trim the quotes.
    private static String trimquotes(String in) {
        if (in == null) return in;
        int length = in.length();
        if (length == 0) return in;
        char s = in.charAt(0);
        char e = in.charAt(length - 1);
        if (s == e && (s == '\'' || s == '"')) {
            in = in.substring(1, in.length() - 1);
        }
        return in;
    }
   
    // Split the supplied String into words or phrases seperated by spaces.
    // Recognises quotes around a phrase and doesn't split it.
    private static String[] split(String val) throws IllegalArgumentException {
        val = val.trim();
        if (val.length() == 0) {
            return new String[0];
        } else {
            ArrayList l = new ArrayList();
            int s = 0;
            int e = 0;
            boolean sq = false// single quote
            boolean dq = false// double quote
            char lastc = 0;
            int len = val.length();
            for (e=0; e < len; e++) {
                char c = val.charAt(e);
                if (!dq && c == '\'' && lastc != '\\') {
                    sq = !sq;
                    if (s < 0) s = e;
                } else if (!sq && c == '\"' && lastc != '\\') {
                    dq = !dq;
                    if (s < 0) s = e;
                } else if (!sq && !dq) {
                    if (Character.isWhitespace(c)) {
                        if (s >= 0) l.add(val.substring(s, e));
                        s = -1;
                    } else if (s < 0 && c != ' ') {
                        s = e;
                    }
                }
                lastc = c;
            }
            l.add(val.substring(s, e));
            return (String[])l.toArray(new String[0]);
        }
    }
   
    // Replace junk in publicids with spaces
    private static String legal =
            "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-'()+,./:=?;!*#@$_%";
   
    private String cleanPublicid(String src) {
        if (src == null) return null;
        int len = src.length();
        StringBuffer dst = new StringBuffer(len);
        boolean suppressSpace = true;
        for (int i = 0; i < len; i++) {
            char ch = src.charAt(i);
            if (legal.indexOf(ch) != -1) {   // legal but not whitespace
                dst.append(ch);
                suppressSpace = false;
            } else if (suppressSpace) {  // normalizable whitespace or junk
                ;
            } else {
                dst.append(' ');
                suppressSpace = true;
            }
        }
//    System.err.println("%% Publicid [" + dst.toString().trim() + "]");
        return dst.toString().trim()// trim any final junk whitespace
    }
   
   
    public void gi(char[] buff, int offset, int length) throws SAXException {
        if (theNewElement != null) return;
        String name = makeName(buff, offset, length);
        if (name == null) return;
        ElementType type = theSchema.getElementType(name);
        if (type == null) {
            // Suppress unknown elements if ignore-bogons is on
            if (ignoreBogons) return;
            theSchema.elementType(name, bogonsEmpty ? Schema.M_EMPTY : Schema.M_ANY, Schema.M_ANY, 0);
            type = theSchema.getElementType(name);
        }
       
        theNewElement = new Element(type, defaultAttributes);
//    System.err.println("%% Got GI " + theNewElement.name());
    }
   
    /**
     * ScanHandler callback for a CDATA section: reports its content through
     * pcdata(), bracketed by the lexical handler's startCDATA/endCDATA.
     * If pcdata() throws, endCDATA is not reported.
     **/
    public void cdsect(char[] buff, int offset, int length) throws SAXException {
        theLexicalHandler.startCDATA();
        pcdata(buff, offset, length);
        theLexicalHandler.endCDATA();
    }
    public void pcdata(char[] buff, int offset, int length) throws SAXException {
        if (length == 0) return;
        boolean allWhite = true;
        for (int i = 0; i < length; i++) {
            if (!Character.isWhitespace(buff[offset+i])) {
                allWhite = false;
            }
        }
        if (allWhite && !theStack.canContain(thePCDATA)) {
            if (ignorableWhitespace) {
                theContentHandler.ignorableWhitespace(buff, offset, length);
            }
        } else {
            rectify(thePCDATA);
            theContentHandler.characters(buff, offset, length);
        }
    }
   
    public void pitarget(char[] buff, int offset, int length) throws SAXException {
        if (theNewElement != null) return;
        thePITarget = makeName(buff, offset, length);
    }
   
    public void pi(char[] buff, int offset, int length) throws SAXException {
        if (theNewElement != null || thePITarget == null) return;
        if (thePITarget.toLowerCase().equals("xml")) return;
//    if (length > 0 && buff[length - 1] == '?') System.out.println("%% Removing ? from PI");
        if (length > 0 && buff[length - 1] == '?') length--;  // remove trailing ?
        theContentHandler.processingInstruction(thePITarget,
                new String(buff, offset, length));
        thePITarget = null;
    }
   
    public void stagc(char[] buff, int offset, int length) throws SAXException {
//    System.err.println("%% Start-tag");
        if (theNewElement == null) return;
        rectify(theNewElement);
        if (theStack.model() == Schema.M_EMPTY) {
            // Force an immediate end tag
            etag_basic(buff, offset, length);
        }
    }
   
    public void stage(char[] buff, int offset, int length) throws SAXException {
//    System.err.println("%% Empty-tag");
        if (theNewElement == null) return;
        rectify(theNewElement);
        // Force an immediate end tag
        etag_basic(buff, offset, length);
    }
   
    // Comment buffer is twice the size of the output buffer
    private char[] theCommentBuffer = new char[2000];
    public void cmnt(char[] buff, int offset, int length) throws SAXException {
        theLexicalHandler.comment(buff, offset, length);
    }
   
    // Rectify the stack, pushing and popping as needed
    // so that the argument can be safely pushed.
    //
    // Works in three phases:
    //   1. Find a stack entry that can contain e, wrapping e in newly created
    //      parent elements (taken from e.parent()) until one fits.
    //   2. Pop the stack down towards that entry.
    //   3. Push e and the chain of elements linked behind it.
    // On success theNewElement is cleared.  If no suitable container is
    // found the method returns early and changes neither the stack nor
    // theNewElement.
    private void rectify(Element e) throws SAXException {
        Element sp;
        while (true) {
            // Walk from the stack top through next() looking for a container.
            for (sp = theStack; sp != null; sp = sp.next()) {
                if (sp.canContain(e)) break;
            }
            if (sp != null) break;
            // Nothing on the stack accepts e: try again with its parent type.
            ElementType parentType = e.parent();
            if (parentType == null) break;
            Element parent = new Element(parentType, defaultAttributes);
//      System.err.println("%% Ascending from " + e.name() + " to " + parent.name());
            // Link e behind the new parent so phase 3 pushes parent first.
            parent.setNext(e);
            e = parent;
        }
        if (sp == null) return;    // don't know what to do
        while (theStack != sp) {
            // Stop popping once fewer than three entries are reachable from
            // the top.  NOTE(review): presumably this keeps the outermost
            // elements on the stack -- confirm against push()/pop().
            if (theStack == null || theStack.next() == null ||
                    theStack.next().next() == null) break;
            restartablyPop();
        }
        while (e != null) {
            // Read the successor before pushing e.
            Element nexte = e.next();
            // The element named "<pcdata>" is never pushed itself.
            if (!e.name().equals("<pcdata>")) push(e);
            e = nexte;
            // restart() receives the NEXT element of the chain, which is
            // null on the final iteration.
            restart(e);
        }
        theNewElement = null;
    }
   
    public char getEntity() {
        return theEntity;
    }
   
    // Return the argument as a valid XML name, lowercased
    private String makeName(char[] buff, int offset, int length) {
        StringBuffer dst = new StringBuffer(length + 2);
        boolean seenColon = false;
        boolean start = true;
//    String src = new String(buff, offset, length); // DEBUG
        for (; length-- > 0; offset++) {
            char ch = Character.toLowerCase(buff[offset]);
            if (Character.isLetter(ch) || ch == '_') {
                start = false;
                dst.append(ch);
            } else if (Character.isDigit(ch) || ch == '-' || ch == '.') {
                if (start) dst.append('_');
                start = false;
                dst.append(ch);
            } else if (ch == ':' && !seenColon) {
                seenColon = true;
                if (start) dst.append('_');
                start = true;
                dst.append(translateColons ? '_' : ch);
            }
        }
        int dstLength = dst.length();
        if (dstLength == 0 || dst.charAt(dstLength - 1) == ':') dst.append('_');
//    System.err.println("Made name \"" + dst + "\" from \"" + src + "\"");
        return dst.toString().intern();
    }
   
    // Default LexicalHandler implementation
   
    public void comment(char[] ch, int start, int length) throws SAXException { }
    public void endCDATA() throws SAXException { }
    public void endDTD() throws SAXException { }
    public void endEntity(String name) throws SAXException { }
    public void startCDATA() throws SAXException { }
    public void startDTD(String name, String publicid, String systemid) throws SAXException { }
    public void startEntity(String name) throws SAXException { }
   
}
TOP

Related Classes of org.ccil.cowan.tagsoup.Parser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.