Source Code of nokogiri.HtmlDocument

/**
 * (The MIT License)
 *
 * Copyright (c) 2008 - 2012:
 *
 * * {Aaron Patterson}[http://tenderlovemaking.com]
 * * {Mike Dalessio}[http://mike.daless.io]
 * * {Charles Nutter}[http://blog.headius.com]
 * * {Sergio Arbeo}[http://www.serabe.com]
 * * {Patrick Mahoney}[http://polycrystal.org]
 * * {Yoko Harada}[http://yokolet.blogspot.com]
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * 'Software'), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 * 
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */


package nokogiri;


import nokogiri.internals.HtmlDomParserContext;
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.javasupport.util.RuntimeHelpers;
import org.jruby.runtime.Arity;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;


/**
 * Class for Nokogiri::HTML::Document.
 *
 * @author sergio
 * @author Yoko Harada <yokolet@gmail.com>
 */
@JRubyClass(name="Nokogiri::HTML::Document", parent="Nokogiri::XML::Document")
public class HtmlDocument extends XmlDocument {
    private static final String DEFAULT_CONTENT_TYPE = "html";
    private static final String DEFAULT_PUBLIC_ID = "-//W3C//DTD HTML 4.01//EN";
    private static final String DEFAULT_SYTEM_ID = "http://www.w3.org/TR/html4/strict.dtd";


    private String parsed_encoding = null;


    public HtmlDocument(Ruby ruby, RubyClass klazz) {
        super(ruby, klazz);
    }
    
    public HtmlDocument(Ruby ruby, RubyClass klazz, Document doc) {
        super(ruby, klazz, doc);
    }


    @JRubyMethod(name="new", meta = true, rest = true, required=0)
    public static IRubyObject rbNew(ThreadContext context, IRubyObject klazz,
                                    IRubyObject[] args) {
        HtmlDocument htmlDocument = null;
        try {
            Document docNode = createNewDocument();
            htmlDocument = (HtmlDocument) NokogiriService.HTML_DOCUMENT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass) klazz);
            htmlDocument.setDocumentNode(context, docNode);
        } catch (Exception ex) {
            throw context.getRuntime().newRuntimeError("couldn't create document: "+ex.toString());
        }


        RuntimeHelpers.invoke(context, htmlDocument, "initialize", args);


        return htmlDocument;
    }


    public IRubyObject getInternalSubset(ThreadContext context) {
        IRubyObject internalSubset = super.getInternalSubset(context);


        // html documents are expected to have a default internal subset
        // the default values are the same ones used when the following
        // feature is turned on
        // "http://cyberneko.org/html/features/insert-doctype"
        // the reason we don't turn it on, is because it overrides the document's
        // declared doctype declaration.


        if (internalSubset.isNil()) {
            internalSubset = XmlDtd.newEmpty(context.getRuntime(),
                                             getDocument(),
                                             context.getRuntime().newString(DEFAULT_CONTENT_TYPE),
                                             context.getRuntime().newString(DEFAULT_PUBLIC_ID),
                                             context.getRuntime().newString(DEFAULT_SYTEM_ID));
            setInternalSubset(internalSubset);
        }


        return internalSubset;
    }


    public static IRubyObject do_parse(ThreadContext context,
                                       IRubyObject klass,
                                       IRubyObject[] args) {
        Ruby ruby = context.getRuntime();
        Arity.checkArgumentCount(ruby, args, 4, 4);
        HtmlDomParserContext ctx =
            new HtmlDomParserContext(ruby, args[2], args[3]);
        ctx.setInputSource(context, args[0], args[1]);
        return ctx.parse(context, klass, args[1]);
    }
    
    public void setDocumentNode(ThreadContext context, Node node) {
        super.setNode(context, node);
        Ruby runtime = context.getRuntime();
        if (node != null) {
            Document document = (Document)node;
            document.normalize();
            stabilzeAttrValue(document.getDocumentElement());
        }
        setInstanceVariable("@decorators", runtime.getNil());
    }
    
    private void stabilzeAttrValue(Node node) {
        if (node == null) return;
        if (node.hasAttributes()) {
            NamedNodeMap nodeMap = node.getAttributes();
            for (int i=0; i<nodeMap.getLength(); i++) {
                Node n = nodeMap.item(i);
                if (n instanceof Attr) {
                    Attr attr = (Attr)n;
                    String attrName = attr.getName();
                    // not sure, but need to get value always before document is referred.
                    // or lose attribute value
                    String attrValue = attr.getValue(); // don't delete this line
                }
            }
        }
        NodeList children = node.getChildNodes();
        for (int i=0; i<children.getLength(); i++) {
            stabilzeAttrValue(children.item(i));
        }
    }
    
    public void setParsedEncoding(String encoding) {
        parsed_encoding = encoding;
    }
    
    public String getPraedEncoding() {
        return parsed_encoding;
    }


    /*
     * call-seq:
     *  read_io(io, url, encoding, options)
     *
     * Read the HTML document from +io+ with given +url+, +encoding+,
     * and +options+.  See Nokogiri::HTML.parse
     */
    @JRubyMethod(meta = true, rest = true)
    public static IRubyObject read_io(ThreadContext context,
                                      IRubyObject cls,
                                      IRubyObject[] args) {
        return do_parse(context, cls, args);
    }


    /*
     * call-seq:
     *  read_memory(string, url, encoding, options)
     *
     * Read the HTML document contained in +string+ with given +url+, +encoding+,
     * and +options+.  See Nokogiri::HTML.parse
     */
    @JRubyMethod(meta = true, rest = true)
    public static IRubyObject read_memory(ThreadContext context,
                                          IRubyObject cls,
                                          IRubyObject[] args) {
        return do_parse(context, cls, args);
    }
}
Source Code of nokogiri.HtmlDocument

Related Classes of nokogiri.HtmlDocument