Package nokogiri

Source Code of nokogiri.HtmlSaxParserContext

/**
* (The MIT License)
*
* Copyright (c) 2008 - 2011:
*
* * {Aaron Patterson}[http://tenderlovemaking.com]
* * {Mike Dalessio}[http://mike.daless.io]
* * {Charles Nutter}[http://blog.headius.com]
* * {Sergio Arbeo}[http://www.serabe.com]
* * {Patrick Mahoney}[http://polycrystal.org]
* * {Yoko Harada}[http://yokolet.blogspot.com]
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* 'Software'), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package nokogiri;

import static nokogiri.internals.NokogiriHelpers.rubyStringToString;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.EnumSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import nokogiri.internals.NokogiriHandler;

import org.apache.xerces.parsers.AbstractSAXParser;
import org.cyberneko.html.parsers.SAXParser;
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyFixnum;
import org.jruby.RubyString;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.xml.sax.SAXException;

/**
* Class for Nokogiri::HTML::SAX::ParserContext.
*
* @author serabe
* @author Patrick Mahoney <pat@polycrystal.org>
* @author Yoko Harada <yokolet@gmail.com>
*/

@JRubyClass(name="Nokogiri::HTML::SAX::ParserContext", parent="Nokogiri::XML::SAX::ParserContext")
public class HtmlSaxParserContext extends XmlSaxParserContext {

    public HtmlSaxParserContext(Ruby ruby, RubyClass rubyClass) {
        super(ruby, rubyClass);
    }
   
    @Override
    protected AbstractSAXParser createParser() throws SAXException {
        SAXParser parser = new SAXParser();

        try{
            parser.setProperty(
                "http://cyberneko.org/html/properties/names/elems", "lower");
            parser.setProperty(
                "http://cyberneko.org/html/properties/names/attrs", "lower");
            return parser;
        } catch(SAXException ex) {
            throw new SAXException(
                "Problem while creating HTML SAX Parser: " + ex.toString());
        }
    }

    @JRubyMethod(name="memory", meta=true)
    public static IRubyObject parse_memory(ThreadContext context,
                                           IRubyObject klazz,
                                           IRubyObject data,
                                           IRubyObject encoding) {
        HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass)klazz);
        ctx.initialize(context.getRuntime());
        String javaEncoding = findEncoding(context, encoding);
        if (javaEncoding != null) {
            String input = applyEncoding(rubyStringToString(data), javaEncoding);
            ByteArrayInputStream istream = new ByteArrayInputStream(input.getBytes());
            ctx.setInputSource(istream);
            ctx.getInputSource().setEncoding(javaEncoding);
        }
        return ctx;
    }
   
    public static enum EncodingType {
        NONE(0, "NONE"),
        UTF_8(1, "UTF-8"),
        UTF16LE(2, "UTF16LE"),
        UTF16BE(3, "UTF16BE"),
        UCS4LE(4, "UCS4LE"),
        UCS4BE(5, "UCS4BE"),
        EBCDIC(6, "EBCDIC"),
        UCS4_2143(7, "ICS4-2143"),
        UCS4_3412(8, "UCS4-3412"),
        UCS2(9, "UCS2"),
        ISO_8859_1(10, "ISO-8859-1"),
        ISO_8859_2(11, "ISO-8859-2"),
        ISO_8859_3(12, "ISO-8859-3"),
        ISO_8859_4(13, "ISO-8859-4"),
        ISO_8859_5(14, "ISO-8859-5"),
        ISO_8859_6(15, "ISO-8859-6"),
        ISO_8859_7(16, "ISO-8859-7"),
        ISO_8859_8(17, "ISO-8859-8"),
        ISO_8859_9(18, "ISO-8859-9"),
        ISO_2022_JP(19, "ISO-2022-JP"),
        SHIFT_JIS(20, "SHIFT-JIS"),
        EUC_JP(21, "EUC-JP"),
        ASCII(22, "ASCII");
       
        private final int value;
        private final String name;
        EncodingType(int value, String name) {
            this.value = value;
            this.name = name;
        }
       
        public int getValue() {
            return value;
        }
       
        public String toString() {
            return name;
        }
    }
   
    private static String findName(int value) {
        EnumSet<EncodingType> set = EnumSet.allOf(EncodingType.class);
        for (EncodingType type : set) {
            if (type.getValue() == value) return type.toString();
        }
        return null;
    }
   
    private static String findEncoding(ThreadContext context, IRubyObject encoding) {
        String rubyEncoding = null;
        if (encoding instanceof RubyString) {
            rubyEncoding = rubyStringToString(encoding);
        } else if (encoding instanceof RubyFixnum) {
            int value = (Integer)encoding.toJava(Integer.class);
            rubyEncoding = findName(value);
        }
        if (rubyEncoding == null) return null;
        try {
            Charset charset = Charset.forName(rubyEncoding);
            return charset.displayName();
        } catch (IllegalCharsetNameException e) {
            throw context.getRuntime().newEncodingCompatibilityError(
                    rubyEncoding + "is not supported in Java.");
        } catch (IllegalArgumentException e) {
            throw context.getRuntime().newInvalidEncoding(
                    "encoding should not be nil");
        }
    }
   
    private static String applyEncoding(String input, String enc) {
        String str = input.toLowerCase();
        int start_pos = 0;
        int end_pos = 0;
        if (input.contains("meta") && input.contains("charset")) {
            Pattern p = Pattern.compile("charset(()|\\s)=(()|\\s)([a-z]|-|_|\\d)+");
            Matcher m = p.matcher(str);
            while (m.find()) {
                start_pos = m.start();
                end_pos = m.end();
            }
        }
        if (start_pos != end_pos) {
            String substr = input.substring(start_pos, end_pos);
            input = input.replace(substr, "charset=" + enc);
        }
        return input;
    }

    @JRubyMethod(name="file", meta=true)
    public static IRubyObject parse_file(ThreadContext context,
                                         IRubyObject klazz,
                                         IRubyObject data,
                                         IRubyObject encoding) {
        HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass)klazz);
        ctx.initialize(context.getRuntime());
        ctx.setInputSourceFile(context, data);
        String javaEncoding = findEncoding(context, encoding);
        if (javaEncoding != null) {
            ctx.getInputSource().setEncoding(javaEncoding);
        }
        return ctx;
    }

    @JRubyMethod(name="io", meta=true)
    public static IRubyObject parse_io(ThreadContext context,
                                       IRubyObject klazz,
                                       IRubyObject data,
                                       IRubyObject encoding) {
        HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass)klazz);
        ctx.initialize(context.getRuntime());
        ctx.setInputSource(context, data, context.getRuntime().getNil());
        String javaEncoding = findEncoding(context, encoding);
        if (javaEncoding != null) {
            ctx.getInputSource().setEncoding(javaEncoding);
        }
        return ctx;
    }

    /**
     * Create a new parser context that will read from a raw input
     * stream. Not a JRuby method.  Meant to be run in a separate
     * thread by XmlSaxPushParser.
     */
    public static IRubyObject parse_stream(ThreadContext context,
                                           IRubyObject klazz,
                                           InputStream stream) {
        HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass)klazz);
        ctx.initialize(context.getRuntime());
        ctx.setInputSource(stream);
        return ctx;
    }

    @Override
    protected void preParse(ThreadContext context,
                             IRubyObject handlerRuby,
                             NokogiriHandler handler) {
        // final String path = "Nokogiri::XML::FragmentHandler";
        // final String docFrag =
        //     "http://cyberneko.org/html/features/balance-tags/document-fragment";
        // RubyObjectAdapter adapter = JavaEmbedUtils.newObjectAdapter();
        // IRubyObject doc = adapter.getInstanceVariable(handlerRuby, "@document");
        // RubyModule mod =
        //     context.getRuntime().getClassFromPath(path);
        // try {
        //     if (doc != null && !doc.isNil() && adapter.isKindOf(doc, mod))
        //         parser.setFeature(docFrag, true);
        // } catch (Exception e) {
        //     // ignore
        // }
    }

}
TOP

Related Classes of nokogiri.HtmlSaxParserContext

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.