Examples of org.htmlparser.Parser

org.htmlparser.Parser

ahoo.com",new DefaultHTMLParserFeedback()); // In this example, we are registering all the common scanners parser.registerScanners(); for (NodeIterator i = parser.elements();i.hasMoreNodes();) { Node node = i.nextNode(); node.print(); } Below is some sample code to parse Yahoo.com and print only the text information. This scanning will run faster, as there are no scanners registered here.

 Parser parser = new Parser("http://www.yahoo.com",new DefaultHTMLParserFeedback()); // In this example, none of the scanners need to be registered // as a string node is not a tag to be scanned for. for (NodeIterator i = parser.elements();i.hasMoreNodes();) { Node node = i.nextNode(); if (node instanceof StringNode) { StringNode stringNode = (StringNode)node; System.out.println(stringNode.getText()); } }

The above snippet will print out only the text contents in the html document.
Here's another snippet that will only print out the link urls in a document. This is an example of adding a link scanner.

 Parser parser = new Parser("http://www.yahoo.com",new DefaultHTMLParserFeedback()); parser.addScanner(new LinkScanner("-l")); for (NodeIterator i = parser.elements();i.hasMoreNodes();) { Node node = i.nextNode(); if (node instanceof LinkTag) { LinkTag linkTag = (LinkTag)node; System.out.println(linkTag.getLink()); } }

@see Parser#elements()

     * header by a server-side web application.
     * Nonetheless, it would be nice to handle this case.
     */
    public void testSingleQuotedCharset() throws ParserException
    {
        Parser parser;
        String url =
            "http://htmlparser.sourceforge.net/test/SinglequotedCharset.html";


        parser = new Parser(url);
        for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
            e.nextNode();
        assertTrue("Wrong encoding", parser.getEncoding().equals("UTF-8"));
    }

View Full Code Here

     */
    public void testCommaListCharset() throws ParserException
    {
        URL url;
        URLConnection connection;
        Parser parser;
        String idiots = "http://users.aol.com/geinster/rej.htm";


        try
        {
            url = new URL(idiots);
            connection = url.openConnection();
            // this little subclass just gets around normal JDK 1.4 processing
            // that filters out bogus character sets
            parser = new Parser()
            {
                protected String getCharset(String content)
                {
                    int index;
                    String ret;


                    ret = DEFAULT_CHARSET;
                    if (null != content)
                    {
                        index = content.indexOf(CHARSET_STRING);


                        if (index != -1)
                        {
                            content =
                                content
                                    .substring(index + CHARSET_STRING.length())
                                    .trim();
                            if (content.startsWith("="))
                            {
                                content = content.substring(1).trim();
                                index = content.indexOf(";");
                                if (index != -1)
                                    content = content.substring(0, index);


                                //remove any double quotes from around charset string
                                if (content.startsWith("\"")
                                    && content.endsWith("\"")
                                    && (1 < content.length()))
                                    content =
                                        content.substring(
                                            1,
                                            content.length() - 1);


                                //remove any single quote from around charset string
                                if (content.startsWith("'")
                                    && content.endsWith("'")
                                    && (1 < content.length()))
                                    content =
                                        content.substring(
                                            1,
                                            content.length() - 1);


                                ret = content;
                                // short circuit findCharset() processing
                            }
                        }
                    }


                    return (ret);
                }
            };
            parser.setConnection(connection);
            // must be the default
            assertTrue(
                "Wrong encoding",
                parser.getEncoding().equals("ISO-8859-1"));
            for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
                e.nextNode();
            assertTrue(
                "Wrong encoding",
                parser.getEncoding().equals("windows-1252"));
        }
        catch (Exception e)
        {
            fail(e.getMessage());
        }

View Full Code Here

        }
    }


    public void testNullUrl()
    {
        Parser parser;
        try
        {
            parser =
                new Parser("http://someoneexisting.com", Parser.noFeedback);
            assertTrue("Should have thrown an exception!", false);
        }
        catch (ParserException e)
        {

View Full Code Here

        }
    }


    public void testURLWithSpaces() throws ParserException
    {
        Parser parser;
        String url =
            "http://htmlparser.sourceforge.net/test/This is a Test Page.html";


        parser = new Parser(url);
        Node node[] = new Node[30];
        int i = 0;
        for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
        {
            node[i] = e.nextNode();
            i++;


        }

View Full Code Here

{
    FormTag formTag;
    Vector formChildren;
    public void setUp() throws Exception
    {
        Parser parser = Parser.createParser(FormScannerTest.FORM_HTML);
        parser.registerScanners();
        NodeIterator e = parser.elements();
        Node node = e.nextNode();
        formTag = (FormTag) node;
        formChildren = new Vector();
        for (SimpleNodeIterator se = formTag.children(); se.hasMoreNodes();)
        {

View Full Code Here

     * with the standard scanners registered.
     */
    public Generate() throws ParserException
    {
        parser =
            new Parser("http://www.w3.org/TR/REC-html40/sgml/entities.html");
        parser.registerScanners();
    }

View Full Code Here

    protected void createParser(String inputHTML)
    {
        String testHTML = new String(inputHTML);
        StringReader sr = new StringReader(testHTML);
        reader = new NodeReader(new BufferedReader(sr), 5000);
        parser = new Parser(reader, new DefaultParserFeedback());
        node = new Node[40];
    }

View Full Code Here

    protected void createParser(String inputHTML, int numNodes)
    {
        String testHTML = new String(inputHTML);
        StringReader sr = new StringReader(testHTML);
        reader = new NodeReader(new BufferedReader(sr), 5000);
        parser = new Parser(reader, new DefaultParserFeedback());
        node = new Node[numNodes];
    }

View Full Code Here

    protected void createParser(String inputHTML, String url)
    {
        String testHTML = new String(inputHTML);
        StringReader sr = new StringReader(testHTML);
        reader = new NodeReader(new BufferedReader(sr), url);
        parser = new Parser(reader, new DefaultParserFeedback());
        node = new Node[40];
    }

View Full Code Here

    protected void createParser(String inputHTML, String url, int numNodes)
    {
        String testHTML = new String(inputHTML);
        StringReader sr = new StringReader(testHTML);
        reader = new NodeReader(new BufferedReader(sr), url);
        parser = new Parser(reader, new DefaultParserFeedback());
        node = new Node[numNodes];
    }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.htmlparser.Parser

cn.edu.pku.dr.requirement.elicitation.tools.HtmlTransformer

com.gnizr.core.util.FormatUtil

com.knowgate.hipermail.DBMimeMessage

com.knowgate.hipermail.HtmlMimeBodyPart

com.lanyuan.util.HttpClientUtils

com.vgo.movie.thread.DetailFilmThread

com.waxayaz.TomcatMI.core.utils.repoManager.TomcatRepositoryManager

com.wordpress.util.StringUtil

fitnesse.fixtures.PageDriver

fitnesse.slim.converters.MapEditor

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.