Examples of org.htmlparser.lexer.Lexer

org.htmlparser.lexer.Lexer
This class parses the HTML stream into nodes. There are three major types of nodes (lexemes):
- Remark
- Text
- Tag
Each time nextNode() is called, the next node is returned; once the stream is exhausted, null is returned.

     */
    public Parser (URLConnection connection, ParserFeedback fb)
        throws
            ParserException
    {
        // Wrap the connection in a new Lexer and delegate to the two-argument
        // constructor that takes the lexer and the feedback object.
        this (new Lexer (connection), fb);
    }

View Full Code Here

                    html = true;
                break;
            }
        }
        if (html)
            setLexer (new Lexer (new Page (resource)));
        else
            setLexer (new Lexer (getConnectionManager ().openConnection (resource)));
    }

View Full Code Here

        throws
            ParserException
    {
        if (null == connection)
            throw new IllegalArgumentException ("connection cannot be null");
        setLexer (new Lexer (connection));
    }

View Full Code Here

            ParserException
    {
        if (null == inputHTML)
            throw new IllegalArgumentException ("html cannot be null");
        if (!"".equals (inputHTML))
            setLexer (new Lexer (new Page (inputHTML)));
    }

View Full Code Here

        createParser(response,10000);
        parseNodes();
    }


    protected void createParser(String inputHTML) {
        mLexer =  new Lexer (new Page (inputHTML));
        parser = new Parser(mLexer, new DefaultParserFeedback(DefaultParserFeedback.QUIET));
        node = new Node[40];
    }

View Full Code Here

        node = new Node[40];
    }


    protected void createParser(String inputHTML,int numNodes)
    {
        Lexer lexer = new Lexer (inputHTML);
        parser = new Parser (lexer, new DefaultParserFeedback(DefaultParserFeedback.QUIET));
        node = new Node[numNodes];
    }

View Full Code Here

        parser = new Parser (lexer, new DefaultParserFeedback(DefaultParserFeedback.QUIET));
        node = new Node[numNodes];
    }


    protected void createParser(String inputHTML, String url) {
        Lexer lexer = new Lexer (inputHTML);
        lexer.getPage ().setUrl (url);
        parser = new Parser (lexer, new DefaultParserFeedback(DefaultParserFeedback.QUIET));
        node = new Node[40];
    }

View Full Code Here

        parser = new Parser (lexer, new DefaultParserFeedback(DefaultParserFeedback.QUIET));
        node = new Node[40];
    }


    protected void createParser(String inputHTML, String url,int numNodes) {
        Lexer lexer = new Lexer (inputHTML);
        lexer.getPage ().setUrl (url);
        parser = new Parser (lexer, new DefaultParserFeedback(DefaultParserFeedback.QUIET));
        node = new Node[numNodes];
    }

View Full Code Here

    public static Parser createParserParsingAnInputString (String input)
        throws ParserException, UnsupportedEncodingException
    {
   
        Parser parser = new Parser();
        Lexer lexer = new Lexer();
        Page page = new Page(input);
        lexer.setPage(page);
        parser.setLexer(lexer);
        
        return parser;
        
    }

View Full Code Here

        String query;
        String terms;
        StringBuffer buffer;
        HttpURLConnection connection;
        URL url;
        Lexer lexer;
        URL[][] results;


        prefs = Preferences.userNodeForPackage (getClass ());
        query = prefs.get (GOOGLEQUERY, DEFAULTGOOGLEQUERY);
        try
        {
            query = (String)JOptionPane.showInputDialog (
                this,
                "Enter the search term:",
                "Search Google",
                JOptionPane.PLAIN_MESSAGE,
                null,
                null,
                query);
            if (null != query)
            {
                // replace spaces with +
                terms = query.replace (' ', '+');
                buffer = new StringBuffer (1024);
                buffer.append ("http://www.google.ca/search?");
                buffer.append ("q=");
                buffer.append (terms);
                buffer.append ("&ie=UTF-8");
                buffer.append ("&oe=UTF-8");
                buffer.append ("&hl=en");
                buffer.append ("&btnG=Google+Search");
                buffer.append ("&meta=");
                url = new URL (buffer.toString ());
                connection = (HttpURLConnection)url.openConnection ();
                if (USE_MOZILLA_HEADERS)
                {
                    // These are the Mozilla header fields:
                    //Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,text/css,*/*;q=0.1
                    //Accept-Language: en-us, en;q=0.50
                    //Connection: keep-alive
                    //Host: grc.com
                    //Referer: https://grc.com/x/ne.dll?bh0bkyd2
                    //User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2.1) Gecko/20030225
                    //Content-Length: 27
                    //Content-Type: application/x-www-form-urlencoded
                    //Accept-Encoding: gzip, deflate, compress;q=0.9
                    //Accept-Charset: ISO-8859-1, utf-8;q=0.66, *;q=0.66
                    //Keep-Alive: 300


                    connection.setRequestProperty ("Referer", "http://www.google.ca");
                    connection.setRequestProperty ("Accept", "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,text/css,*/*;q=0.1");
                    connection.setRequestProperty ("Accept-Language", "en-us, en;q=0.50");
                    connection.setRequestProperty ("User-Agent", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2.1) Gecko/20030225");
                    connection.setRequestProperty ("Accept-Charset", "ISO-8859-1, utf-8;q=0.66, *;q=0.66");
                }
                else
                {
                    // These are the IE header fields:
                    //Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*
                    //Accept-Language: en-ca
                    //Connection: Keep-Alive
                    //Host: grc.com
                    //User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; trieste; .NET CLR 1.1.4322; .NET CLR 1.0.3705)
                    //Content-Length: 32
                    //Content-Type: application/x-www-form-urlencoded
                    //Accept-Encoding: gzip, deflate
                    //Cache-Control: no-cache


                    connection.setRequestProperty ("Accept", "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*");
                    connection.setRequestProperty ("Accept-Language", "en-ca");
                    connection.setRequestProperty ("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; trieste; .NET CLR 1.1.4322; .NET CLR 1.0.3705)");
                }
                connection.setDoOutput (true);
                connection.setDoInput (true);
                connection.setUseCaches (false);
                lexer = new Lexer (connection);
                results = getThumbelina ().extractImageLinks (lexer, url);
                // add 'em
                getThumbelina ().reset ();
                // remove google links, not just append (results[1]);
                for (int i = 0; i < results[1].length; i++)

View Full Code Here

0 1 2 3 4 5 6

TOP

Related Classes of org.htmlparser.lexer.Lexer

com.apress.progwt.server.lucene.HTMLAnalyzer

com.apress.progwt.server.lucene.HTMLConverter

com.brewtab.ircbot.applets.TextsFromLastNightApplet

com.google.gdt.eclipse.designer.util.Utils

com.jeecms.cms.manager.assist.impl.CmsKeywordMngImpl

com.jeecms.common.util.StrUtils

fitnesse.fixtures.PageDriver

fitnesse.testsystems.slim.HtmlSlimTestSystem

fitnesse.testsystems.slim.HtmlTableScanner

fitnesse.util.HtmlParserToolsTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.