Examples of org.htmlparser.lexer.Lexer

org.htmlparser.lexer.Lexer
This class parses the HTML stream into nodes. There are three major types of nodes (lexemes):
- Remark
- Text
- Tag
Each time nextNode() is called, the next node is returned; once the stream is exhausted, null is returned.

    }


    // Using HTMLParser to extract the content
    String cleanedContent = null;
    Page htmlPage = new Page(cuttedContent, "UTF-8");
    Parser parser = new Parser(new Lexer(htmlPage));
    StringBean stringBean = new StringBean();


    // replace multiple whitespace with one whitespace
    stringBean.setCollapse(true);
    // Do not extract URLs
    stringBean.setLinks(false);
    // replace &nbsp; with whitespace
    stringBean.setReplaceNonBreakingSpaces(true);


    try {
      // Parse the content
      parser.visitAllNodesWith(stringBean);
      cleanedContent = stringBean.getStrings();


    } catch (ParserException ex) {
      throw new RegainException("Error while parsing content: ", ex);
    }


    // The result of parsing the html-content
    setCleanedContent(cleanedContent);


    // Extract links
    LinkVisitor linkVisitor = new LinkVisitor();
    if (isContentCutted) {
      // This means a new parser run, which is expensive but necessary
      htmlPage = new Page(rawDocument.getContentAsString(), "UTF-8");
      parser = new Parser(new Lexer(htmlPage));
    } else {
      parser.reset();
    }


    try {

View Full Code Here

    @Override
    protected String toString(Object o,
            ResourcePropertyMapping resourcePropertyMapping,
            MarshallingContext context) {
        String str = (String) o;
        Lexer l = new Lexer(str);
        Parser parser = new Parser(l);
        StringBean sb = new StringBean();


        try {
            parser.visitAllNodesWith(sb);

View Full Code Here

            String s;
            while ((s = br.readLine()) != null) {
                stringBuilder.append(s);
            }


            Lexer l = new Lexer(stringBuilder.toString());
            Parser parser = new Parser(l);
            StringBean sb = new StringBean();


            parser.visitAllNodesWith(sb);

View Full Code Here

        try {
            page = new Page(in, null);
        } catch (final UnsupportedEncodingException e) {
            throw new ScimpiException(e);
        }
        final Lexer lexer = new Lexer(page);


        Node node = null;
        try {
            Stack<Snippet> tags = allTags;
            String lineNumbers = "1";
            String template = null;
            tags.push(new HtmlSnippet(lineNumbers, filePath));


            // NOTE done like this the tags can be cached for faster processing
            while ((node = lexer.nextNode()) != null) {
                if (node instanceof Remark) {
                    // TODO need to pick up on comments within tags; at the
                    // moment this splits a tag into two causing a
                    // failure later
                    continue;

View Full Code Here

        this("http://www.textsfromlastnight.com/Random-Texts-From-Last-Night.html");
    }


    private void populateTexts() throws InterruptedException {
        HttpURLConnection connection;
        Lexer lexer;


        try {
            connection = (HttpURLConnection) this.url.openConnection();
            lexer = new Lexer(connection);
        } catch (IOException e) {
            log.warn("could not open connection", e);
            this.errorMessage.put("Could not establish a connection to textsfromlastnight.com");
            return;


        } catch (ParserException e) {
            log.error("parse error", e);
            this.errorMessage.put("Parser error");
            return;
        }


        Node node = null;


        while (true) {
            try {
                node = lexer.nextNode();
                if (node == null) {
                    break;
                }


                if (node instanceof Tag) {
                    Tag tag = (Tag) node;
                    String tagReadonly = tag.getAttribute("readonly");
                    String tagName = tag.getRawTagName();


                    if (tagName.equals("textarea") && tagReadonly != null && tagReadonly.equals("readonly")) {
                        node = lexer.nextNode();
                        if (node instanceof Text) {
                            String text = ((Text) node).getText();
                            text = Translate.decode(text);
                            this.texts.put(text);
                        }

View Full Code Here

  private static List<TagNode> getHTMLFileTags(IFile htmlFile, String tagName) throws Exception {
    // find nodes
    Node[] tags;
    {
      String htmlContents = IOUtils2.readString(htmlFile);
      Lexer lexer = new Lexer(new Page(htmlContents));
      Parser parser = new Parser(lexer, new DefaultParserFeedback(DefaultParserFeedback.QUIET));
      TagFindingVisitor visitor = new TagFindingVisitor(new String[]{tagName});
      parser.visitAllNodesWith(visitor);
      tags = visitor.getTags(0);
    }

View Full Code Here

     * @return The line number, or -1 if none is available.
     * @see #getColumnNumber
     */
    public int getLineNumber ()
    {
        Lexer lexer;
        
        lexer = mParser.getLexer ();
        return (lexer.getPage ().row (lexer.getCursor ()));
    }

View Full Code Here

     * @return The column number, or -1 if none is available.
     * @see #getLineNumber
     */
    public int getColumnNumber ()
    {
        Lexer lexer;
        
        lexer = mParser.getLexer ();
        return (lexer.getPage ().column (lexer.getCursor ()));
    }

View Full Code Here


        if (null != mContentHandler)
            try
            {
                mParser = new Parser (
                    new Lexer (
                        new Page (
                            input.getByteStream (),
                            input.getEncoding ())));
                locator = new Locator (mParser);
                if (null != mErrorHandler)

View Full Code Here

        try {
            page = new Page(in, null);
        } catch (final UnsupportedEncodingException e) {
            throw new ScimpiException(e);
        }
        final Lexer lexer = new Lexer(page);


        Node node = null;
        try {
            Stack<Snippet> tags = allTags;
            String lineNumbers = "1";
            String template = null;
            tags.push(new HtmlSnippet(lineNumbers, filePath));


            // NOTE done like this the tags can be cached for faster processing
            while ((node = lexer.nextNode()) != null) {
                if (node instanceof Remark) {
                    // TODO need to pick up on comments within tags; at the
                    // moment this splits a tag into two causing a
                    // failure later
                    continue;

View Full Code Here

0 1 2 3 4 5 6

TOP

Related Classes of org.htmlparser.lexer.Lexer

com.apress.progwt.server.lucene.HTMLAnalyzer

com.apress.progwt.server.lucene.HTMLConverter

com.brewtab.ircbot.applets.TextsFromLastNightApplet

com.google.gdt.eclipse.designer.util.Utils

com.jeecms.cms.manager.assist.impl.CmsKeywordMngImpl

com.jeecms.common.util.StrUtils

fitnesse.fixtures.PageDriver

fitnesse.testsystems.slim.HtmlSlimTestSystem

fitnesse.testsystems.slim.HtmlTableScanner

fitnesse.util.HtmlParserToolsTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Inc. Contact coftware#gmail.com.