Examples of org.htmlparser.lexer.Lexer

org.htmlparser.lexer.Lexer
This class parses the HTML stream into nodes. There are three major types of nodes (lexemes):
- Remark
- Text
- Tag
Each time nextNode() is called, the next node is returned; once the stream is exhausted, null is returned instead.

  }


  @Test
  public void shouldAlsoCloneAttributes() throws ParserException, CloneNotSupportedException {
    String html = "<div class='foo'>funky <em>content</em></div>";
    Parser parser = new Parser(new Lexer(new Page(html)));
    NodeList tree = parser.parse(null);


    NodeList cloneTree = deepClone(tree);


    assertSame(Div.class, cloneTree.elementAt(0).getClass());

View Full Code Here

  }


  @Test
  public void flatCloneShouldJustGiveACopyOfANode() throws ParserException {
    String html = "<div class='foo'>funky <em>content</em></div>";
    Parser parser = new Parser(new Lexer(new Page(html)));
    NodeList tree = parser.parse(null);


    Node copy = flatClone(tree.elementAt(0));


    assertNull(copy.getParent());

View Full Code Here

    public void testParsingHtml() throws IOException, ParserException
    {
        String html = new Scanner(getClass().getResourceAsStream("response_destatis.txt"), "UTF-8") //
                        .useDelimiter("\\A").next();


        Lexer lexer = new Lexer(html);
        List<ConsumerPriceIndex> prices = new DestatisCPIFeed.Visitor().visit(lexer);


        assertThat(prices.size(), equalTo(19 /* years in file */* 12 + 6));


        ConsumerPriceIndex p = prices.get(5);

View Full Code Here

        {
            disableCertificateValidation();


            URL url = new URL(
                            "https://www.destatis.de/DE/ZahlenFakten/GesamtwirtschaftUmwelt/Preise/Verbraucherpreisindizes/Tabellen_/VerbraucherpreiseKategorien.html"); //$NON-NLS-1$
            Lexer lexer = new Lexer(url.openConnection());


            List<ConsumerPriceIndex> prices = new Visitor().visit(lexer);
            if (prices.isEmpty())
                throw new IOException(Messages.MsgResponseContainsNoIndices);

View Full Code Here

    }


    private void parseNode(List<AbstractContainer> nodes, int current, List<Item> items) {
        AbstractContainer node = nodes.get(current);
        final String content = node.getDataAsString();
        final Lexer lexer = new Lexer(content);
        Node cursor = null;


        try {
            while ((cursor = lexer.nextNode()) != null) {
                if (cursor instanceof Remark) {
                    items.add(new SimpleComment(cursor.getText()));
                } else if (cursor instanceof Text) {
                    items.add(new SimpleText(cursor.toHtml()));
                } else if (cursor instanceof org.htmlparser.Tag) {

View Full Code Here

    context.setJspExec(jspExec);
    
    // and finally, parse, using the special lexer that knows how to
    // handle javascript blocks containing unescaped HTML entities:
    Page lexPage = new Page(bais,charSet);
    Lexer lexer = new Lexer(lexPage);
    Lexer.STRICT_REMARKS = false;
      ContextAwareLexer lex = new ContextAwareLexer(lexer, context);


      Node node;
      while ((node = lex.nextNode()) != null) {

View Full Code Here

          "</body>" +
          "</html>";
      byte[] bytes = html.getBytes();
      ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
      Page page = new Page(bais, "UTF-8");
      Lexer lexer = new Lexer(page);
      Node node;
      while ((node = lexer.nextNode()) != null) {
        if (node instanceof Tag) {
          Tag tag = (Tag)node;
          if (tag.getTagName().equalsIgnoreCase("A") && !tag.isEndTag()) {
            assertEquals("href", "http://example.com/api?a=1&amp;b=2&c=3&#34;", tag.getAttribute("HREF"));

View Full Code Here

      final String html = "<![CDATA[aaaa\nbbbb]]>";


      byte[] bytes = html.getBytes();
      ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
      Page page = new Page(bais, "UTF-8");
      Lexer lexer = new Lexer(page);
      Node node;


      node = lexer.nextNode();
      // HTMLParser returns CDATA section as TagNode
      assertTrue(node instanceof TagNode);
      TagNode tag = (TagNode)node;
      // whose tagName is "![CDATA[" *plus* the non-whitespace chars following
      // it.  If those chars are letters, they get capitalized.

View Full Code Here




    // and finally, parse, using the special lexer that knows how to
    // handle javascript blocks containing unescaped HTML entities:
    Page lexPage = new Page(decodedResource,charSet);
    Lexer lexer = new Lexer(lexPage);
    Lexer.STRICT_REMARKS = false;
    ContextAwareLexer lex = new ContextAwareLexer(lexer, context);
    Node node;
    try {
      delegator.handleParseStart(context);

View Full Code Here

    ParseContext context = new ParseContext();
     
      Node node;
      try {
          ContextAwareLexer lex = new ContextAwareLexer(
              new Lexer(new Page(is,charSet)),context);
      while((node = lex.nextNode()) != null) {
//        System.err.println("\nDEBUG-Node:js("+context.isInJS()+")css("+context.isInCSS()+"):");
//        System.err.println("-------------------/START");
//        System.err.println(node.toHtml(true));
//        System.err.println("-------------------/END");

View Full Code Here

0 1 2 3 4 5 6

TOP

Related Classes of org.htmlparser.lexer.Lexer

com.apress.progwt.server.lucene.HTMLAnalyzer

com.apress.progwt.server.lucene.HTMLConverter

com.brewtab.ircbot.applets.TextsFromLastNightApplet

com.google.gdt.eclipse.designer.util.Utils

com.jeecms.cms.manager.assist.impl.CmsKeywordMngImpl

com.jeecms.common.util.StrUtils

fitnesse.fixtures.PageDriver

fitnesse.testsystems.slim.HtmlSlimTestSystem

fitnesse.testsystems.slim.HtmlTableScanner

fitnesse.util.HtmlParserToolsTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by ORACLE Inc. Contact coftware#gmail.com.