Package org.htmlparser.lexer

Examples of org.htmlparser.lexer.Lexer


  }

  @Test
  public void shouldAlsoCloneAttributes() throws ParserException, CloneNotSupportedException {
    String html = "<div class='foo'>funky <em>content</em></div>";
    Parser parser = new Parser(new Lexer(new Page(html)));
    NodeList tree = parser.parse(null);

    NodeList cloneTree = deepClone(tree);

    assertSame(Div.class, cloneTree.elementAt(0).getClass());
View Full Code Here


  }

  @Test
  public void flatCloneShouldJustGiveACopyOfANode() throws ParserException {
    String html = "<div class='foo'>funky <em>content</em></div>";
    Parser parser = new Parser(new Lexer(new Page(html)));
    NodeList tree = parser.parse(null);

    Node copy = flatClone(tree.elementAt(0));

    assertNull(copy.getParent());
View Full Code Here

    public void testParsingHtml() throws IOException, ParserException
    {
        String html = new Scanner(getClass().getResourceAsStream("response_destatis.txt"), "UTF-8") //
                        .useDelimiter("\\A").next();

        Lexer lexer = new Lexer(html);
        List<ConsumerPriceIndex> prices = new DestatisCPIFeed.Visitor().visit(lexer);

        assertThat(prices.size(), equalTo(19 /* years in file */* 12 + 6));

        ConsumerPriceIndex p = prices.get(5);
View Full Code Here

        {
            disableCertificateValidation();

            URL url = new URL(
                            "https://www.destatis.de/DE/ZahlenFakten/GesamtwirtschaftUmwelt/Preise/Verbraucherpreisindizes/Tabellen_/VerbraucherpreiseKategorien.html"); //$NON-NLS-1$
            Lexer lexer = new Lexer(url.openConnection());

            List<ConsumerPriceIndex> prices = new Visitor().visit(lexer);
            if (prices.isEmpty())
                throw new IOException(Messages.MsgResponseContainsNoIndices);
View Full Code Here

    }

    private void parseNode(List<AbstractContainer> nodes, int current, List<Item> items) {
        AbstractContainer node = nodes.get(current);
        final String content = node.getDataAsString();
        final Lexer lexer = new Lexer(content);
        Node cursor = null;

        try {
            while ((cursor = lexer.nextNode()) != null) {
                if (cursor instanceof Remark) {
                    items.add(new SimpleComment(cursor.getText()));
                } else if (cursor instanceof Text) {
                    items.add(new SimpleText(cursor.toHtml()));
                } else if (cursor instanceof org.htmlparser.Tag) {
View Full Code Here

    context.setJspExec(jspExec);
   
    // and finally, parse, using the special lexer that knows how to
    // handle javascript blocks containing unescaped HTML entities:
    Page lexPage = new Page(bais,charSet);
    Lexer lexer = new Lexer(lexPage);
    Lexer.STRICT_REMARKS = false;
      ContextAwareLexer lex = new ContextAwareLexer(lexer, context);

      Node node;
      while ((node = lex.nextNode()) != null) {
View Full Code Here

          "</body>" +
          "</html>";
      byte[] bytes = html.getBytes();
      ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
      Page page = new Page(bais, "UTF-8");
      Lexer lexer = new Lexer(page);
      Node node;
      while ((node = lexer.nextNode()) != null) {
        if (node instanceof Tag) {
          Tag tag = (Tag)node;
          if (tag.getTagName().equalsIgnoreCase("A") && !tag.isEndTag()) {
            assertEquals("href", "http://example.com/api?a=1&amp;b=2&c=3&#34;", tag.getAttribute("HREF"));
View Full Code Here

      final String html = "<![CDATA[aaaa\nbbbb]]>";

      byte[] bytes = html.getBytes();
      ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
      Page page = new Page(bais, "UTF-8");
      Lexer lexer = new Lexer(page);
      Node node;

      node = lexer.nextNode();
      // HTMLParser returns CDATA section as TagNode
      assertTrue(node instanceof TagNode);
      TagNode tag = (TagNode)node;
      // whose tagName is "![CDATA[" *plus* non-whitespace chars following
      // it.  And if they are alphabets, they get capitalized.
View Full Code Here


    // and finally, parse, using the special lexer that knows how to
    // handle javascript blocks containing unescaped HTML entities:
    Page lexPage = new Page(decodedResource,charSet);
    Lexer lexer = new Lexer(lexPage);
    Lexer.STRICT_REMARKS = false;
    ContextAwareLexer lex = new ContextAwareLexer(lexer, context);
    Node node;
    try {
      delegator.handleParseStart(context);
View Full Code Here

    ParseContext context = new ParseContext();
    
      Node node;
      try {
          ContextAwareLexer lex = new ContextAwareLexer(
              new Lexer(new Page(is,charSet)),context);
      while((node = lex.nextNode()) != null) {
//        System.err.println("\nDEBUG-Node:js("+context.isInJS()+")css("+context.isInCSS()+"):");
//        System.err.println("-------------------/START");
//        System.err.println(node.toHtml(true));
//        System.err.println("-------------------/END");
View Full Code Here

TOP

Related Classes of org.htmlparser.lexer.Lexer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.