Examples of org.htmlparser.Parser

org.htmlparser.Parser

ahoo.com",new DefaultHTMLParserFeedback()); // In this example, we are registering all the common scanners parser.registerScanners(); for (NodeIterator i = parser.elements();i.hasMoreNodes();) { Node node = i.nextNode(); node.print(); } Below is some sample code to parse Yahoo.com and print only the text information. This scanning will run faster, as there are no scanners registered here.

 Parser parser = new Parser("http://www.yahoo.com",new DefaultHTMLParserFeedback()); // In this example, none of the scanners need to be registered // as a string node is not a tag to be scanned for. for (NodeIterator i = parser.elements();i.hasMoreNodes();) { Node node = i.nextNode(); if (node instanceof StringNode) { StringNode stringNode = (StringNode)node; System.out.println(stringNode.getText()); } }

The above snippet will print out only the text contents of the HTML document.
Here's another snippet that will print out only the link URLs in a document. This is an example of adding a link scanner.

 Parser parser = new Parser("http://www.yahoo.com",new DefaultHTMLParserFeedback()); parser.addScanner(new LinkScanner("-l")); for (NodeIterator i = parser.elements();i.hasMoreNodes();) { Node node = i.nextNode(); if (node instanceof LinkTag) { LinkTag linkTag = (LinkTag)node; System.out.println(linkTag.getLink()); } }

@see Parser#elements()

        ServletOutputStream out = response.getOutputStream();
        
        try{


            htmlBuffer = "<html>" + htmlBuffer + "</html>";
            Parser parser = new Parser();
            parser.setInputHTML(htmlBuffer);
            NodeList nodelist = parser.parse(null);


            NodeList tableList = nodelist.extractAllNodesThatMatch(new TagNameFilter("TABLE"), true);
            NodeList  headList = tableList.extractAllNodesThatMatch(new TagNameFilter("THEAD"), true);
            NodeList  footList = tableList.extractAllNodesThatMatch(new TagNameFilter("TFOOT"), true);
            NodeList  rowList = tableList.extractAllNodesThatMatch(new TagNameFilter("TR"), true);

View Full Code Here


        // Trace which document is about to be parsed; guarded so the message
        // string is only built when debug logging is enabled.
        if (log.isDebugEnabled()) {
            log.debug("Parsing html of: " + baseUrl);
        }


        // Build the parser: turn `html` into a String using `encoding`
        // (presumably `html` is the raw byte[] of the page -- confirm against
        // the enclosing method's signature) and feed that text to a new Parser.
        Parser htmlParser = null;
        try {
            String contents = new String(html,encoding); 
            htmlParser = new Parser();
            htmlParser.setInputHTML(contents);
        } catch (Exception e) {
            // Any failure during decoding or parser setup is rethrown as an
            // HTMLParseException that wraps the original exception.
            throw new HTMLParseException(e);
        }


        // Now parse the DOM tree
        try {
            // we start to iterate through the elements
            // parseNodes receives the parser's node iterator, the base URL
            // wrapped in a URLPointer, and `urls` (presumably the collection
            // that gathers the discovered URLs -- verify against parseNodes).
            parseNodes(htmlParser.elements(), new URLPointer(baseUrl), urls);
            log.debug("End   : parseNodes");
        } catch (ParserException e) {
            // Parser failures are rethrown as HTMLParseException, keeping the cause.
            throw new HTMLParseException(e);
        }

View Full Code Here

    
  }
  
  public String getGraphSourceString(String html) {
    try {
      Parser parser = new Parser(html);
      Node node = getElementById(parser, GRAPH_STRING_ID);
      return node != null ? node.toPlainTextString().replaceAll(LINE_BREAK, "\n") : "";
    } catch (ParserException e) {
      e.printStackTrace();
    }

View Full Code Here

  
  public void testGetElementById() {
    // Should output:
    //      This is a level 2 span
    try {
      Parser parser = new Parser(TEST_STRING);
      Node node = getElementById(parser, "L2");
      if (node != null) {
        System.out.println(node.toPlainTextString());
      } else {
        System.out.println("null");

View Full Code Here


    }


    public static String getGraphSourceString(String html) {
        try {
            Parser parser = new Parser(html);
            Node node = getElementById(parser, GRAPH_STRING_ID);
            return node != null ? node.toPlainTextString().replaceAll(
                    LINE_BREAK, "\n") : "";
        } catch (ParserException e) {
            e.printStackTrace();

View Full Code Here


    public void testGetElementById() {
        // Should output:
        // This is a level 2 span
        try {
            Parser parser = new Parser(TEST_STRING);
            Node node = getElementById(parser, "L2");
            if (node != null) {
                System.out.println(node.toPlainTextString());
            } else {
                System.out.println("null");

View Full Code Here


  @Override
  public List<UrlData> getUrlData() {
    final List<UrlData> dataList = new ArrayList<UrlData>();
    try {
      Parser parser = new Parser();
      parser.setURL(channelUrl);
      parser.setEncoding("Gb2312");
      NodeVisitor visitor = new NodeVisitor() {


        public void visitTag(Tag tag) {
          if (Div.class.equals(tag.getClass())
              && "m".equals(tag.getAttribute("class"))
              && "sortlist".equals(tag.getAttribute("id"))) {
            tag.accept(new NodeVisitor() {
              public void visitTag(Tag tag) {
                if (Div.class.equals(tag.getClass())
                    && "con".equals(tag
                        .getAttribute("class"))) {
                  tag.accept(new NodeVisitor() {
                    public void visitTag(Tag tag) {
                      UrlData data = new UrlData();
                      if (LinkTag.class.equals(tag
                          .getClass())) {
                        data.setUrlName(tag
                            .getAttribute("title"));
                        data.setUrl(tag
                            .getAttribute("href"));
                        try {
                          LoggerUtils
                              .log(DigitalUrlGetter.class
                                  .getName(),
                                  data.toString());
                        } catch (IOException e) {
                          // TODO Auto-generated catch
                          e.printStackTrace();
                        }
                        System.out.println(data);
                        dataList.add(data);
                      }
                    };
                  });
                }
              };
            });
          }
        }
      };
      parser.visitAllNodesWith(visitor);
    } catch (ParserException e) {
      e.printStackTrace();
    }
    return dataList;
  }

View Full Code Here

  }


  private List<Data> htmlParser() {
    final List<Data> dataList = new ArrayList<Data>();
    try {
      Parser parser = new Parser();
      parser.setURL(url);
      parser.setEncoding("Gb2312");
      NodeVisitor visitor = new NodeVisitor() {


        public void visitTag(Tag tag) {
          if (Div.class.equals(tag.getClass())
              && "p-img".equals(tag.getAttribute("class"))) {
            final Data data = new Data();
            tag.accept(new NodeVisitor() {
              public void visitTag(Tag tag) {
                if (ImageTag.class.equals(tag.getClass())) {
                  data.setName(tag.getAttribute("alt"));
                }
                if (LinkTag.class.equals(tag.getClass())) {
                  data.setUri(tag.getAttribute("href"));
                }
              };
            });
            getDataId(data);
            getPrice(data);
            try {
              LoggerUtils.log(CameraGetter.class.getName(),
                  data.toString());
            } catch (IOException e) {
              // TODO Auto-generated catch block
              e.printStackTrace();
            }
            dataList.add(data);
          }
        }
      };
      parser.visitAllNodesWith(visitor);
    } catch (ParserException e) {
      e.printStackTrace();
    }
    return dataList;
  }

View Full Code Here


        if (DEBUG) _log.debug("Parsing HTML data:\n" + htmlData);


        try
        {
            Parser parser = Parser.createParser(htmlData, null);


            NodeList heads = parser.parse(new TagNameFilter("HEAD"));
            if (heads.size() != 1)
                throw new DiscoveryException(
                        "HTML response must have exactly one HEAD element, " +
                                "found " + heads.size() + " : " + heads.toHtml());
            Node head = heads.elementAt(0);

View Full Code Here


   private int http(File rootDir) throws Exception {


      String s = loadFromUrl("http://" + BINGOBANKO_URL + "/print/?boardCount=9");


      Parser parser = new Parser(s);


      OrFilter filter = new OrFilter(new TagNameFilter("IMG"), new TagNameFilter("script"));


      if (bingoIndex == null) {
         bingoIndex = (int) ((System.currentTimeMillis() - 1317495600085l) / 604800000) + 40;

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.htmlparser.Parser

cn.edu.pku.dr.requirement.elicitation.tools.HtmlTransformer

com.gnizr.core.util.FormatUtil

com.knowgate.hipermail.DBMimeMessage

com.knowgate.hipermail.HtmlMimeBodyPart

com.lanyuan.util.HttpClientUtils

com.vgo.movie.thread.DetailFilmThread

com.waxayaz.TomcatMI.core.utils.repoManager.TomcatRepositoryManager

com.wordpress.util.StringUtil

fitnesse.fixtures.PageDriver

fitnesse.slim.converters.MapEditor

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.