Examples of org.htmlparser.Parser

org.htmlparser.Parser

ahoo.com",new DefaultHTMLParserFeedback()); // In this example, we are registering all the common scanners parser.registerScanners(); for (NodeIterator i = parser.elements();i.hasMoreNodes();) { Node node = i.nextNode(); node.print(); } Below is some sample code to parse Yahoo.com and print only the text information. This scan will run faster, as there are no scanners registered here.

 Parser parser = new Parser("http://www.yahoo.com",new DefaultHTMLParserFeedback()); // In this example, none of the scanners need to be registered // as a string node is not a tag to be scanned for. for (NodeIterator i = parser.elements();i.hasMoreNodes();) { Node node = i.nextNode(); if (node instanceof StringNode) { StringNode stringNode = (StringNode)node; System.out.println(stringNode.getText()); } }

The above snippet will print out only the text contents in the HTML document.
Here's another snippet that will only print out the link urls in a document. This is an example of adding a link scanner.

 Parser parser = new Parser("http://www.yahoo.com",new DefaultHTMLParserFeedback()); parser.addScanner(new LinkScanner("-l")); for (NodeIterator i = parser.elements();i.hasMoreNodes();) { Node node = i.nextNode(); if (node instanceof LinkTag) { LinkTag linkTag = (LinkTag)node; System.out.println(linkTag.getLink()); } }

@see Parser#elements()

            throws HTMLParseException {
        if (log.isDebugEnabled()) {
            log.debug("Parsing html of: " + html);
        }


        Parser htmlParser = null;
        try {
            htmlParser = new Parser();
            htmlParser.setInputHTML(html);
        } catch (Exception e) {
            throw new HTMLParseException(e);
        }


        // Now parse the DOM tree
        try {
            // we start to iterate through the elements
            parseNodes(htmlParser.elements(), formEncodings, pageEncoding);
            log.debug("End   : parseNodes");
        } catch (ParserException e) {
            throw new HTMLParseException(e);
        }
    }

View Full Code Here


        if (log.isDebugEnabled()) {
            log.debug("Parsing html of: " + baseUrl);
        }


        Parser htmlParser = null;
        try {
            String contents = new String(html);
            htmlParser = new Parser();
            htmlParser.setInputHTML(contents);
        } catch (Exception e) {
            throw new HTMLParseException(e);
        }


        // Now parse the DOM tree
        try {
            // we start to iterate through the elements
            parseNodes(htmlParser.elements(), new URLPointer(baseUrl), urls);
            log.debug("End   : parseNodes");
        } catch (ParserException e) {
            throw new HTMLParseException(e);
        }

View Full Code Here

    localParent.setCursor(Cursor.getPredefinedCursor(Cursor.DEFAULT_CURSOR));


  }  


  private void trouverLiens() throws Exception{
    Parser parser;
    NodeList listUrlNode;
    NodeList listTitleNode;
    String tmpString;
    String tmpLien;
    String tmpValeur;
    LinkedList<HashMap> listUrl = new LinkedList<HashMap>();
    HashMap<String, String> tmpHashMap;
    String tmpRecherche;
    int i=0;


    try {
      tmpRecherche = "http://www.imdb.com/find?s=tt&q=" + localParent.getJValeurNom().getText().replaceAll(" ", "+");


      parser = new Parser(tmpRecherche);


      listTitleNode = parser.extractAllNodesThatMatch(new NodeClassFilter (TitleTag.class));
      // Start by detecting the title
      for(i = 0;i < listTitleNode.size(); i++){
        tmpString = listTitleNode.elementAt(i).toHtml();
        if(tmpString.indexOf("IMDb Title  Search") <= -1){
          urlFilm = tmpRecherche;
        }
      }


      if(urlFilm == null){
        parser.reset();
        parser.setResource(tmpRecherche);        
        listUrlNode = parser.extractAllNodesThatMatch (new NodeClassFilter (LinkTag.class));
        // First, detect the links
        for (i = 0; i < listUrlNode.size(); i++){
          tmpString = listUrlNode.elementAt(i).toHtml();
          tmpLien = tmpString.substring(tmpString.indexOf("<a href=")+9, tmpString.indexOf(">")-1);
          if(tmpLien.startsWith("/title/")){

View Full Code Here

    }  


  }


  private void recupererInfoFilm() throws Exception{
    Parser divParser;
    NodeList listDiv;
    Node tmpNode;  


    boolean acteursVu = false;
    boolean titreVu = false;
    boolean realisateurVu = false;
    boolean paysVu = false;
    boolean tempsVu = false;
    boolean genreVu = false;
    boolean awardsVu = false;
    boolean dateSortieVue = false;
    boolean imageVues = false;
    int indexDeb = 0;
    int indexFin = 0;




    try{
      divParser = new Parser(urlFilm);
      listDiv = divParser.extractAllNodesThatMatch (new NodeClassFilter (Div.class));
      for (int i = 0; i < listDiv.size(); i++){
        NodeList tmpDivNodeContent = listDiv.elementAt(i).getChildren();


        if(tmpDivNodeContent != null){
          for(int j = 0; j< tmpDivNodeContent.size(); j++){

View Full Code Here


        if (DEBUG) _log.debug("Parsing HTML data:\n" + htmlData);


        try
        {
            Parser parser = Parser.createParser(htmlData, null);


            NodeList heads = parser.parse(new TagNameFilter("HEAD"));
            if (heads.size() != 1)
                throw new DiscoveryException(
                        "HTML response must have exactly one HEAD element, " +
                                "found " + heads.size() + " : " + heads.toHtml());
            Node head = heads.elementAt(0);

View Full Code Here

            // parse and extract the needed info
            if (bytesRead <= 0)
                throw new YadisException("No data read from the HTML message",
                        YadisResult.HTMLMETA_DOWNLOAD_ERROR);


            Parser parser = Parser.createParser(new String(data, 0, bytesRead), null);
            NodeList heads = parser.parse(new TagNameFilter("HEAD"));


            if (heads.size() != 1)
                throw new YadisException(
                        "HTML response must have exactly one HEAD element, " +
                                "found " + heads.size() + " : "

View Full Code Here

    // find nodes
    Node[] tags;
    {
      String htmlContents = IOUtils2.readString(htmlFile);
      Lexer lexer = new Lexer(new Page(htmlContents));
      Parser parser = new Parser(lexer, new DefaultParserFeedback(DefaultParserFeedback.QUIET));
      TagFindingVisitor visitor = new TagFindingVisitor(new String[]{tagName});
      parser.visitAllNodesWith(visitor);
      tags = visitor.getTags(0);
    }
    // convert into List<TagNode>
    List<TagNode> tagNodes = Lists.newArrayList();
    CollectionUtils.addAll(tagNodes, tags);

View Full Code Here

     * @see net.sf.sitstart.svc.responseparser.ResponseParser#parseHtml(java.net.URI, java.io.InputStream)
     */
    public void parseHtml(URI uri, InputStream content)
    {
        ByteArrayOutputStream output = null;  //####TODO: Add in MIME type detection to this stream
        Parser                parser = null;
        NodeFilter            filter = null;
        NodeList              list   = null;
        
        
        if (uriFilter.isURIInternal(uri) && content != null)
        {
            log.debug("Parsing HTML from URI " + uri.toString());
        
            try
            {
                log.debug("Copying content.");
        
                output = new ByteArrayOutputStream();
                IOUtils.copy(content, output);
                
                log.debug("Creating filter.");
        
                //####TODO: Dependency inject this crap
                filter = new AndFilter(new NodeClassFilter(LinkTag.class),
                                       new NodeFilter()
                                       {
                                           public boolean accept(Node node)
                                           {
                                               return(!((LinkTag)node).isMailLink());
                                           }
                                       });
                
                log.debug("Creating parser.");
        
                parser = new Parser(output.toString());
                
                log.debug("Extracting all nodes that match the filter.");
        
                list = parser.extractAllNodesThatMatch(filter);
                
                log.debug("About to iterate through the matching nodes, count=" + list.size());
        
                for (int i = 0; i < list.size(); i++)
                {

View Full Code Here

     * @see org.apache.jmeter.protocol.http.parser.HtmlParser#getEmbeddedResourceURLs(byte[], java.net.URL)
     */
    public Iterator getEmbeddedResourceURLs(byte[] html, URL baseUrl, URLCollection urls)
        throws HTMLParseException
    {
        Parser htmlParser= null;
        try
        {
            String contents= new String(html);
            StringReader reader= new StringReader(contents);
            NodeReader nreader= new NodeReader(reader, contents.length());
            htmlParser= new Parser(nreader, new DefaultParserFeedback());
            addTagListeners(htmlParser);
        }
        catch (Exception e)
        {
            throw new HTMLParseException(e);
        }


        // Now parse the DOM tree


        // look for applets


        // This will only work with an Applet .class file.
        // Ideally, this should be upgraded to work with Objects (IE)
        //  and archives (.jar and .zip) files as well.


        try
        {
            // we start to iterate through the elements
            for (NodeIterator e= htmlParser.elements(); e.hasMoreNodes();)
            {
                Node node= e.nextNode();
                String binUrlStr= null;


                // first we check to see if body tag has a

View Full Code Here

    /** Creates new LinkBean */
    public LinkBean ()
    {
        mPropertySupport = new PropertyChangeSupport (this);
        mLinks = null;
        mParser = new Parser ();
    }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.htmlparser.Parser

cn.edu.pku.dr.requirement.elicitation.tools.HtmlTransformer

com.gnizr.core.util.FormatUtil

com.knowgate.hipermail.DBMimeMessage

com.knowgate.hipermail.HtmlMimeBodyPart

com.lanyuan.util.HttpClientUtils

com.vgo.movie.thread.DetailFilmThread

com.waxayaz.TomcatMI.core.utils.repoManager.TomcatRepositoryManager

com.wordpress.util.StringUtil

fitnesse.fixtures.PageDriver

fitnesse.slim.converters.MapEditor

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.