Package org.htmlparser

Examples of org.htmlparser.Parser


            throws HTMLParseException {
        if (log.isDebugEnabled()) {
            log.debug("Parsing html of: " + html);
        }

        Parser htmlParser = null;
        try {
            htmlParser = new Parser();
            htmlParser.setInputHTML(html);
        } catch (Exception e) {
            throw new HTMLParseException(e);
        }

        // Now parse the DOM tree
        try {
            // we start to iterate through the elements
            parseNodes(htmlParser.elements(), formEncodings, pageEncoding);
            log.debug("End   : parseNodes");
        } catch (ParserException e) {
            throw new HTMLParseException(e);
        }
    }
View Full Code Here


        if (log.isDebugEnabled()) {
            log.debug("Parsing html of: " + baseUrl);
        }

        Parser htmlParser = null;
        try {
            String contents = new String(html);
            htmlParser = new Parser();
            htmlParser.setInputHTML(contents);
        } catch (Exception e) {
            throw new HTMLParseException(e);
        }

        // Now parse the DOM tree
        try {
            // we start to iterate through the elements
            parseNodes(htmlParser.elements(), new URLPointer(baseUrl), urls);
            log.debug("End   : parseNodes");
        } catch (ParserException e) {
            throw new HTMLParseException(e);
        }
View Full Code Here

    localParent.setCursor(Cursor.getPredefinedCursor(Cursor.DEFAULT_CURSOR));

 

  private void trouverLiens() throws Exception{
    Parser parser;
    NodeList listUrlNode;
    NodeList listTitleNode;
    String tmpString;
    String tmpLien;
    String tmpValeur;
    LinkedList<HashMap> listUrl = new LinkedList<HashMap>();
    HashMap<String, String> tmpHashMap;
    String tmpRecherche;
    int i=0;

    try {
      tmpRecherche = "http://www.imdb.com/find?s=tt&q=" + localParent.getJValeurNom().getText().replaceAll(" ", "+");

      parser = new Parser(tmpRecherche);

      listTitleNode = parser.extractAllNodesThatMatch(new NodeClassFilter (TitleTag.class));
      //On commence par détecter le titre
      for(i = 0;i < listTitleNode.size(); i++){
        tmpString = listTitleNode.elementAt(i).toHtml();
        if(tmpString.indexOf("IMDb Title  Search") <= -1){
          urlFilm = tmpRecherche;
        }
      }

      if(urlFilm == null){
        parser.reset();
        parser.setResource(tmpRecherche);       
        listUrlNode = parser.extractAllNodesThatMatch (new NodeClassFilter (LinkTag.class));
        //On détecte d'abord les liens
        for (i = 0; i < listUrlNode.size(); i++){
          tmpString = listUrlNode.elementAt(i).toHtml();
          tmpLien = tmpString.substring(tmpString.indexOf("<a href=")+9, tmpString.indexOf(">")-1);
          if(tmpLien.startsWith("/title/")){
View Full Code Here

   

  }

  private void recupererInfoFilm() throws Exception{
    Parser divParser;
    NodeList listDiv;
    Node tmpNode; 

    boolean acteursVu = false;
    boolean titreVu = false;
    boolean realisateurVu = false;
    boolean paysVu = false;
    boolean tempsVu = false;
    boolean genreVu = false;
    boolean awardsVu = false;
    boolean dateSortieVue = false;
    boolean imageVues = false;
    int indexDeb = 0;
    int indexFin = 0;


    try{
      divParser = new Parser(urlFilm);
      listDiv = divParser.extractAllNodesThatMatch (new NodeClassFilter (Div.class));
      for (int i = 0; i < listDiv.size(); i++){
        NodeList tmpDivNodeContent = listDiv.elementAt(i).getChildren();

        if(tmpDivNodeContent != null){
          for(int j = 0; j< tmpDivNodeContent.size(); j++){
View Full Code Here

        if (DEBUG) _log.debug("Parsing HTML data:\n" + htmlData);

        try
        {
            Parser parser = Parser.createParser(htmlData, null);

            NodeList heads = parser.parse(new TagNameFilter("HEAD"));
            if (heads.size() != 1)
                throw new DiscoveryException(
                        "HTML response must have exactly one HEAD element, " +
                                "found " + heads.size() + " : " + heads.toHtml());
            Node head = heads.elementAt(0);
View Full Code Here

            // parse and extract the needed info
            if (bytesRead <= 0)
                throw new YadisException("No data read from the HTML message",
                        YadisResult.HTMLMETA_DOWNLOAD_ERROR);

            Parser parser = Parser.createParser(new String(data, 0, bytesRead), null);
            NodeList heads = parser.parse(new TagNameFilter("HEAD"));

            if (heads.size() != 1)
                throw new YadisException(
                        "HTML response must have exactly one HEAD element, " +
                                "found " + heads.size() + " : "
View Full Code Here

    // find nodes
    Node[] tags;
    {
      String htmlContents = IOUtils2.readString(htmlFile);
      Lexer lexer = new Lexer(new Page(htmlContents));
      Parser parser = new Parser(lexer, new DefaultParserFeedback(DefaultParserFeedback.QUIET));
      TagFindingVisitor visitor = new TagFindingVisitor(new String[]{tagName});
      parser.visitAllNodesWith(visitor);
      tags = visitor.getTags(0);
    }
    // convert into List<TagNode>
    List<TagNode> tagNodes = Lists.newArrayList();
    CollectionUtils.addAll(tagNodes, tags);
View Full Code Here

     * @see net.sf.sitstart.svc.responseparser.ResponseParser#parseHtml(java.net.URI, java.io.InputStream)
     */
    public void parseHtml(URI uri, InputStream content)
    {
        ByteArrayOutputStream output = null//####TODO: Add in MIME type detection to this stream
        Parser                parser = null;
        NodeFilter            filter = null;
        NodeList              list   = null;
       
       
        if (uriFilter.isURIInternal(uri) && content != null)
        {
            log.debug("Parsing HTML from URI " + uri.toString());
       
            try
            {
                log.debug("Copying content.");
       
                output = new ByteArrayOutputStream();
                IOUtils.copy(content, output);
               
                log.debug("Creating filter.");
       
                //####TODO: Dependency inject this crap
                filter = new AndFilter(new NodeClassFilter(LinkTag.class),
                                       new NodeFilter()
                                       {
                                           public boolean accept(Node node)
                                           {
                                               return(!((LinkTag)node).isMailLink());
                                           }
                                       });
               
                log.debug("Creating parser.");
       
                parser = new Parser(output.toString());
               
                log.debug("Extracting all nodes that match the filter.");
       
                list = parser.extractAllNodesThatMatch(filter);
               
                log.debug("About to iterate through the matching nodes, count=" + list.size());
       
                for (int i = 0; i < list.size(); i++)
                {
View Full Code Here

     * @see org.apache.jmeter.protocol.http.parser.HtmlParser#getEmbeddedResourceURLs(byte[], java.net.URL)
     */
    public Iterator getEmbeddedResourceURLs(byte[] html, URL baseUrl, URLCollection urls)
        throws HTMLParseException
    {
        Parser htmlParser= null;
        try
        {
            String contents= new String(html);
            StringReader reader= new StringReader(contents);
            NodeReader nreader= new NodeReader(reader, contents.length());
            htmlParser= new Parser(nreader, new DefaultParserFeedback());
            addTagListeners(htmlParser);
        }
        catch (Exception e)
        {
            throw new HTMLParseException(e);
        }

        // Now parse the DOM tree

        // look for applets

        // This will only work with an Applet .class file.
        // Ideally, this should be upgraded to work with Objects (IE)
        //  and archives (.jar and .zip) files as well.

        try
        {
            // we start to iterate through the elements
            for (NodeIterator e= htmlParser.elements(); e.hasMoreNodes();)
            {
                Node node= e.nextNode();
                String binUrlStr= null;

                // first we check to see if body tag has a
View Full Code Here

    /** Creates new LinkBean */
    public LinkBean ()
    {
        mPropertySupport = new PropertyChangeSupport (this);
        mLinks = null;
        mParser = new Parser ();
    }
View Full Code Here

TOP

Related Classes of org.htmlparser.Parser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.