Package org.htmlparser.util

Examples of org.htmlparser.util.NodeList


    }
   
    private void parseCategory(Node categoryNode, List<Category> categories) throws ParserException {
      NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
     
      NodeList links = new NodeList();
      categoryNode.collectInto(links, linkFilter);
     
      if (links.size() == 0) {
        return;
      }
     
        Category parent = null;
      NodeIterator iter = links.elements();
        while (iter.hasMoreNodes()) {
          LinkTag link = (LinkTag)iter.nextNode();
          Category category = new Category();
          category.setTitle(NodeUtils.getTextData(link));
          category.setId(link.getLink().replaceAll(".*/movieGenres/(.*?)", "$1"));
View Full Code Here


            parser.extractAllNodesThatMatch(infoCollector);
           
            String selectedPage = NodeUtils.getTextData(infoCollector.getNode(0));
            if (selectedPage != null && selectedPage.equals(Integer.toString(page))) {
           
                NodeList browseNodes = infoCollector.getNodeList(1);
               
                if (browseNodes.size() > 1) {
                    NodeIterator iter = browseNodes.elements();
                   
                    while (iter.hasMoreNodes()) {
                        MovieItem item = parseMovieSearch(iter.nextNode());
                        if (item != null) {
                            items.add(item);
View Full Code Here

    public void parseHtml(URI uri, InputStream content)
    {
        ByteArrayOutputStream output = null//####TODO: Add in MIME type detection to this stream
        Parser                parser = null;
        NodeFilter            filter = null;
        NodeList              list   = null;
       
       
        if (uriFilter.isURIInternal(uri) && content != null)
        {
            log.debug("Parsing HTML from URI " + uri.toString());
       
            try
            {
                log.debug("Copying content.");
       
                output = new ByteArrayOutputStream();
                IOUtils.copy(content, output);
               
                log.debug("Creating filter.");
       
                //####TODO: Dependency inject this crap
                filter = new AndFilter(new NodeClassFilter(LinkTag.class),
                                       new NodeFilter()
                                       {
                                           public boolean accept(Node node)
                                           {
                                               return(!((LinkTag)node).isMailLink());
                                           }
                                       });
               
                log.debug("Creating parser.");
       
                parser = new Parser(output.toString());
               
                log.debug("Extracting all nodes that match the filter.");
       
                list = parser.extractAllNodesThatMatch(filter);
               
                log.debug("About to iterate through the matching nodes, count=" + list.size());
       
                for (int i = 0; i < list.size(); i++)
                {
                    log.debug("Loop=" + i);
       
                    Node                linkTagNode = list.elementAt(i);
                    String              linkTag     = linkTagNode.toHtml();
                    URI                 targetURI   = parseLinkTag(uri, linkTagNode);
                    Map<String, String> properties  = new HashMap<String, String>();
                   
                    properties.put("linkTagText", linkTag);
View Full Code Here

        return;
      }

      myKey.set(key.get());

      NodeList nl;
      try {
        // initialize HTML parser
        parser.setInputHTML(doc.getContent());

        // parse the document
        nl = parser.parse(filter);
      } catch (ParserException e) {
        context.getCounter(LinkCounter.PARSER_FAILED).increment(1);
        myValue.setDocid(doc.getDocid());
        myValue.setContent("<DOC>\n<DOCNO>" + doc.getDocid() + "</DOCNO>\n<DOC>");
        context.write(myKey, myValue);
        return;
      } catch (StackOverflowError e) {
        context.getCounter(LinkCounter.PARSER_FAILED).increment(1);
        myValue.setDocid(doc.getDocid());
        myValue.setContent("<DOC>\n<DOCNO>" + doc.getDocid() + "</DOCNO>\n<DOC>");
        context.write(myKey, myValue);
        return;
      }

      strBuf.setLength(0);
      strBuf.append("<DOC>\n<DOCNO>");
      strBuf.append(doc.getDocid());
      strBuf.append("</DOCNO>\n");

      for (int i = 0; i < nl.size(); i++) {
        strBuf.append(nl.elementAt(i).toHtml()).append("\n");
      }
      strBuf.append("</DOC>\n");

      // create output document
      myValue.setDocid(doc.getDocid());
View Full Code Here

        parser.setInputHTML(doc.getContent()); // initializing the
        // parser with new HTML
        // content

        // Setting base URL for the current document
        NodeList nl = parser.parse(null);
        BaseHrefTag baseTag = new BaseHrefTag();
        baseTag.setBaseUrl(base);
        nl.add(baseTag);

        // re-initializing the parser with the fixed content
        parser.setInputHTML(nl.toHtml());

        // listing all LinkTag nodes
        list = parser.extractAllNodesThatMatch(filter);
      }
      catch (ParserException e)
View Full Code Here

      try {
        // initializing the parser with new content
        parser.setInputHTML(doc.getContent());

        // Setting base URL for the current document
        NodeList nl = parser.parse(null);
        BaseHrefTag baseTag = new BaseHrefTag();
        baseTag.setBaseUrl(base);
        nl.add(baseTag);

        // re-initializing the parser with the correct content
        parser.setInputHTML(nl.toHtml());

        // listing all LinkTag nodes
        list = parser.extractAllNodesThatMatch(filter);
      } catch (ParserException e) {
        reporter.incrCounter(LinkCounter.PARSER_FAILED, 1);
View Full Code Here

     * @exception ParserException If the parse fails.
     */
    protected URL[] extractLinks () throws ParserException
    {
        NodeFilter filter;
        NodeList list;
        Vector vector;
        LinkTag link;
        URL[] ret;

        mParser.reset ();
        filter = new NodeClassFilter (LinkTag.class);
        try
        {
            list = mParser.extractAllNodesThatMatch (filter);
        }
        catch (EncodingChangeException ece)
        {
            mParser.reset ();
            list = mParser.extractAllNodesThatMatch (filter);
        }
        vector = new Vector();
        for (int i = 0; i < list.size (); i++)
            try
            {
                link = (LinkTag)list.elementAt (i);
                vector.add(new URL (link.getLink ()));
            }
            catch (MalformedURLException murle)
            {
                //vector.remove (i);
View Full Code Here

     * Assign the <code>Nodes</code> property, firing the property change.
     * @param nodes The new value of the <code>Nodes</code> property.
     */
    protected void updateNodes (NodeList nodes)
    {
        NodeList oldValue;
        String oldText;
        String newText;

        if ((null == mNodes) || !mNodes.equals (nodes))
        {
View Full Code Here

    protected NodeList applyFilters ()
        throws
            ParserException
    {
        NodeFilter[] filters;
        NodeList ret;

        ret = mParser.parse (null);
        filters = getFilters ();
        if (null != filters)
            for (int i = 0; i < filters.length; i++)
                ret = ret.extractAllNodesThatMatch (filters[i], mRecursive);

        return (ret);
    }
View Full Code Here

     * Fetch the URL contents and filter it.
     * Only do work if there is a valid parser with its URL set.
     */
    protected void setNodes ()
    {
        NodeList list;

        if (null != getURL ())
            try
            {
                list = applyFilters ();
                updateNodes (list);
            }
            catch (EncodingChangeException ece)
            {
                try
                {   // try again with the encoding now in force
                    mParser.reset ();
                    list = applyFilters ();
                    updateNodes (list);
                }
                catch (ParserException pe)
                {
                    updateNodes (new NodeList ());
                }
             }
            catch (ParserException pe)
            {
                updateNodes (new NodeList ());
            }
    }
View Full Code Here

TOP

Related Classes of org.htmlparser.util.NodeList

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.