Package modelcrawler

Source Code of modelcrawler.Page

/**
*
*/
package modelcrawler;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;

/**
* @author Niall
*
*/
public class Page {

  private Crawler _crawler = null;
 
  private URL _url = null;
  private String _content = null;
  private List<Page> _links = null;
  private List<String> _domains = null;
  private List<String> _keywords = null;
 
  private Date _lastDownloaded = null;
  private int _downloadAttempts = 0;
  private int _indexAttempts = 0;
 
  /**
   *
   */
  public Page(String url) {
    try {
      _url = new URL(url);
    } catch (MalformedURLException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
    _links = new ArrayList<Page>();
    _domains = new ArrayList<String>();
    _keywords = new ArrayList<String>();
  }
 
  public Page(Crawler c, String url) {
    _crawler = c;
    try {
      _url = new URL(url);
    } catch (MalformedURLException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
    _links = new ArrayList<Page>();
    _domains = new ArrayList<String>();
    _keywords = new ArrayList<String>();
  }
 

  public boolean download() {
    _downloadAttempts++;
    try {
      Parser par = new Parser(_url.toString());
      Parser.getConnectionManager().setRedirectionProcessingEnabled(true);
      Parser.getConnectionManager().setCookieProcessingEnabled(true);
     
      StringBuffer sb = new StringBuffer();
      for (NodeIterator e = par.elements(); e.hasMoreNodes();) {
        try {
          String s = e.nextNode().toPlainTextString();
          sb.append(s);
        } catch (EncodingChangeException ece) {
         
        }
      }
      _lastDownloaded = new Date();
     
      System.out.println("Page Downloaded: " + _url.toString());
     
      setContent(sb.toString());
     
      par.reset();
      _links.clear();
     
      NodeFilter filter = new OrFilter( new NodeFilter[] {
          new TagNameFilter("A"),
          new TagNameFilter("META")
          }
      );
      NodeList list = par.parse( filter );
      System.out.println("Url: " + _url.toString() + " found " + list.size() + " items.");
      LinkTag lt = null;
      for (SimpleNodeIterator e = list.elements (); e.hasMoreNodes (); ) {
        Node n = e.nextNode();
        if (n.getClass() == LinkTag.class ) {
          lt = (LinkTag)n;
          if ( FileTypes.isAllowedFileType(lt.getLink())) {
            String url = par.getLexer().getPage().getAbsoluteURL(lt.getLink());
            URL c;
            try {
              c = new URL(url);
              Page p = new Page(c.toString());
              if ( _crawler != null ) {
                _crawler.addForDownload(p);
              }
              _links.add(p);
              addDomain(c.toString().replaceAll(c.getPath(), ""));
            } catch (MalformedURLException e1) {
              // TODO Auto-generated catch block
              e1.printStackTrace();
              return false;
            }
          }
          /*
          String[] kws = lt.getLinkText().split(" ");
          for( String kw : kws ) {
            if ( !kw.isEmpty() ) {
              _keywords.add(kw);
            }
          }
          */
        } else if ( n.getClass() == MetaTag.class ) {
          MetaTag t = (MetaTag)n;
          if (t.getAttribute("name") != null && t.getAttribute("name").equals("keywords") ) {
            String[] kws = t.getAttribute("content").split(",");
            for( String kw : kws ) {
              _keywords.add(kw);
            }
          }
        }
      }
      return true;
    } catch (ParserException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
      return false;
    }
  }
 
  private void addDomain(String domain) {
    if ( !_domains.contains(domain) && domain.indexOf(getUrlDomain()) == -1 ) {
      _domains.add(domain);
    } else {
      System.out.println("Duplicate Domain excluded from list");
    }
  }
  public List<Page> getPages() {
    return _links;
  }
 
  public List<String> getDomains() {
    return _domains;
  }
 
  public String getUrlDomain() {
    return _url.toString().replaceAll(_url.getPath(), "");
  }
 
  public void setContent(String _content) {
    this._content = _content;
  }

  public String getContent() {
    return _content;
  }

  public String getIndex() {
    StringBuilder sb = new StringBuilder();
    for( String s : _keywords ) {
      sb.append(s + ",");
    }
    return sb.toString();
  }
  @Override
  public String toString() {
    return _url.toString();
  }

  public Date getLastDownloaded() {
    return _lastDownloaded;
  }
 
  @Override
  public boolean equals( Object o ) {
    Page p = (Page)o;
    return this._url.toString().equals(p.toString());
  }

  public boolean hasContent() {
    return !(_content == null || _content.isEmpty());
  }

  public String getKeywords() {
    return _keywords.toString();
  }

  public int getIndexAttempts() {
    return _indexAttempts;
  }
}
TOP

Related Classes of modelcrawler.Page

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.