/**
*
*/
package modelcrawler;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
/**
 * A single web page handled by the crawler.
 *
 * <p>A page holds its URL, the plain text extracted by the last successful
 * {@link #download()}, the pages it links to, the foreign domains those links
 * point at, and the keywords declared in its {@code META name="keywords"} tag.
 *
 * <p>If the URL string passed to a constructor is malformed, the stack trace is
 * printed and the page is left without a URL; such a page never downloads,
 * prints as the empty string and is only equal to itself.
 *
 * @author Niall
 */
public class Page {
    private Crawler _crawler = null;
    private URL _url = null;
    private String _content = null;
    private List<Page> _links = null;
    private List<String> _domains = null;
    private List<String> _keywords = null;
    private Date _lastDownloaded = null;
    private int _downloadAttempts = 0;
    private int _indexAttempts = 0;

    /**
     * Creates a page that is not attached to any crawler.
     *
     * @param url the absolute URL of the page
     */
    public Page(String url) {
        this(null, url);
    }

    /**
     * Creates a page attached to a crawler. Every link discovered by
     * {@link #download()} is handed to the crawler via
     * {@code addForDownload}.
     *
     * @param c   the owning crawler, or {@code null} for a stand-alone page
     * @param url the absolute URL of the page
     */
    public Page(Crawler c, String url) {
        _crawler = c;
        try {
            _url = new URL(url);
        } catch (MalformedURLException e) {
            // Kept as best-effort for backward compatibility: the page is
            // still constructed, but without a URL.
            e.printStackTrace();
        }
        _links = new ArrayList<Page>();
        _domains = new ArrayList<String>();
        _keywords = new ArrayList<String>();
    }

    /**
     * Fetches the page, stores its plain-text content and collects its
     * outgoing links and META keywords.
     *
     * <p>Links and keywords gathered by a previous download are discarded
     * first. A link whose URL cannot be parsed is skipped; it does not fail
     * the whole download.
     *
     * @return {@code true} if the page was fetched and parsed, {@code false}
     *         if the page has no valid URL or the parser reported an error
     */
    public boolean download() {
        _downloadAttempts++;
        if (_url == null) {
            return false;
        }
        try {
            // Configure the shared connection manager before the parser is
            // created, so the settings are already in place for this page's
            // own connection.
            Parser.getConnectionManager().setRedirectionProcessingEnabled(true);
            Parser.getConnectionManager().setCookieProcessingEnabled(true);
            Parser par = new Parser(_url.toString());

            StringBuilder text = new StringBuilder();
            for (NodeIterator e = par.elements(); e.hasMoreNodes();) {
                try {
                    text.append(e.nextNode().toPlainTextString());
                } catch (EncodingChangeException ignored) {
                    // Best-effort: keep the text collected so far and carry on.
                    // NOTE(review): the parser may expect a reset() and a fresh
                    // pass after an encoding change - confirm.
                }
            }
            _lastDownloaded = new Date();
            System.out.println("Page Downloaded: " + _url.toString());
            setContent(text.toString());

            // Second pass over the same document, this time for tags only.
            par.reset();
            _links.clear();
            _keywords.clear();
            NodeFilter filter = new OrFilter(new NodeFilter[] {
                new TagNameFilter("A"),
                new TagNameFilter("META")
            });
            NodeList list = par.parse(filter);
            System.out.println("Url: " + _url.toString() + " found " + list.size() + " items.");

            for (SimpleNodeIterator e = list.elements(); e.hasMoreNodes();) {
                Node n = e.nextNode();
                if (n.getClass() == LinkTag.class) {
                    collectLink(par, (LinkTag) n);
                } else if (n.getClass() == MetaTag.class) {
                    collectKeywords((MetaTag) n);
                }
            }
            return true;
        } catch (ParserException e) {
            e.printStackTrace();
            return false;
        }
    }

    /** Registers the target of one anchor tag as a linked page, if its file type is allowed. */
    private void collectLink(Parser par, LinkTag lt) {
        if (!FileTypes.isAllowedFileType(lt.getLink())) {
            return;
        }
        String url = par.getLexer().getPage().getAbsoluteURL(lt.getLink());
        URL target;
        try {
            target = new URL(url);
        } catch (MalformedURLException e1) {
            // One unparsable link must not discard the rest of the page.
            e1.printStackTrace();
            return;
        }
        Page p = new Page(target.toString());
        if (_crawler != null) {
            _crawler.addForDownload(p);
        }
        _links.add(p);
        addDomain(domainOf(target));
    }

    /** Adds the comma-separated entries of a {@code META name="keywords"} tag to the keyword list. */
    private void collectKeywords(MetaTag t) {
        String name = t.getAttribute("name");
        if (name == null || !name.equalsIgnoreCase("keywords")) {
            return;
        }
        String content = t.getAttribute("content");
        if (content == null) {
            // A keywords tag without a content attribute carries nothing.
            return;
        }
        for (String kw : content.split(",")) {
            String trimmed = kw.trim();
            if (!trimmed.isEmpty()) {
                _keywords.add(trimmed);
            }
        }
    }

    /**
     * Builds the {@code protocol://authority} prefix of a URL. The path is
     * deliberately not removed with a regex replace: a path is not a valid
     * pattern in general, and a path of "/" would strip every slash.
     */
    private static String domainOf(URL u) {
        String authority = u.getAuthority();
        if (authority == null || authority.isEmpty()) {
            return u.getProtocol() + ":";
        }
        return u.getProtocol() + "://" + authority;
    }

    /** Records a foreign domain once; the page's own domain and repeats are skipped. */
    private void addDomain(String domain) {
        if (!_domains.contains(domain) && !domain.equals(getUrlDomain())) {
            _domains.add(domain);
        } else {
            System.out.println("Duplicate Domain excluded from list");
        }
    }

    /** @return the pages linked from this page, as found by the last download (live list) */
    public List<Page> getPages() {
        return _links;
    }

    /** @return the foreign domains linked from this page (live list) */
    public List<String> getDomains() {
        return _domains;
    }

    /**
     * @return the {@code protocol://authority} part of this page's URL, or the
     *         empty string if the page has no valid URL
     */
    public String getUrlDomain() {
        if (_url == null) {
            return "";
        }
        return domainOf(_url);
    }

    /** @param _content the plain-text content to store for this page */
    public void setContent(String _content) {
        this._content = _content;
    }

    /** @return the stored plain-text content, or {@code null} if none was set */
    public String getContent() {
        return _content;
    }

    /**
     * @return all keywords joined into one string, each keyword followed by a
     *         comma (so a non-empty result ends with a trailing comma)
     */
    public String getIndex() {
        StringBuilder sb = new StringBuilder();
        for (String s : _keywords) {
            sb.append(s).append(',');
        }
        return sb.toString();
    }

    /** @return the page URL as a string, or the empty string if the URL was malformed */
    @Override
    public String toString() {
        return _url == null ? "" : _url.toString();
    }

    /** @return the time of the last successful fetch, or {@code null} if never downloaded */
    public Date getLastDownloaded() {
        return _lastDownloaded;
    }

    /**
     * Two pages are equal when their URL strings are equal. A page without a
     * valid URL is equal only to itself.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof Page)) {
            return false;
        }
        Page other = (Page) o;
        if (_url == null || other._url == null) {
            return false;
        }
        return _url.toString().equals(other._url.toString());
    }

    /** Hash code consistent with {@link #equals(Object)}: derived from the URL string. */
    @Override
    public int hashCode() {
        return _url == null ? 0 : _url.toString().hashCode();
    }

    /** @return {@code true} if non-empty content has been stored */
    public boolean hasContent() {
        return _content != null && !_content.isEmpty();
    }

    /** @return the keyword list in {@code List.toString()} form, e.g. {@code [a, b]} */
    public String getKeywords() {
        return _keywords.toString();
    }

    /** @return the value of the index-attempt counter */
    public int getIndexAttempts() {
        return _indexAttempts;
    }
}