// extract links
try {
if (doc.isHTML() && (depth > 0)) {
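// Work around encoding problems: if the Content-type header declares a
// charset, pass it to HtmlDocument; otherwise use the default constructor.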
// HtmlDocument htmlDoc = new HtmlDocument(u, doc.getContent());
HtmlDocument htmlDoc = null;
HttpHeader contentTypeHeader = doc.getHeader("Content-type");
if (contentTypeHeader != null) {
String contentType = contentTypeHeader.getValue();
int index = contentType.toLowerCase().indexOf("charset=");
if (index > 0) {
htmlDoc = new HtmlDocument(u, doc.getContent(), contentType.substring(index+8));
} else {
htmlDoc = new HtmlDocument(u, doc.getContent());
}
} else {
htmlDoc = new HtmlDocument(u, doc.getContent());
}
// add links
// This depth check is critical: without it far too many RobotTasks
// would be created, which would cause a premature OutOfMemoryError.
if (depth > 0) {
if (duplicate != null) {
HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
doc.setLinks(linksDoc.getLinks());
} else if (cached) {
}
if (links == null) {
links = htmlDoc.getLinks();
doc.setLinks(links);
}
if (duplicate == null) {
HashSet checkedLinks = new HashSet();
for (int i = 0; i < links.size(); i++) {
URL link = (URL) links.elementAt(i);
log.info("Link: "+link);
// check already here for duplicate links to avoid expensive
// creation of RobotTasks
if (!checkedLinks.contains(link)) {
checkedLinks.add(link);
String myReferer = u.toString();
if (u.getUserInfo() != null) {
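// remove userinfo from referer (everything up to and including the first "@")
// NOTE(review): the scheme is rebuilt as "http://" regardless of the
// original URL's protocol - confirm this is intended for non-http URLs.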
int endindex = myReferer.indexOf("@")+1;
myReferer = "http://"+ myReferer.substring(endindex);
}
RobotTask newTask = createRobotTask((URL) links.elementAt(i), depth - 1, myReferer);
// check already here for visited tasks to save memory
if (!visited.contains(newTask)) {
// bad workaround to retrieve images first
if (newTask.urlString.endsWith(".jpg")) {
addTaskAtStart(newTask);
} else {
addTask(newTask);
}
}
}
}
}
}
if (hasFormHandlers) {
// add forms
Vector forms = htmlDoc.getElements("form");
for (int i = 0; i < forms.size(); i++) {
ExtendedURL eurl = formFiller.fillForm(u, (Element) forms.elementAt(i));
if (eurl != null) {
RobotTask newTask = createRobotTask(eurl.getURL(), depth - 1, u.toString());
newTask.setParamString(eurl.getParams());