long now = System.currentTimeMillis();
updateProgressInfo();
URL u = task.getUrl();
String urlString = u.toString();
String referer = task.getReferer();
int depth = task.getMaxDepth();
if (depth < 0) {
log.info("Max search depth reached");
return;
}
// we may need this additional check even if we
// tested it during adding to the tasks list
if (!isAllowed(u)) {
log.info("Url '" + u + "' filtered out.");
return;
}
if (u.getFile().equals("")) {
try {
urlString = urlString + "/";
u = new URL(urlString);
// fix for double retrieved files
task.setUrl(u);
} catch (MalformedURLException e) {
log.error("URL not well formed: " + e.toString());
// use exception handler to handle exception
exceptionHandler.handleException(this, u, e);
return;
}
}
log.info("retrieving " + urlString);
httpTool.setReferer(referer);
HttpDoc doc = null;
Vector links = null;
boolean cached = false;
// look in the cache first, but only for static pages
boolean reScan = true;
if ((docManager != null && allowCaching)
&& (task.getMethod() == HttpConstants.GET)
&& (task.getParamString() == null)) {
doc = docManager.retrieveFromCache(u);
/* if (doc != null) {
try {
links = ((UrlCollector) docManager).retrieveLinks(doc);
} catch (IOException e) {
log.info("Could not get links for " + u + ": " + e.getMessage());
links = null;
}
}*/
if (doc != null) {
countCache++;
long lastRetrieved = doc.getDateAsMilliSeconds();
double ageInSeconds = (now - lastRetrieved) / 1000;
if (ageInSeconds < 0) {
log.warn("DocumentAge < 0!");
}
reScan = maxDocumentAge >= 0 && ageInSeconds > maxDocumentAge;
if (reScan) {
long lastModified = doc.getLastModifiedAsMilliSeconds();
Date lastModifiedDate = new Date(lastModified);
httpTool.setIfModifiedSince(lastModifiedDate);
}
} else {
httpTool.setIfModifiedSince(null);
}
}
// if not found in cache, retrieve from the web page
if (reScan) {
HttpDoc newDoc;
boolean error = false;
try {
if (u.getProtocol().equalsIgnoreCase("file")) {
// retrieve from file
newDoc = retrieveFileURL(u, httpTool.getIfModifiedSince());
} else {
// retrieve from Web
newDoc = httpTool.retrieveDocument(u, task.getMethod(), task.getParamString());
if (newDoc != null) {
newDoc.setDate(now);
}
sleepNow();
}
if (newDoc!= null && !newDoc.isNotModified()) {
if (!(newDoc.isOk() || newDoc.isRedirect())) {
error = true;
}
} else {
// (newDoc == null || newDoc.isNotModified()) && doc != null
// -> Not modified
// -> refresh time stamp
if (doc != null) {
doc.setDate(now);
doc.setCached(false);
newDoc = null;
}
}
} catch (HttpException hex) {
error = true; newDoc = null;
}
if (error) {
int retry = task.retry();
if (retry <= maxRetries) {
synchronized(visited) {
todo.add(task);
visited.remove(task);
}
log.info("Adding " + u + " for retry no. " + retry);
return;
} else {
doc = docManager.retrieveFromCache(u);
if (doc == null) {
log.warn("Unsuccessfull retries for " + u);
return;
} else {
long docDate = doc.getDateAsMilliSeconds();
long age = (now - docDate);
age /= 1000;
if (expirationAge < 0 || age < expirationAge) {
newDoc = doc;
cached = true;
log.info("Cached document not expired: " + u);
} else {
log.warn("Cached document expired: " + u);
docManager.removeDocument(u);
return;
}
}
}
}
if (newDoc != null) {
countWeb++;
doc = newDoc;
links = null; // force recalculation of links
countRefresh++;
} else {
cached = true;
countNoRefresh++;
}
} else {
cached = true;
log.debug("Page " + u + " retrieved from cache");
}
// Add it to the visited vector
// needs to be synchronized with todo-list
// visited.add(task);
// a null document here means the document was not retrieved;
// usually it was not downloaded because a rule did not allow
// downloading it
if (doc == null) {
log.info("not downloaded " + u);
return;
}
// Duplicate check
String duplicate=null;
if (duplicateCheck) {
duplicate = getContentVisitedURL(doc);
if (duplicate != null) {
log.info("URLs with same content found: " + urlString + " = " + duplicate);
} else {
try {
duplicate = docManager.findDuplicate(doc);
if (duplicate != null) {
log.info("URLs with same content found in cache: " + urlString + " = " + duplicate);
}
} catch (IOException e) {
e.printStackTrace();
}
}
if (duplicate != null) {
String pureDuplicate = removeParameters(duplicate);
String pureUrl = removeParameters(urlString);
if (!pureUrl.equals(pureDuplicate) && !cached) {
// different url not yet stored -> store it
try {
// retrieve links from original
HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
if (linksDoc != null) {
doc.setLinks(linksDoc.getLinks());
}
docManager.storeDocument(doc);
} catch (Exception e) {
e.printStackTrace();
}
}
RobotTask newTask;
try {
newTask = createRobotTask(new URL(duplicate), depth, referer);
// check already here for visited tasks to save memory
if (!visited.contains(newTask)) {
addTask(newTask);
}
} catch (MalformedURLException e) {
e.printStackTrace(); // Can't happen
}
return;
}
}
// was it an UnAuthorized document ?
if (doc.isUnauthorized()) {
log.info("got HTTP Unauthorized for URL " + u);
}
if (doc.isOk() || cached) {
// callback
if (webRobotCallback != null) {
int contentLength=0;
if (doc.getContent() != null) { contentLength=doc.getContent().length; }
webRobotCallback.webRobotRetrievedDoc(urlString, contentLength);
}
// extract links
try {
if (doc.isHTML() && (depth > 0)) {
// solving encoding problem
// HtmlDocument htmlDoc = new HtmlDocument(u, doc.getContent());
HtmlDocument htmlDoc = null;
HttpHeader contentTypeHeader = doc.getHeader("Content-type");
if (contentTypeHeader != null) {
String contentType = contentTypeHeader.getValue();
int index = contentType.toLowerCase().indexOf("charset=");
if (index > 0) {
htmlDoc = new HtmlDocument(u, doc.getContent(), contentType.substring(index+8));
} else {
htmlDoc = new HtmlDocument(u, doc.getContent());
}
} else {
htmlDoc = new HtmlDocument(u, doc.getContent());
}
// add links
// this depth-check is critical!
// otherwise far too many RobotTasks will be created
// this will cause a premature OutOfMemoryException!
if (depth > 0) {
if (duplicate != null) {
HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
doc.setLinks(linksDoc.getLinks());
} else if (cached) {
}
if (links == null) {
links = htmlDoc.getLinks();
doc.setLinks(links);
}
if (duplicate == null) {
HashSet checkedLinks = new HashSet();
for (int i = 0; i < links.size(); i++) {
URL link = (URL) links.elementAt(i);
log.info("Link: "+link);
// check already here for duplicate links to avoid expensive
// creation of RobotTasks
if (!checkedLinks.contains(link)) {
checkedLinks.add(link);
String myReferer = u.toString();
if (u.getUserInfo() != null) {
// remove userinfo from referer
int endindex = myReferer.indexOf("@")+1;
myReferer = "http://"+ myReferer.substring(endindex);
}
RobotTask newTask = createRobotTask((URL) links.elementAt(i), depth - 1, myReferer);
// check already here for visited tasks to save memory
if (!visited.contains(newTask)) {
// bad workaround to retrieve images first
if (newTask.urlString.endsWith(".jpg")) {
addTaskAtStart(newTask);
} else {
addTask(newTask);
}
}
}
}
}
}
if (hasFormHandlers) {
// add forms
Vector forms = htmlDoc.getElements("form");
for (int i = 0; i < forms.size(); i++) {
ExtendedURL eurl = formFiller.fillForm(u, (Element) forms.elementAt(i));
if (eurl != null) {
RobotTask newTask = createRobotTask(eurl.getURL(), depth - 1, u.toString());
newTask.setParamString(eurl.getParams());
newTask.setMethod(eurl.getRequestMethod());
addTask(newTask);
}
}
}
}
// catch any error that occurs so that processing can continue
} catch (OutOfMemoryError e) {
throw e;
} catch (Throwable e){
log.error("Unexpected error while extraction links from url '" + u + "':"+e);
e.printStackTrace();
// continue processing
}
// filter and store the document
if ((docManager != null)) {
try {
if (filters != null) {
doc = filters.process(doc);
} else {
log.debug("No filters defined");
}
if (isProcessingAllowed(doc)) {
docManager.processDocument(doc);
} else {
String md5 = doc.getHeaderValue(HttpHeader.CONTENT_MD5);
doc.setContent("Not for indexing".getBytes());
doc.setHeaderValue(HttpHeader.CONTENT_MD5, md5);
}
try {
docManager.storeDocument(doc);
} catch (Exception e) {
log.warn("could not store (not for indexing) " + urlString + ": " + e.getMessage());
}
if (activatedContentHistory && duplicate==null) {
setContentVisitedURL(doc, urlString);
}
} catch (DocManagerException e1) {
log.error("could not process document: " + e1.getMessage());
exceptionHandler.handleException(this, u, e1);
} catch (FilterException e2) {
log.error(e2.getMessage());
}
}
} else {
// it was NOT a 200 return code !
if (doc.isRedirect()) {
String ref = doc.getLocation();
log.info("Got redirect to " + ref);
try {
URL u2 = new URL(u, ref);
// is it on another host ?
// On a redirect, browsers use the old Referer instead of the
// URL that got this redirect
// Therefore we do not use u.toString as Referer but the old Referer