movedToUrl = URLCanonicalizer.getCanonicalURL(movedToUrl);
int newdocid = DocIDServer.getDocID(movedToUrl);
if (newdocid > 0) {
return PageFetchStatus.RedirectedPageIsSeen;
} else {
WebURL webURL = new WebURL();
webURL.setURL(movedToUrl);
webURL.setParentDocid(curURL.getParentDocid());
webURL.setDepth((short) (curURL.getDepth()));
webURL.setDocid(-1);
if (shouldVisit(webURL) && RobotstxtServer.allows(webURL)) {
webURL.setDocid(DocIDServer.getNewDocID(movedToUrl));
Frontier.schedule(webURL);
}
}
}
return PageFetchStatus.Moved;
} else if (statusCode == PageFetchStatus.PageTooBig) {
logger.error("Page was bigger than max allowed size: " + curURL.getURL());
}
return statusCode;
}
try {
if (!page.isBinary()) {
htmlParser.parse(page.getHTML(), curURL.getURL());
page.setText(htmlParser.getText());
page.setTitle(htmlParser.getTitle());
if (page.getText() == null) {
return PageFetchStatus.NotInTextFormat;
}
Iterator<String> it = htmlParser.getLinks().iterator();
List<WebURL> toSchedule = new ArrayList<WebURL>();
List<WebURL> toList = new ArrayList<WebURL>();
while (it.hasNext()) {
String url = it.next();
if (url != null) {
int newdocid = DocIDServer.getDocID(url);
if (newdocid > 0) {
if (newdocid != docid) {
WebURL webURL = new WebURL();
webURL.setURL(url);
webURL.setDocid(newdocid);
toList.add(webURL);
}
} else {
WebURL webURL = new WebURL();
webURL.setURL(url);
webURL.setDocid(-1);
webURL.setParentDocid(docid);
webURL.setDepth((short) (curURL.getDepth() + 1));
if (shouldVisit(webURL) && RobotstxtServer.allows(webURL)) {
if (MAX_CRAWL_DEPTH == -1 || curURL.getDepth() < MAX_CRAWL_DEPTH) {
webURL.setDocid(DocIDServer.getNewDocID(url));
toSchedule.add(webURL);
toList.add(webURL);
}
}
}