continue;
}
}
// Create a raw document
RawDocument rawDocument;
try {
rawDocument = new RawDocument(url, mCurrentJob.getSourceUrl(),
mCurrentJob.getSourceLinkText(),
CrawlerToolkit.findAuthenticationValuesForURL(url, accountPasswordStore));
} catch (RedirectException exc) {
String redirectUrl = exc.getRedirectUrl();
mLog.info("Redirect '" + url + "' -> '" + redirectUrl + "'");
mUrlChecker.setIgnored(url);
// The redirect URL inherits the shouldBeParsed and shouldBeIndexed properties from the
// source URL. This may not be correct according to the definitions in the whitelist.
addJob(redirectUrl, mCurrentJob.getSourceUrl(), shouldBeParsed,
shouldBeIndexed, mCurrentJob.getSourceLinkText());
mCrawlerJobProfiler.stopMeasuring(0);
continue;
}
catch (RegainException exc) {
// Check whether the exception was caused by a dead link
handleDocumentLoadingException(exc, mCurrentJob);
// This document does not exist -> We can't parse or index anything
// -> continue
mCrawlerJobProfiler.abortMeasuring();
continue;
}
if( shouldBeIndexed || shouldBeParsed ){
if (mLog.isDebugEnabled()) {
mLog.debug("Parsing and indexing " + rawDocument.getUrl());
}
mHtmlParsingProfiler.startMeasuring();
// Parse and index content and metadata
if (shouldBeIndexed) {
try {
mIndexWriterManager.addToIndex(rawDocument, this);
}
catch (RegainException exc) {
logError("Indexing failed for: " + rawDocument.getUrl(), exc, false);
}
}
// Extract links from the document (parse=true). The real meaning of "parse" in this context
// is link extraction. The document is parsed anyway (building an HTML node tree).
if (shouldBeParsed) {
if(!shouldBeIndexed){
// Indexing was skipped, so the document has presumably not been parsed yet -> parse it now
mIndexWriterManager.getDocumentFactory().createDocument(rawDocument, this);
}
try {
//parseHtmlDocument(rawDocument);
createCrawlerJobs(rawDocument);
}
catch (RegainException exc) {
logError("CrawlerJob creation failed for: " + rawDocument.getUrl(), exc, false);
}
}
mHtmlParsingProfiler.stopMeasuring(rawDocument.getLength());
}
// Release the system resources held by the RawDocument.
rawDocument.dispose();
// Stop the time measurement.
mCrawlerJobProfiler.stopMeasuring(rawDocument.getLength());
mCurrentJob = null;
// Check whether to create a breakpoint
int breakpointInterval = mConfiguration.getBreakpointInterval();
boolean breakpointIntervalIsOver = (breakpointInterval > 0)