* @param curi CrawlURI to process.
*/
protected boolean innerExtract(CrawlURI curi){
    // Contract: returns true once the document text has been obtained and
    // scanned with PATTERN (even when zero links matched); returns false
    // when the recorder yields no content stream or when obtaining the
    // text throws, in which case the exception is added to the URI's
    // non-fatal failures.
    int links = 0;
    ReplayInputStream documentStream = null;
    try {
        SeekReader docReader = null;
        InputStream contentStream = null;
        // Get the doc as a repositionable reader
        try {
            contentStream = curi.getRecorder().getContentReplayInputStream();
            if (contentStream == null) {
                // TODO: note problem
                return false;
            }
            documentStream = new ReplayInputStream(contentStream);
            docReader = Doc.getText(documentStream);
        } catch (Exception e) {
            curi.getNonFatalFailures().add(e);
            return false;
        } finally {
            // The raw content stream is not needed once the text reader
            // has been built (or has failed to build).
            IOUtils.closeQuietly(contentStream);
        }

        // Scan the extracted text; every match contributes one link taken
        // from capture group 1.
        CharSequence cs = new SeekReaderCharSequence(docReader, 0);
        Matcher m = PATTERN.matcher(cs);
        while (m.find()) {
            links++;
            addLink(curi, m.group(1));
        }
    } finally {
        // Destroy the replay stream on every exit path, not only on
        // success: previously a failure in Doc.getText() or an unchecked
        // exception during matching skipped destroy() entirely.
        // NOTE(review): destroy() presumably releases the stream's backing
        // storage -- confirm against ReplayInputStream.
        if (documentStream != null) {
            documentStream.destroy();
        }
    }
    logger.fine(curi + " has " + links + " links.");
    return true;
}