}
}
// Using HTMLParser to extract the content
String cleanedContent = null;
Page htmlPage = new Page(cuttedContent, "UTF-8");
Parser parser = new Parser(new Lexer(htmlPage));
StringBean stringBean = new StringBean();
// Collapse runs of whitespace into a single space
stringBean.setCollapse(true);
// Do not extract URLs
stringBean.setLinks(false);
// Replace non-breaking spaces with regular spaces
stringBean.setReplaceNonBreakingSpaces(true);
try {
// Parse the content
parser.visitAllNodesWith(stringBean);
cleanedContent = stringBean.getStrings();
} catch (ParserException ex) {
throw new RegainException("Error while parsing content: ", ex);
}
// The result of parsing the html-content
setCleanedContent(cleanedContent);
// Extract links
LinkVisitor linkVisitor = new LinkVisitor();
if (isContentCutted) {
// This requires a new parser run, which is expensive but necessary
htmlPage = new Page(rawDocument.getContentAsString(), "UTF-8");
parser = new Parser(new Lexer(htmlPage));
} else {
parser.reset();
}
try {
// Parse the content
parser.visitAllNodesWith(linkVisitor);
ArrayList<Tag> links = linkVisitor.getLinks();
htmlPage.setBaseUrl(rawDocument.getUrl());
// Iterate over all links found
Iterator linksIter = links.iterator();
while (linksIter.hasNext()) {
LinkTag currTag = ((LinkTag) linksIter.next());