// Extract the title from the raw HTML and store it via setTitle
String title = extractHtmlTitle(rawDocument.getContentAsString());
setTitle(title);
// Pick the content extractor that is responsible for this document.
// Every configured extractor is asked; if several accept the document,
// the one that comes last in the array is used. contentExtractor stays
// null when no extractor is configured or none accepts the document.
HtmlContentExtractor contentExtractor = null;
if (mContentExtractorArr != null) {
  int idx = 0;
  while (idx < mContentExtractorArr.length) {
    if (mContentExtractorArr[idx].accepts(rawDocument)) {
      contentExtractor = mContentExtractorArr[idx];
    }
    idx++;
  }
}
// Cut the content and extract the headlines
String cuttedContent;
String headlines;
boolean isContentCutted = false;
if (contentExtractor != null) {
  // A responsible extractor exists: let it cut the content, then pull the
  // headlines out of the cut content
  cuttedContent = contentExtractor.extractContent(rawDocument);
  headlines = contentExtractor.extractHeadlines(cuttedContent);
  // The content counts as cut only when the extractor's result differs
  // from the complete document text
  isContentCutted = !cuttedContent.equals(rawDocument.getContentAsString());
} else {
  // No HtmlContentExtractor is responsible for this document:
  // use the whole document as content and provide no headlines
  if (mLog.isDebugEnabled()) {
    mLog.debug("No HTML content extractor is responsible for " + rawDocument.getUrl());
  }
  cuttedContent = rawDocument.getContentAsString();
  headlines = null;
}