addSolr(solrdoc, "lat_coordinate", yacydoc.lat());
}
addSolr(solrdoc, "httpstatus_i", 200);
final Object parser = yacydoc.getParserObject();
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
// header tags
int h = 0;
int f = 1;
for (int i = 1; i <= 6; i++) {
final String[] hs = html.getHeadlines(i);
h = h | (hs.length > 0 ? f : 0);
f = f * 2;
addSolr(solrdoc, "attr_h" + i, hs);
}
addSolr(solrdoc, "htags_i", h);
// canonical tag
if (html.getCanonical() != null) addSolr(solrdoc, "canonical_s", html.getCanonical().toNormalform(false, false));
// noindex and nofollow attributes
// from HTML (meta-tag in HTML header: robots)
// and HTTP header (x-robots property)
// coded as binary value:
// bit 0: "all" contained in html header meta
// bit 1: "index" contained in html header meta
// bit 2: "noindex" contained in html header meta
// bit 3: "nofollow" contained in html header meta
// bit 8: "noarchive" contained in http header properties
// bit 9: "nosnippet" contained in http header properties
// bit 10: "noindex" contained in http header properties
// bit 11: "nofollow" contained in http header properties
// bit 12: "unavailable_after" contained in http header properties
int b = 0;
final String robots_meta = html.getMetas().get("robots");
// this tag may have values: all, index, noindex, nofollow
if (robots_meta != null) {
if (robots_meta.indexOf("all",0) >= 0) b += 1; // set bit 0
if (robots_meta.indexOf("index",0) == 0 || robots_meta.indexOf(" index",0) >= 0 || robots_meta.indexOf(",index",0) >= 0 ) b += 2; // set bit 1
if (robots_meta.indexOf("noindex",0) >= 0) b += 4; // set bit 2
if (robots_meta.indexOf("nofollow",0) >= 0) b += 8; // set bit 3
}
String x_robots_tag = header.get(HeaderFramework.X_ROBOTS_TAG, "");
if (x_robots_tag.length() == 0) x_robots_tag = header.get(HeaderFramework.X_ROBOTS, "");
// this tag may have values: noarchive, nosnippet, noindex, nofollow, unavailable_after
if (x_robots_tag.length() > 0) {
if (x_robots_tag.indexOf("noarchive",0) >= 0) b += 256; // set bit 8
if (x_robots_tag.indexOf("nosnippet",0) >= 0) b += 512; // set bit 9
if (x_robots_tag.indexOf("noindex",0) >= 0) b += 1024; // set bit 10
if (x_robots_tag.indexOf("nofollow",0) >= 0) b += 2048; // set bit 11
if (x_robots_tag.indexOf("unavailable_after",0) >=0) b += 4096; // set bit 12
}
addSolr(solrdoc, "robots_i", b);
// meta tags: generator
final String generator = html.getMetas().get("generator");
if (generator != null) addSolr(solrdoc, "metagenerator_t", generator);
// bold, italic
final String[] bold = html.getBold();
addSolr(solrdoc, "boldcount_i", bold.length);
if (bold.length > 0) {
addSolr(solrdoc, "attr_bold", bold);
if (isEmpty() || contains("attr_boldcount")) {
addSolr(solrdoc, "attr_boldcount", html.getBoldCount(bold));
}
}
final String[] italic = html.getItalic();
addSolr(solrdoc, "italiccount_i", italic.length);
if (italic.length > 0) {
addSolr(solrdoc, "attr_italic", italic);
if (isEmpty() || contains("attr_italiccount")) {
addSolr(solrdoc, "attr_italiccount", html.getItalicCount(italic));
}
}
final String[] li = html.getLi();
addSolr(solrdoc, "licount_i", li.length);
if (li.length > 0) addSolr(solrdoc, "attr_li", li);
// images
final Collection<ImageEntry> imagesc = html.getImages().values();
final String[] imgtags = new String[imagesc.size()];
final String[] imgprots = new String[imagesc.size()];
final String[] imgstubs = new String[imagesc.size()];
final String[] imgalts = new String[imagesc.size()];
c = 0;
for (final ImageEntry ie: imagesc) {
final MultiProtocolURI uri = ie.url();
imgtags[c] = ie.toString();
imgprots[c] = uri.getProtocol();
imgstubs[c] = uri.toString().substring(imgprots[c].length() + 3);
imgalts[c] = ie.alt();
c++;
}
addSolr(solrdoc, "imagescount_i", imgtags.length);
if (isEmpty() || contains("attr_images_tag")) addSolr(solrdoc, "attr_images_tag", imgtags);
if (isEmpty() || contains("attr_images_protocol")) addSolr(solrdoc, "attr_images_protocol", imgprots);
if (isEmpty() || contains("attr_images_urlstub")) addSolr(solrdoc, "attr_images_urlstub", imgstubs);
if (isEmpty() || contains("attr_images_alt")) addSolr(solrdoc, "attr_images_alt", imgalts);
// style sheets
if (isEmpty() || contains("attr_css")) {
final Map<MultiProtocolURI, String> csss = html.getCSS();
final String[] css_tag = new String[csss.size()];
final String[] css_url = new String[csss.size()];
c = 0;
for (final Map.Entry<MultiProtocolURI, String> entry: csss.entrySet()) {
final String url = entry.getKey().toNormalform(false, false, false, false);
css_tag[c] =
"<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\"" +
" href=\""+ url + "\" />";
css_url[c] = url;
c++;
}
addSolr(solrdoc, "csscount_i", css_tag.length);
if (css_tag.length > 0) addSolr(solrdoc, "attr_css_tag", css_tag);
if (css_url.length > 0) addSolr(solrdoc, "attr_css_url", css_url);
}
// Scripts
if (isEmpty() || contains("attr_scripts")) {
final Set<MultiProtocolURI> scriptss = html.getScript();
final String[] scripts = new String[scriptss.size()];
c = 0;
for (final MultiProtocolURI url: scriptss) {
scripts[c++] = url.toNormalform(false, false, false, false);
}
addSolr(solrdoc, "scriptscount_i", scripts.length);
if (scripts.length > 0) addSolr(solrdoc, "attr_scripts", scripts);
}
// Frames
if (isEmpty() || contains("attr_frames")) {
final Set<MultiProtocolURI> framess = html.getFrames();
final String[] frames = new String[framess.size()];
c = 0;
for (final MultiProtocolURI entry: framess) {
frames[c++] = entry.toNormalform(false, false, false, false);
}
addSolr(solrdoc, "framesscount_i", frames.length);
if (frames.length > 0) addSolr(solrdoc, "attr_frames", frames);
}
// IFrames
if (isEmpty() || contains("attr_iframes")) {
final Set<MultiProtocolURI> iframess = html.getIFrames();
final String[] iframes = new String[iframess.size()];
c = 0;
for (final MultiProtocolURI entry: iframess) {
iframes[c++] = entry.toNormalform(false, false, false, false);
}
addSolr(solrdoc, "iframesscount_i", iframes.length);
if (iframes.length > 0) addSolr(solrdoc, "attr_iframes", iframes);
}
// flash embedded
addSolr(solrdoc, "flash_b", html.containsFlash());
// generic evaluation pattern
for (final String model: html.getEvaluationModelNames()) {
if (isEmpty() || contains("attr_" + model)) {
final String[] scorenames = html.getEvaluationModelScoreNames(model);
if (scorenames.length > 0) {
addSolr(solrdoc, "attr_" + model, scorenames);
addSolr(solrdoc, "attr_" + model + "count", html.getEvaluationModelScoreCounts(model, scorenames));
}
}
}
// response time