public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) {
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
final String href = tagopts.getProperty("href", "");
MultiProtocolURI url;
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
final String f = url.getFile();
final int p = f.lastIndexOf('.');
final String type = (p < 0) ? "" : f.substring(p + 1);
if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg") || type.equals("tiff") || type.equals("tif")) {
// special handling of such urls: put them to the image urls
final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1);
addImage(this.images, ie);
} else {
tagopts.put("name", recursiveParse(text));
this.anchors.put(url, tagopts);
}
}
this.evaluationScores.match(Element.apath, href);
}
final String h;
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.headlines[0].add(h);
} else if((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.headlines[1].add(h);
} else if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.headlines[2].add(h);
} else if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.headlines[3].add(h);
} else if ((tagname.equalsIgnoreCase("h5")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.headlines[4].add(h);
} else if ((tagname.equalsIgnoreCase("h6")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.headlines[5].add(h);
} else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
this.title = recursiveParse(text);
} else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.bold.inc(h);
} else if ((tagname.equalsIgnoreCase("strong")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.bold.inc(h);
} else if ((tagname.equalsIgnoreCase("i")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.italic.inc(h);
} else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.li.add(h);
} else if (tagname.equalsIgnoreCase("iframe")) {
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", ""));
this.anchors.put(src, tagopts /* with property "name" */);
this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true, false));
} else if (tagname.equalsIgnoreCase("script")) {
final String src = tagopts.getProperty("src", "");
if (src.length() > 0) {
this.script.add(absolutePath(src));
this.evaluationScores.match(Element.scriptpath, src);
} else {
this.evaluationScores.match(Element.scriptcode, text);
}