}
protected void processGeneralTag(CrawlURI curi, Element element,
Attributes attributes) {
Attribute attr;
String attrValue;
List<Attribute> attrList;
String elementName = element.getName();
// Just in case it's an OBJECT or APPLET tag
String codebase = null;
ArrayList<String> resources = null;
final boolean framesAsEmbeds = getTreatFramesAsEmbedLinks();
final boolean ignoreFormActions = getIgnoreFormActionUrls();
final boolean overlyEagerLinkDetection = getExtractValueAttributes();
// HREF
if (((attr = attributes.get("href")) != null) &&
((attrValue = attr.getValue()) != null)) {
CharSequence context = elementContext(elementName, attr
.getKey());
if ("link".equals(elementName)) {
// <LINK> elements treated as embeds (css, ico, etc)
processEmbed(curi, attrValue, context);
} else {
// other HREFs treated as links
processLink(curi, attrValue, context);
}
if ("base".equals(elementName)) {
try {
UURI base = UURIFactory.getInstance(attrValue);
curi.setBaseURI(base);
} catch (URIException e) {
logUriError(e, curi.getUURI(), attrValue);
}
}
}
// ACTION
if (((attr = attributes.get("action")) != null) &&
((attrValue = attr.getValue()) != null)) {
if (!ignoreFormActions) {
CharSequence context = elementContext(elementName, attr
.getKey());
processLink(curi, attrValue, context);
}
}
// ON_
if ((attrList = findOnAttributes(attributes)).size() != 0) {
for (Iterator<Attribute> attrIter = attrList.iterator(); attrIter.hasNext();) {
attr = (Attribute) attrIter.next();
CharSequence valueSegment = attr.getValueSegment();
if (valueSegment != null)
processScriptCode(curi, valueSegment);
}
}
// SRC etc.
if ((((attr = attributes.get("src")) != null)
|| ((attr = attributes.get("lowsrc")) != null)
|| ((attr = attributes.get("background")) != null)
|| ((attr = attributes.get("cite")) != null)
|| ((attr = attributes.get("longdesc")) != null)
|| ((attr = attributes.get("usemap")) != null)
|| ((attr = attributes.get("profile")) != null)
|| ((attr = attributes.get("datasrc")) != null)) &&
((attrValue = attr.getValue()) != null)) {
final Hop hopType;
CharSequence context = elementContext(elementName, attr.getKey());
if (!framesAsEmbeds
&& ("frame".equals(elementName) || "iframe"
.equals(elementName)))
hopType = Hop.NAVLINK;
else
hopType = Hop.EMBED;
processEmbed(curi, attrValue, context, hopType);
}
// CODEBASE
if (((attr = attributes.get("codebase")) != null) &&
((attrValue = attr.getValue()) != null)) {
codebase = StringEscapeUtils.unescapeHtml(attrValue);
CharSequence context = elementContext(elementName, attr.getKey());
processEmbed(curi, codebase, context);
}
// CLASSID DATA
if ((((attr = attributes.get("classid")) != null)
|| ((attr = attributes.get("data")) != null)) &&
((attrValue = attr.getValue()) != null)) {
if (resources == null)
resources = new ArrayList<String>();
resources.add(attrValue);
}
// ARCHIVE
if (((attr = attributes.get("archive")) != null) &&
((attrValue = attr.getValue()) != null)) {
if (resources == null)
resources = new ArrayList<String>();
String[] multi = TextUtils.split(WHITESPACE, attrValue);
for (int i = 0; i < multi.length; i++) {
resources.add(multi[i]);
}
}
// CODE
if (((attr = attributes.get("code")) != null) &&
((attrValue = attr.getValue()) != null)) {
if (resources == null)
resources = new ArrayList<String>();
// If element is applet and code value does not end with
// '.class' then append '.class' to the code value.
if (APPLET.equals(elementName) && !attrValue.endsWith(CLASSEXT)) {
resources.add(attrValue + CLASSEXT);
} else {
resources.add(attrValue);
}
}
// VALUE
if (((attr = attributes.get("value")) != null) &&
((attrValue = attr.getValue()) != null)) {
CharSequence valueContext = elementContext(elementName, attr.getKey());
if("PARAM".equalsIgnoreCase(elementName)
&& "flashvars".equalsIgnoreCase(attributes.get("name").getValue())) {
// special handling for <PARAM NAME="flashvars" VALUE="...">
String queryStringLike = attrValue.toString();
// treat value as query-string-like "key=value[;key=value]*" pairings
considerQueryStringValues(curi, queryStringLike, valueContext,Hop.SPECULATIVE);
} else {
// regular VALUE handling
if (overlyEagerLinkDetection) {
considerIfLikelyUri(curi,attrValue,valueContext,Hop.NAVLINK);
}
}
}
// STYLE
if (((attr = attributes.get("style")) != null) &&
((attrValue = attr.getValue()) != null)) {
// STYLE inline attribute
// then, parse for URIs
numberOfLinksExtracted.addAndGet(ExtractorCSS.processStyleCode(
this, curi, attrValue));
}
// FLASHVARS
if (((attr = attributes.get("flashvars")) != null) &&
((attrValue = attr.getValue()) != null)) {
// FLASHVARS inline attribute
CharSequence valueContext = elementContext(elementName, attr.getKey());
considerQueryStringValues(curi, attrValue, valueContext,Hop.SPECULATIVE);
}
// handle codebase/resources
if (resources == null)