// Matches the link-bearing constructs that are rewritten in the loop below. There are five
// alternatives, each consisting of a prefix group (odd index) and a URL group (even index):
//   groups 1/2:  href="..." or src="..."   (double-quoted attribute)
//   groups 3/4:  href='...' or src='...'   (single-quoted attribute)
//   groups 5/6:  url('...')
//   groups 7/8:  url("...")
//   groups 9/10: url(...)                  (unquoted; listed last, so the quoted forms are tried first)
// Each URL group stops before the closing quote/parenthesis, so that delimiter is not part of
// the match and is never touched by the replacement.
// NOTE(review): the pattern is compiled every time this code runs; it could be hoisted into a
// static final constant.
final Pattern p = Pattern.compile("(href=\"|src=\")([^\"]+)|(href='|src=')([^']+)|(url\\(')([^']+)|(url\\(\")([^\"]+)|(url\\()([^\\)]+)");
final Matcher m = p.matcher(sbuffer);
// Receives the rewritten text through m.appendReplacement(...); 80 is only the initial capacity.
final StringBuffer result = new StringBuffer(80);
// Per-match scratch variables: the matched prefix (e.g. href=") and the matched URL text.
String init, url;
// Holds the resolved URL in the relative-path branch of the loop below.
MultiProtocolURI target;
// Walk over every match and append a rewritten version of it to `result`. Matches that are
// skipped without calling appendReplacement (the `continue` statements and the swallowed
// exception below) are not lost: appendReplacement copies everything since the previous append
// position, so a skipped match is copied verbatim by the next append.
while (m.find()) {
// Exactly one alternative of the pattern matched, so exactly one odd (prefix) group is non-null.
init = null;
if(m.group(1) != null) init = m.group(1);
if(m.group(3) != null) init = m.group(3);
if(m.group(5) != null) init = m.group(5);
if(m.group(7) != null) init = m.group(7);
if(m.group(9) != null) init = m.group(9);
// Likewise exactly one even (URL) group is non-null; every alternative requires at least one
// URL character, so `url` is non-null after this cascade whenever find() succeeded.
url = null;
if(m.group(2) != null) url = m.group(2);
if(m.group(4) != null) url = m.group(4);
if(m.group(6) != null) url = m.group(6);
if(m.group(8) != null) url = m.group(8);
if(m.group(10) != null) url = m.group(10);
if (url.startsWith("data:") || url.startsWith("#") || url.startsWith("mailto:") || url.startsWith("javascript:")) {
// inline data, fragment, mailto and javascript links are written back unchanged
String newurl = init + url;
// appendReplacement treats '$' as a group reference, so it is escaped as "\$" here.
// NOTE(review): backslashes are not escaped; a '\' in the URL is consumed as an escape
// character by appendReplacement, and a trailing '\' makes it throw
// IllegalArgumentException. Matcher.quoteReplacement(newurl) would handle both '$' and '\'.
// The same applies to every replaceAll(...) call in the branches below.
newurl = newurl.replaceAll("\\$","\\\\\\$");
m.appendReplacement(result, newurl);
} else if (url.startsWith("http")) {
// absolute url of form href="http://domain.com/path"
// NOTE(review): the test is startsWith("http"), so a relative name such as "httpfoo.html"
// also lands in this branch - confirm whether "http://" / "https://" was intended.
// With proxyURL.rewriteURLs set to "domainlist" (the default read here is "all"), the match is
// left unrewritten when urlInAcceptedDomain(...) returns non-null.
// NOTE(review): presumably a non-null return is a rejection reason - verify against CrawlStacker.
if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) {
if (sb.crawlStacker.urlInAcceptedDomain(new DigestURI(url)) != null) {
continue;
}
}
// route the link through the proxy servlet; the target URL is appended as-is (not URL-encoded)
String newurl = init + "/proxy.html?url=" + url;
newurl = newurl.replaceAll("\\$","\\\\\\$");
m.appendReplacement(result, newurl);
} else if (url.startsWith("//")) {
// absolute url but same protocol, of form href="//domain.com/path";
// it is completed with the protocol of proxyurl
final String complete_url = proxyurl.getProtocol() + ":" + url;
// same "domainlist" check as in the absolute-url branch, applied to the completed URL
if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) {
if (sb.crawlStacker.urlInAcceptedDomain(new DigestURI(complete_url)) != null) {
continue;
}
}
String newurl = init + "/proxy.html?url=" + complete_url;
newurl = newurl.replaceAll("\\$","\\\\\\$");
m.appendReplacement(result, newurl);
} else if (url.startsWith("/")) {
// absolute path of form href="/absolute/path/to/linked/page"
// NOTE(review): the scheme is hard-coded to "http://" here, whereas the "//" branch uses
// proxyurl.getProtocol() - confirm this is intended for non-http pages.
String newurl = init + "/proxy.html?url=http://" + host + url;
newurl = newurl.replaceAll("\\$","\\\\\\$");
m.appendReplacement(result, newurl);
} else {
// relative path of form href="relative/path"
try {
// resolve against host and directory of the current page (scheme hard-coded to "http://");
// the string form of the resulting MultiProtocolURI is what gets passed to the proxy
target = new MultiProtocolURI("http://" + host + directory + "/" + url);
String newurl = init + "/proxy.html?url=" + target.toString();
newurl = newurl.replaceAll("\\$","\\\\\\$");
m.appendReplacement(result, newurl);
}
// NOTE(review): the exception is swallowed silently; the effect is that nothing is appended
// for this match, so the original link text is kept. Consider at least logging it.
catch (final MalformedURLException e) {}