// Links found in the page, in the order they are discovered. Duplicates are
// kept out by the list.contains(...) checks made before each add below.
final ArrayList<String> list = new ArrayList<String>();
// Parse rawPage into a tag tree. No CleanerProperties are customized here
// (the two lines below are disabled), so the cleaner runs with whatever
// defaults HtmlCleaner ships with.
HtmlCleaner cleaner = new HtmlCleaner();
//CleanerProperties props = cleaner.getProperties();
//props.setXXX(...);
TagNode node = cleaner.clean(rawPage);
// Scratch array, reassigned with the result of each getElementsByName(...) lookup.
TagNode[] myNodes;
if (depth==1 || depth==2) {
// <a href=
myNodes = node.getElementsByName("a", true);
for (int i=0;i<myNodes.length;i++)
{
String link = myNodes[i].getAttributeByName("href");
if (link!=null) {
link = link.trim();
if (link!=null && !"".equals(link))
{
if (isValidUrl(link))
if (!list.contains(link))
list.add(link);
}
}
}
// <area href=
myNodes = node.getElementsByName("area", true);
for (int i=0;i<myNodes.length;i++)
{
String link = myNodes[i].getAttributeByName("href");
if (link!=null && !"".equals(link))
if (isValidUrl(link))
if (!list.contains(link))
list.add(link);
}
}
if (depth==0 || depth==2) {
// <frame src=
myNodes = node.getElementsByName("frame", true);
for (int i=0;i<myNodes.length;i++)
{
String link = myNodes[i].getAttributeByName("src");
if (link!=null && !"".equals(link))
if (isValidUrl(link))
if (!list.contains(link))
list.add(link);
}
// <iframe src=
myNodes = node.getElementsByName("iframe", true);
for (int i=0;i<myNodes.length;i++)
{
String link = myNodes[i].getAttributeByName("src");
if (link!=null && !"".equals(link))
if (isValidUrl(link))
if (!list.contains(link))
list.add(link);
}
// <meta http-equiv="refresh" content=
myNodes = node.getElementsByName("meta", true);
for (int i=0;i<myNodes.length;i++)
{
String equiv = myNodes[i].getAttributeByName("http-equiv");
if ((equiv!=null) && (equiv.equalsIgnoreCase("refresh")))
{
String link = myNodes[i].getAttributeByName("content");
if (link!=null && !"".equals(link))
{
if (link.indexOf("=")>0)
{
link = link.substring(link.indexOf("=")+1);
if (!list.contains(link))
list.add(link);
}
}
}
}
// Look for embedded flash
// <param name="movie" value="..."
myNodes = node.getElementsByName("param", true);
for (int i=0;i<myNodes.length;i++)
{
String name = myNodes[i].getAttributeByName("name");
if ("movie".equals(name))
{