package com.almilli.util;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlparser.Node;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.NodeList;
public class NodeUtils {
public static String getTextData(NodeList nodes) {
if (nodes == null) {
return null;
}
return getTextData(nodes, 0, nodes.size());
}
public static String getTextData(NodeList nodes, int start, int end) {
String str;
StringBuilder sb = new StringBuilder();
for (int i=start; i < end; i++) {
Node node = nodes.elementAt(i);
str = getTextData(node);
if (str != null) {
sb.append(str);
}
}
str = sb.toString().trim();
if (str.length() == 0) {
return null;
}
return str;
}
public static String getTextData(Node node) {
if (node == null) {
return null;
}
String text;
if (node instanceof TextNode) {
text = ((TextNode)node).getText();
} else {
NodeList list = node.getChildren();
if (list != null) {
text = list.asString();
} else {
return null;
}
}
return convertTextData(text);
}
public static String convertTextData(String text) {
text = convertSpecialEntities(text);
text = text.trim();
if (text.length() > 0) {
return convertSpecialEntitiesToText(text);
} else {
return null;
}
}
public static String convertSpecialEntities(String text) {
//first convert all the named entities
text = text.replace(" ", " ");
text = text.replace(""", "\"");
text = text.replace(">", ">");
text = text.replace("<", "<");
text = text.replace("&", "&");
//next convert all the numeric coded entities
Pattern pattern = Pattern.compile("&#(\\d+);");
Matcher matcher = pattern.matcher(text);
boolean result = matcher.find();
if (result) {
StringBuffer sb = new StringBuffer();
do {
char code = (char)Integer.parseInt(matcher.group(1));
matcher.appendReplacement(sb, Character.toString(code));
result = matcher.find();
} while (result);
matcher.appendTail(sb);
return sb.toString();
}
return text;
}
public static String removeNonPrintableChars(String str) {
if (str == null) {
return null;
}
StringBuilder sb = new StringBuilder(str.length());
char ch;
for (int i=0; i < str.length(); i++) {
ch = str.charAt(i);
if (!Character.isIdentifierIgnorable(ch)) {
sb.append(ch);
}
}
return sb.toString();
}
public static String convertSpecialEntitiesToText(String str) {
if (str.indexOf('&') != -1) {
str = str.replace("&", "&");
}
return str;
}
}