Package com.almilli.util

Source Code of com.almilli.util.NodeUtils

package com.almilli.util;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlparser.Node;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.NodeList;

public class NodeUtils {

    public static String getTextData(NodeList nodes) {
        if (nodes == null) {
            return null;
        }
        return getTextData(nodes, 0, nodes.size());
    }

  public static String getTextData(NodeList nodes, int start, int end) {
    String str;
    StringBuilder sb = new StringBuilder();
    for (int i=start; i < end; i++) {
      Node node = nodes.elementAt(i);
      str = getTextData(node);
      if (str != null) {
        sb.append(str);
      }
    }
   
    str = sb.toString().trim();
    if (str.length() == 0) {
      return null;
    }
    return str;
  }
 
  public static String getTextData(Node node) {
        if (node == null) {
            return null;
        }
    String text;
    if (node instanceof TextNode) {
      text = ((TextNode)node).getText();
    } else {
      NodeList list = node.getChildren();
      if (list != null) {
        text = list.asString();
      } else {
        return null;
      }
    }
    return convertTextData(text);
  }
 
    public static String convertTextData(String text) {
        text = convertSpecialEntities(text);
    text = text.trim();
    if (text.length() > 0) {
      return convertSpecialEntitiesToText(text);
    } else {
      return null;
    }
    }

    public static String convertSpecialEntities(String text) {
        //first convert all the named entities
        text = text.replace("&nbsp;", " ");
        text = text.replace("&quot;", "\"");
        text = text.replace("&gt;", ">");
        text = text.replace("&lt;", "<");
        text = text.replace("&amp;", "&");
       
        //next convert all the numeric coded entities
        Pattern pattern = Pattern.compile("&#(\\d+);");
        Matcher matcher = pattern.matcher(text);
        boolean result = matcher.find();
        if (result) {
            StringBuffer sb = new StringBuffer();
            do {
                char code = (char)Integer.parseInt(matcher.group(1));
                matcher.appendReplacement(sb, Character.toString(code));
                result = matcher.find();
            } while (result);
            matcher.appendTail(sb);
            return sb.toString();
        }
        return text;
    }
   
    public static String removeNonPrintableChars(String str) {
        if (str == null) {
            return null;
        }
        StringBuilder sb = new StringBuilder(str.length());
        char ch;
        for (int i=0; i < str.length(); i++) {
            ch = str.charAt(i);
            if (!Character.isIdentifierIgnorable(ch)) {
                sb.append(ch);
            }
        }
        return sb.toString();
    }
   
    public static String convertSpecialEntitiesToText(String str) {
        if (str.indexOf('&') != -1) {
            str = str.replace("&amp;", "&");
        }
        return str;
    }
}
TOP

Related Classes of com.almilli.util.NodeUtils

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.