Package org.apache.nutch.util

Examples of org.apache.nutch.util.NodeWalker


   * nekohtml).
   */
  public void getOutlinks(URL base, ArrayList<Outlink> outlinks,
                                       Node node) {
   
    NodeWalker walker = new NodeWalker(node);
    while (walker.hasNext()) {
     
      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();     
      NodeList children = currentNode.getChildNodes();
      int childLen = (children != null) ? children.getLength() : 0;
     
      if (nodeType == Node.ELEMENT_NODE) {
       
        nodeName = nodeName.toLowerCase();
        LinkParams params = (LinkParams)linkParams.get(nodeName);
        if (params != null) {
          if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
 
            StringBuffer linkText = new StringBuffer();
            getText(linkText, currentNode, true);
            if (linkText.toString().trim().length() == 0) {
              // try harder - use img alt if present
              NodeWalker subWalker = new NodeWalker(currentNode);
              while (subWalker.hasNext()) {
                Node subNode = subWalker.nextNode();
                if (subNode.getNodeType() == Node.ELEMENT_NODE) {
                  if (subNode.getNodeName().toLowerCase().equals("img")) {
                    NamedNodeMap subAttrs = subNode.getAttributes();
                    Node alt = subAttrs.getNamedItem("alt");
                    if (alt != null) {
View Full Code Here


  /**
   * Finds the specified element and returns its value
   */
  protected String getElement(String element) {
    NodeWalker walker = new NodeWalker(doc);

    while (walker.hasNext()) {
      Node currentNode = walker.nextNode();

      if (currentNode.getNodeType() == Node.ELEMENT_NODE) {
        if (element.equalsIgnoreCase(currentNode.getNodeName())) {
          return getNodeValue(currentNode);
        }
View Full Code Here

      return language;
    }

    void parse(Node node) {

      NodeWalker walker = new NodeWalker(node);
      while (walker.hasNext()) {

        Node currentNode = walker.nextNode();
        String nodeName = currentNode.getNodeName();
        short nodeType = currentNode.getNodeType();

        if (nodeType == Node.ELEMENT_NODE) {
View Full Code Here

  // anchors
  private boolean getTextHelper(StringBuilder sb, Node node,
                                             boolean abortOnNestedAnchors,
                                             int anchorDepth) {
    boolean abort = false;
    NodeWalker walker = new NodeWalker(node);
   
    while (walker.hasNext()) {
   
      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();
     
      if ("script".equalsIgnoreCase(nodeName)) {
        walker.skipChildren();
      }
      if ("style".equalsIgnoreCase(nodeName)) {
        walker.skipChildren();
      }
      if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
        anchorDepth++;
        if (anchorDepth > 1) {
          abort = true;
          break;
        }       
      }
      if (nodeType == Node.COMMENT_NODE) {
        walker.skipChildren();
      }
      if (nodeType == Node.TEXT_NODE) {
        // cleanup and trim the value
        String text = currentNode.getNodeValue();
        text = text.replaceAll("\\s+", " ");
View Full Code Here

  // anchors
  private boolean getTextHelper(StringBuffer sb, Node node,
                                             boolean abortOnNestedAnchors,
                                             int anchorDepth) {
    boolean abort = false;
    NodeWalker walker = new NodeWalker(node);
   
    while (walker.hasNext()) {
   
      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();
     
      if ("script".equalsIgnoreCase(nodeName)) {
        walker.skipChildren();
      }
      if ("style".equalsIgnoreCase(nodeName)) {
        walker.skipChildren();
      }
      if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
        anchorDepth++;
        if (anchorDepth > 1) {
          abort = true;
          break;
        }       
      }
      if (nodeType == Node.COMMENT_NODE) {
        walker.skipChildren();
      }
      if (nodeType == Node.TEXT_NODE) {
        // cleanup and trim the value
        String text = currentNode.getNodeValue();
        text = text.replaceAll("\\s+", " ");
View Full Code Here

   *
   * @return true if a title node was found, false otherwise
   */
  public boolean getTitle(StringBuffer sb, Node node) {
   
    NodeWalker walker = new NodeWalker(node);
   
    while (walker.hasNext()) {
 
      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();
     
      if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
        return false;
View Full Code Here

  }

  /** If Node contains a BASE tag then its HREF is returned. */
  URL getBase(Node node) {

    NodeWalker walker = new NodeWalker(node);
   
    while (walker.hasNext()) {
 
      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();
     
      // is this node a BASE tag?
      if (nodeType == Node.ELEMENT_NODE) {
View Full Code Here

   * nekohtml).
   */
  public void getOutlinks(URL base, ArrayList outlinks,
                                       Node node) {
   
    NodeWalker walker = new NodeWalker(node);
    while (walker.hasNext()) {
     
      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();     
      NodeList children = currentNode.getChildNodes();
      int childLen = (children != null) ? children.getLength() : 0;
     
View Full Code Here

   *
   * @return true if a title node was found, false otherwise
   */
  public boolean getTitle(StringBuilder sb, Node node) {
   
    NodeWalker walker = new NodeWalker(node);
   
    while (walker.hasNext()) {
 
      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();
     
      if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
        return false;
View Full Code Here

  }

  /** If Node contains a BASE tag then its HREF is returned. */
  public URL getBase(Node node) {

    NodeWalker walker = new NodeWalker(node);
   
    while (walker.hasNext()) {
 
      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();
     
      // is this node a BASE tag?
      if (nodeType == Node.ELEMENT_NODE) {
View Full Code Here

TOP

Related Classes of org.apache.nutch.util.NodeWalker

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.