Package org.vietspider.html.path2

Examples of org.vietspider.html.path2.NodePathParser


        document : new HTMLAnchorUtil().searchDocument(document, ref);
  }
 
  public NodePath findNodeByText(HTMLNode node, String start, String end) throws Exception {
    RefsDecoder decoder = new RefsDecoder();
    NodePathParser pathParser = new NodePathParser();
    TextHandler textHandler = new TextHandler();
    if(start == null || start.trim().length() == 0) return pathParser.toPath(node);
    start = textHandler.trim(start);
    HTMLNode startNode = textHandler.findByText(node, start, decoder);
    if(end == null || end.trim().length() == 0) {
      return startNode != null ?  pathParser.toPath(startNode) : pathParser.toPath(node);
    }
    end = textHandler.trim(end);
   
    HTMLNode endNode = textHandler.findByText(node, end, decoder);   
    if(endNode == null)
      return startNode != null ? pathParser.toPath(startNode) : pathParser.toPath(node);
    if(startNode  == null
      return endNode != null  ?  pathParser.toPath(endNode) : pathParser.toPath(node);
    HTMLNodeUtil nodeUtil = new HTMLNodeUtil()
    String indexPath = nodeUtil.getCommonIndexPath(startNode, endNode);
    return pathParser.toPath(nodeUtil.getNodeByIndex(node, indexPath));
  }
View Full Code Here


   
    return txt;
 
 
  public short traverseTree(Tree tree, String path, int style, short type) throws Exception {
    NodePath nodePath = new NodePathParser().toPath(path);
    return traverseTree(tree, nodePath, path, style, type);
  }
View Full Code Here

    }
  }
 
  private HTMLNode createNode(String path) {
    HTMLExtractor extractor  = new HTMLExtractor();
    NodePathParser pathParser = new NodePathParser();
    if(path.endsWith("[.")) {
      path  = path.substring(0, path.length() - 2);
    } else  if(path.endsWith("[")) {
      path  = path.substring(0, path.length() - 1);
    }
    try {
      NodePath nodePath = pathParser.toPath(path);
      return extractor.lookNode(explorer.getDocument().getRoot(), nodePath);
    } catch (Exception e) {
    }
    return null;
  }
View Full Code Here

  }
 
  private void autoSelect() {
    if(type == HTMLExplorer.NONE) return;
    NodePath bodyPath = null;
    NodePathParser pathParser = new NodePathParser();
    try {
      bodyPath  = pathParser.toPath("BODY");
    }catch (Exception e) {
      ClientLog.getInstance().setException(tree.getShell(), e);
    }
   
    if(bodyPath == null || document == null) return;
    HTMLNode body = new HTMLExtractor().lookNode(document.getRoot(), bodyPath);
    List<HTMLNode> list = new ArrayList<HTMLNode>();
    List<HTMLNode> commons = new ArrayList<HTMLNode>();
    if(HTMLExplorer.SECTION == type) {
      searchSectionCSS(commons, list, body);

      short selectType = PathConfirmDialog.YES;
      for(HTMLNode ele : commons) {
        try {
          NodePath path = pathParser.toPath(ele);  
          if(path == null) continue;
          selectType = handler.traverseTree(tree, path, TreeHandler.MARK, selectType);     
        } catch(Exception exp){
          ClientLog.getInstance().setMessage(tree.getShell(), exp);
       
      }

      return;
    }

    searchContentCSS(commons, list, body);

    int maxCountContent = 0;
    HTMLNode maxNodeContent = null;

    List<HTMLNode> contents = new ArrayList<HTMLNode>();
    CharacterUtil characterUtil = new CharacterUtil();
   
    HTMLText htmlText = new HTMLText();
   
    short selectType = PathConfirmDialog.YES;
    boolean traverse = false;
    for(HTMLNode ele : commons) {
      contents.clear();
      htmlText.searchText(contents, ele);
      int count = countText(characterUtil, contents);

      if(count > maxCountContent) {
        maxCountContent = count;
        maxNodeContent = ele;
      }

      if(count < 100) continue;

      NodePath path = pathParser.toPath(ele);  
      if(path == null) continue;
      handler.traverseTree(tree, path, TreeHandler.MARK, selectType);
      if(!traverse) traverse = true;
    }
   
    if(traverse || maxNodeContent == null) return;
    try {       
      NodePath path = pathParser.toPath(maxNodeContent);  
      if(path == null) return;
      handler.traverseTree(tree, path, TreeHandler.MARK, selectType);     
    }catch(Exception exp){
      ClientLog.getInstance().setMessage(tree.getShell(), exp);
   
View Full Code Here

  public HTMLDocument searchDocument(HTMLDocument document, String name) {
    HTMLNode root = document.getRoot();
    HTMLNode newRoot = searchDocument(root, name);
    if (root == newRoot) return document;
    NodePathParser pathParser = new NodePathParser();
    HTMLExtractor extractor = new HTMLExtractor();
    try {
      HTMLNode body = extractor.lookNode(root, pathParser.toPath("BODY"));
      body.clearChildren();
      body.addChild(newRoot);
    } catch (Exception e) {
      LogService.getInstance().setThrowable(e);
    }
View Full Code Here

    URL url = new URL("http://news.google.com.vn/");
   
    WebClient webClient = new WebClient();
    webClient.setURL(null, url);
   
    NodePathParser pathParser = new NodePathParser();
    HTMLExtractor htmlExtractor = new HTMLExtractor();
   
    byte  [] bytes = download(webClient, "http://news.google.com.vn/");
   
    HTMLDocument document = new HTMLParser2().createDocument(bytes, null);
   
    String [] paths = {
        "BODY[0].TABLE[2].TBODY[0].TR[0].TD[3].TABLE[1].TBODY[0].TR[0].TD[0].TABLE[0].TBODY[0].TR[0].TD[0].DIV[0].TABLE[0].TBODY[0].TR[0].TD[0].DIV[*]"
    };
   
    NodePath [] nodePaths = new NodePath[paths.length];
    for(int i=0; i<paths.length; i++){
      nodePaths[i] = pathParser.toPath(paths[i]);
    }
   
    HTMLDocument doc = htmlExtractor.extract(document, nodePaths);
    System.out.println(doc.getTextValue());
   
    paths = new String[]{
        "DIV[*].BR[*]",
    };
   
    nodePaths = new NodePath[paths.length];
    for(int i=0; i<paths.length; i++){
      nodePaths[i] = pathParser.toPath(paths[i]);
    }
   
    htmlExtractor.remove(doc.getRoot(), nodePaths);
   
    System.out.println(doc.getRoot().getTextValue());
View Full Code Here

  }

  public void viewNode(String path) throws Exception {
    HTMLParser2 parser2 = new HTMLParser2();
    HTMLDocument document  = parser2.createDocument(file, null);
    NodePathParser pathParser = new NodePathParser();
    NodePath nodePath = pathParser.toPath(path);
    HTMLExtractor htmlExtractor = new HTMLExtractor();
    HTMLNode node = htmlExtractor.extract(document, new NodePath[]{nodePath}).getRoot();
    System.out.println(node.getTextValue());
  }
View Full Code Here

      URL url = new URL("http://www.java.net");
//      System.out.println(URLDecoder.decode("video_id=http%3A%2F%2Fliveu-80.vo.llnwd.net%2Fflurl%2Fmb53%2Fnew_media3%2F2006%2F8%2F29%2F174380_media_flash8.flv&homeurl=http%3A%2F%2Fwww.flurl.com%2F&endmovies=http%3A%2F%2Fwww.flurl.com%2Fthumbs.php%3Fid%3D174380&embed=%3Ctable%20border%3D%270%27%20bgcolor%3D%27ffffff%27%20cellpadding%3D%270%27%20cellspacing%3D%270%27%3E%3Ctr%3E%3Ctd%3E%3Cembed%20id%3D%27flurl_media%27%20name%3D%27flurl_media%27%20width%3D%27519%27%20height%3D%27438%27%20src%3D%27http%3A%2F%2Fwww.flurl.com%2Fflvplayer2.swf%3Fvideo%3Dhttp%3A%2F%2Fwww.flurl.com%2Fflash_player_info.php%3Fid%3D174380%26flash%3D8%27%20quality%3D%27high%27%20bgcolor%3D%27white%27%20play%3D%27true%27%20loop%3D%27false%27%20allowScriptAccess%3D%27sameDomain%27%20type%3D%27application%2Fx-shockwave-flash%27%20pluginspage%3D%27http%3A%2F%2Fwww.macromedia.com%2Fgo%2Fgetflashplayer%27%3E%3C%2Fembed%3E%3C%2Ftd%3E%3C%2Ftr%3E%3Ctr%3E%3Ctd%20align%3D%27right%27%3E%3Cstrong%3E%3Ca%20href%3D%27http%3A%2F%2Fwww.flurl.com%2F%27%3EHosted%20on%20Flurl%20Video%20Search%3C%2Fa%3E%20-%20%3Ca%20href%3D%27http%3A%2F%2Fwww.flurl.com%2Fmedia%27%3EWatch%20More%20Videos%3C%2Fa%3E%20%3C%2Fstrong%3E%3C%2Ftd%3E%3C%2Ftr%3E%3C%2Ftable%3E"));
      HTMLParser2 parser2 = new HTMLParser2();
      HTMLDocument document = parser2.createDocument(url.openStream(), "utf-8");

      NodePathParser pathParser = new NodePathParser();
      NodePath nodePath = pathParser.toPath("BODY[0].DIV[0].TABLE[0].TBODY[0].TR[1].TD[3].DIV[10]");
      HTMLExtractor htmlExtractor = new HTMLExtractor();
      HTMLNode node = htmlExtractor.extract(document, new NodePath[]{nodePath}).getRoot();

      System.out.println(node.getTextValue());
View Full Code Here

   
    String path = "BODY[0].TABLE[0].TBODY[0].TR[0].TD[0].TABLE[0].TBODY[0].TR[0].TD[0]";
    path += ".TABLE[0].TBODY[0].TR[1].TD[1].TABLE[0].TBODY[0].TR[0].TD[0].TABLE[1].TBODY[0].TR[1]";
    path += ".TD[0].TABLE[1].TBODY[0].TR[i>0]";
   
    NodePathParser pathParser = new NodePathParser();
   
    NodePath nodePath = pathParser.toPath(path);
   
    HTMLExtractor htmlExtractor = new HTMLExtractor();
    document = htmlExtractor.extract(document, new NodePath[]{nodePath});
   
    List<HTMLNode> children = document.getRoot().getChildren();
View Full Code Here

   
    HTMLParser2 parser2 = new HTMLParser2();
    HTMLDocument document  = parser2.createDocument(bytes, "utf-8");
    String titlePathValue  = "BODY[0].DIV[1].TABLE[0].TBODY[0].TR[0].TD[0].P[1]";
   
    NodePathParser pathParser = new NodePathParser();
    HTMLExtractor htmlExtractor = new HTMLExtractor();
   
    NodePath titlePath = pathParser.toPath(titlePathValue);
    HTMLNode titleNode = htmlExtractor.lookNode(document.getRoot(), titlePath);
    String titleThread =  buildText(titleNode);
   
    String [] postPathValues = {
        "BODY[0].DIV[1].TABLE[1].TBODY[0].TR[0].TD[0].DIV[1].DIV[0].DIV[0].TABLE[0]",
        "BODY[0].DIV[1].TABLE[1].TBODY[0].TR[0].TD[0].TABLE[*].TBODY[0].TR[0].TD[1].DIV[0].DIV[0].DIV[0].TABLE[0]"
    };
   
    NodePath [] postPaths = pathParser.toNodePath(postPathValues);
    HTMLDocument document2 = htmlExtractor.extract(document, postPaths);
   
    String userPathValue = "TABLE[*].TBODY[0].TR[0].TD[0].TABLE[0].TBODY[0].TR[0].TD[0].TABLE[0].TBODY[0].TR[0].TD[0].NOBR[0].A[0]";
    NodePath userPath = pathParser.toPath(userPathValue);
    List<HTMLNode> userNodes = htmlExtractor.matchNodes(document2.getRoot(), userPath);
   
    List<String> users = new ArrayList<String>();
    for(HTMLNode userNode : userNodes) {
      users.add(buildText(userNode));
    }
   
    String textPostPathValue = "TABLE[*].TBODY[0].TR[0].TD[1].TABLE[0].TBODY[0].TR[1]";
    NodePath textPostPath = pathParser.toPath(textPostPathValue);
    List<HTMLNode> textPostNodes = htmlExtractor.matchNodes(document2.getRoot(), textPostPath);

    List<String> posts = new ArrayList<String>();
    for(HTMLNode textPostNode : textPostNodes) {
      posts.add(buildText(textPostNode));
View Full Code Here

TOP

Related Classes of org.vietspider.html.path2.NodePathParser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.