Examples of org.vietspider.html.path2.INode

org.vietspider.html.path2.NodePath
Author : Nhu Dinh Thuan nhudinhthuan@yahoo.com Dec 6, 2007

  public HTMLDocument searchDocument(HTMLDocument document, String name) {
    HTMLNode root = document.getRoot();
    HTMLNode newRoot = searchDocument(root, name);
    if (root == newRoot) return document;
    NodePathParser pathParser = new NodePathParser();
    HTMLExtractor extractor = new HTMLExtractor();
    try {
      HTMLNode body = extractor.lookNode(root, pathParser.toPath("BODY"));
      body.clearChildren();
      body.addChild(newRoot);
    } catch (Exception e) {
      LogService.getInstance().setThrowable(e);
    }

View Full Code Here

    
    WebClient webClient = new WebClient();
    webClient.setURL(null, url);
    
    NodePathParser pathParser = new NodePathParser();
    HTMLExtractor htmlExtractor = new HTMLExtractor();
    
    byte  [] bytes = download(webClient, "http://news.google.com.vn/");
    
    HTMLDocument document = new HTMLParser2().createDocument(bytes, null);
    
    String [] paths = {
        "BODY[0].TABLE[2].TBODY[0].TR[0].TD[3].TABLE[1].TBODY[0].TR[0].TD[0].TABLE[0].TBODY[0].TR[0].TD[0].DIV[0].TABLE[0].TBODY[0].TR[0].TD[0].DIV[*]"
    };
    
    NodePath [] nodePaths = new NodePath[paths.length];
    for(int i=0; i<paths.length; i++){
      nodePaths[i] = pathParser.toPath(paths[i]);
    }
    
    HTMLDocument doc = htmlExtractor.extract(document, nodePaths);
    System.out.println(doc.getTextValue());
    
    paths = new String[]{
        "DIV[*].BR[*]",
    };
    
    nodePaths = new NodePath[paths.length];
    for(int i=0; i<paths.length; i++){
      nodePaths[i] = pathParser.toPath(paths[i]);
    }
    
    htmlExtractor.remove(doc.getRoot(), nodePaths);
    
    System.out.println(doc.getRoot().getTextValue());
    
    File file = new File("a.html");
    byte[] data = doc.getTextValue().getBytes();

View Full Code Here

  public void viewNode(String path) throws Exception {
    HTMLParser2 parser2 = new HTMLParser2();
    HTMLDocument document  = parser2.createDocument(file, null);
    NodePathParser pathParser = new NodePathParser();
    NodePath nodePath = pathParser.toPath(path);
    HTMLExtractor htmlExtractor = new HTMLExtractor();
    HTMLNode node = htmlExtractor.extract(document, new NodePath[]{nodePath}).getRoot();
    System.out.println(node.getTextValue());
  }

View Full Code Here

      HTMLParser2 parser2 = new HTMLParser2();
      HTMLDocument document = parser2.createDocument(url.openStream(), "utf-8");


      NodePathParser pathParser = new NodePathParser();
      NodePath nodePath = pathParser.toPath("BODY[0].DIV[0].TABLE[0].TBODY[0].TR[1].TD[3].DIV[10]");
      HTMLExtractor htmlExtractor = new HTMLExtractor();
      HTMLNode node = htmlExtractor.extract(document, new NodePath[]{nodePath}).getRoot();


      System.out.println(node.getTextValue());


      System.out.println("=================================================================");

View Full Code Here

    
    NodePathParser pathParser = new NodePathParser();
    
    NodePath nodePath = pathParser.toPath(path);
    
    HTMLExtractor htmlExtractor = new HTMLExtractor();
    document = htmlExtractor.extract(document, new NodePath[]{nodePath});
    
    List<HTMLNode> children = document.getRoot().getChildren();
    
    //print header
    List<String> headers = new ArrayList<String>();

View Full Code Here

    HTMLParser2 parser2 = new HTMLParser2();
    HTMLDocument document  = parser2.createDocument(bytes, "utf-8");
    String titlePathValue  = "BODY[0].DIV[1].TABLE[0].TBODY[0].TR[0].TD[0].P[1]";
    
    NodePathParser pathParser = new NodePathParser();
    HTMLExtractor htmlExtractor = new HTMLExtractor();
    
    NodePath titlePath = pathParser.toPath(titlePathValue);
    HTMLNode titleNode = htmlExtractor.lookNode(document.getRoot(), titlePath);
    String titleThread =  buildText(titleNode);
    
    String [] postPathValues = {
        "BODY[0].DIV[1].TABLE[1].TBODY[0].TR[0].TD[0].DIV[1].DIV[0].DIV[0].TABLE[0]",
        "BODY[0].DIV[1].TABLE[1].TBODY[0].TR[0].TD[0].TABLE[*].TBODY[0].TR[0].TD[1].DIV[0].DIV[0].DIV[0].TABLE[0]"
    };
    
    NodePath [] postPaths = pathParser.toNodePath(postPathValues);
    HTMLDocument document2 = htmlExtractor.extract(document, postPaths);
    
    String userPathValue = "TABLE[*].TBODY[0].TR[0].TD[0].TABLE[0].TBODY[0].TR[0].TD[0].TABLE[0].TBODY[0].TR[0].TD[0].NOBR[0].A[0]";
    NodePath userPath = pathParser.toPath(userPathValue);
    List<HTMLNode> userNodes = htmlExtractor.matchNodes(document2.getRoot(), userPath);
    
    List<String> users = new ArrayList<String>();
    for(HTMLNode userNode : userNodes) {
      users.add(buildText(userNode));
    }
    
    String textPostPathValue = "TABLE[*].TBODY[0].TR[0].TD[1].TABLE[0].TBODY[0].TR[1]";
    NodePath textPostPath = pathParser.toPath(textPostPathValue);
    List<HTMLNode> textPostNodes = htmlExtractor.matchNodes(document2.getRoot(), textPostPath);


    List<String> posts = new ArrayList<String>();
    for(HTMLNode textPostNode : textPostNodes) {
      posts.add(buildText(textPostNode));
    }

View Full Code Here

            }
          }
          continue;
        }


        Node node = (Node)inode;
        if(node.getIndex() != index) continue;
        Attribute [] attrs = inode.getAttributes(); 
        if(attrs == null || attrs.length < 1) {
          list.add(item);
        } else {
          String data = (String)item.getData();

View Full Code Here

      if(item.getText().startsWith(name)){
        String indexValue = item.getText().substring(name.length());
        int index = Integer.parseInt(indexValue);
        
        if(inode instanceof NodeExp) {
          NodeExp nodeExp = (NodeExp)inode;
          if(expMatcher.match(nodeExp.getPattern(), index)) {
            Attribute [] attrs = nodeExp.getAttributes(); 
            if(attrs == null || attrs.length < 1) {
              list.add(item);
            } else {
              String data = (String)item.getData();
              int idx = data.indexOf(' ');

View Full Code Here

  }
  
  private List<TreeItem> selectNode(INode inode, TreeItem [] items){
    String name  = inode.getName()+"-";
    List<TreeItem>  list = new ArrayList<TreeItem>();
    NodeMatcher expMatcher = new NodeMatcher();
    for(TreeItem item : items){
      if(item.getText().startsWith(name)){
        String indexValue = item.getText().substring(name.length());
        int index = Integer.parseInt(indexValue);
        
        if(inode instanceof NodeExp) {
          NodeExp nodeExp = (NodeExp)inode;
          if(expMatcher.match(nodeExp.getPattern(), index)) {
            Attribute [] attrs = nodeExp.getAttributes(); 
            if(attrs == null || attrs.length < 1) {
              list.add(item);
            } else {
              String data = (String)item.getData();

View Full Code Here

    ContentRegionSearcher2 searcher = new ContentRegionSearcher2();
    HTMLNode nodes = searcher.extractContent(document, url, false);


    NodePathParser pathParser = new NodePathParser();
    //    for(int i = 0; i < nodes.size(); i++) {
    NodePath path = pathParser.toPath(nodes);   
    if(path == null) return; 
    short selectType = PathConfirmDialog.YES;
    handler.traverseTree(tree, path, TreeHandler.MARK, selectType);
    //    }
  }

View Full Code Here

0 1 2 3 4 5 6

TOP

Related Classes of org.vietspider.html.path2.INode

org.apache.hadoop.fs.Path

org.apache.hadoop.fs.s3.Block

org.apache.tajo.storage.s3.S3OutputStream

org.apache.tajo.storage.s3.SmallBlockS3FileSystem

org.jnode.fs.ext4.ExtentHeader

org.vietspider.browser.LoginUtils

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact: coftware#gmail.com.