Package org.vietspider.html

Examples of org.vietspider.html.HTMLDocument


    saveData(name);
  }

  public void saveData(String name) throws Exception {
    if(data == null || data.length < 1) return;
    HTMLDocument doc = new HTMLParser2().createDocument(data, "utf-8");
    if(this.addPaths == null) return;
    doc = htmlExtractor.extract(doc, this.addPaths);
    if(removePaths != null && removePaths.length > 0){
      htmlExtractor.remove(doc.getRoot(), removePaths);
    }
    data = doc.getTextValue().getBytes("utf-8");
    if(data.length < 1) return;
    File file = new File(name);
    FileOutputStream output = new FileOutputStream(file);
    output.write(data);
    output.flush();
View Full Code Here


    if(!childrenThread[0].isComplete()) return ;
    if(childrenThread == null) return;
    if(childrenThread[0] == null) return;
    byte [] data = childrenThread[0].getData();
    if(data == null || data.length < 0) return;
    HTMLDocument document =  new HTMLParser2().createDocument(data, "utf-8");
    if(homePath != null) {
      document = htmlExtractor.extract(document, new NodePath[]{homePath});
    }
    linkUtil.createFullNormalLink(document.getRoot(), new URL(url));
    links  = linkUtil.scanSiteLink(document.getRoot());
    idx = 0;
  }
View Full Code Here

    URL url = new URL(address);
    String ref = url.getRef();
    if(ref != null && (ref = ref.trim()).isEmpty())  ref = null;
    if(ref != null) address = address.substring(0, address.indexOf('#'));
   
    HTMLDocument document = null;
    if(address.startsWith("file")){
      file = new File(url.toURI());     
      document = detector.loadDocument(file);
    } else {
      byte[] obj = loadContent(refer, address);      
      if( obj == null || obj.length < 1) return null;
      document = detector.createDocument(obj);
      chars = document.getTextValue().toCharArray();
      if(cache) cacheResponse(address, chars);
    }
   
    return document == null || ref == null ?
        document : new HTMLAnchorUtil().searchDocument(document, ref);
View Full Code Here

*/
public class JavascriptExtractor {

  public static void main(String[] args) throws Exception {
    URL url = new URL("http://java.sun.com/");
    HTMLDocument document = new HTMLParser2().createDocument(url.openStream(), "utf-8");
   
    NodeIterator iterator =  document.getRoot().iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(node.isNode(Name.SCRIPT)) {
        if(node.hasChildren() && node.getChildren().size() > 0) {
          System.out.println("===================================================");
View Full Code Here

    builder.append("<h4>Địa chỉ gửi link liên kết đến bài viết này</h4>");
   
    List<String> jsScripts = new ArrayList<String>();
    jsScripts.add("show_postcontent");
   
    HTMLDocument document = new HTMLParser2().createDocument(builder.toString());
    JsHandler.updateDocument(document, jsScripts);
    System.out.println(document.getTextValue());
  }
View Full Code Here

  void setAfterLogin(List<NodeImpl> tokens) throws Exception {
    afters = build(tokens);
  }
 
  private String [] build(List<NodeImpl> tokens) throws Exception {
    HTMLDocument doc = new HTMLParser2().createDocument(tokens);
    NodePathParser pathParser = new NodePathParser();
    HTMLExtractor extractor  = new HTMLExtractor();
   
    NodePath nodePath  = pathParser.toPath("BODY");
    HTMLNode body = extractor.lookNode(doc.getRoot(), nodePath);
   
    TextRenderer renderer = new TextRenderer(body, TextRenderer.HANDLER);
    String value = renderer.getTextValue().toString();
    return value.trim().split("\n");
  }
View Full Code Here

  public HTMLDocument extract(HTMLDocument document, NodePath... nodePaths) {
    HTMLNode root = document.getRoot();
//    CharsToken tokens = document.getTokens();
   
    HTMLNode newRoot = HTMLParser2.clone(root);
    HTMLDocument newDocument  = new HTMLDocument();
//    CharsToken newTokens = new CharsToken(newDocument);
//    newTokens.push((NodeImpl)newRoot);

    LookupNode lookupNode = new LookupNode();
   
    for(int i = 0; i < nodePaths.length; i++) {
      List<HTMLNode> htmlNodes = lookupNode.lookupNodes(root, nodePaths[i]);
      if(htmlNodes == null ) continue;
      for(HTMLNode htmlNode : htmlNodes) {
        if(htmlNode == null) continue;
//        lookupNode.extractTokens(tokens, newTokens, htmlNode);
        htmlNode.clone(newRoot);
      }
    }
   
    newDocument.setRoot(newRoot);
    return newDocument;
  }
View Full Code Here

          nextNodes.get(i).clone(newRoot);
        } catch (Exception e) {
          continue;
        }
      }
      newDocuments[i] = new HTMLDocument(newRoot);
    }
    return newDocuments;
  }
View Full Code Here

  public HTMLDocument extract(HTMLDocument document, NodePath... nodePaths) {
    HTMLNode root = document.getRoot();
//    CharsToken tokens = document.getTokens();
   
    HTMLNode newRoot = HTMLParser2.clone(root);
    HTMLDocument newDocument  = new HTMLDocument();
//    CharsToken newTokens = new CharsToken(newDocument);
//    newTokens.push((NodeImpl)newRoot);

    for(int i = 0; i < nodePaths.length; i++) {
      List<HTMLNode> htmlNodes = matchNodes(root, nodePaths[i]);
      if(htmlNodes == null ) continue;
      for(HTMLNode htmlNode : htmlNodes) {
        if(htmlNode == null) continue;
//        extractTokens(tokens, newTokens, htmlNode);
        newRoot.addChild(htmlNode);
//        htmlNode.setParent(newRoot);
      }
    }
   
    newDocument.setRoot(newRoot);
    return newDocument;
  }
View Full Code Here

//          newHtmlValues.get(i).setParent(html);
        } catch (Exception e) {
          continue;
        }
      }
      newDocuments[i] = new HTMLDocument(html);
    }
    return newDocuments;
  }
View Full Code Here

TOP

Related Classes of org.vietspider.html.HTMLDocument

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.