Package org.vietspider.html.util

Examples of org.vietspider.html.util.HTMLParserDetector


    if(file.exists()) return;
    if(level == depth) return ;
    System.out.println("\nstart download "+ address +" level "+level +" depth "+depth +" ...");
    byte [] bytes = download(parent.toString(), address);
    if(bytes == null || bytes.length < 0) return;
    HTMLParserDetector parser = new HTMLParserDetector();
    String charset = parser.detectCharset(bytes);
    char [] chars = CharsDecoder.decode(charset, bytes, 0, bytes.length);

    List<NodeImpl> tokens = parser.createTokens(chars);

    List<Resource> resources = new ArrayList<Resource>();
//    resources.add(new Resource("img", "src"));
//    resources.add(new Resource("link", "href"));
//    resources.add(new Resource("script", "src"));
View Full Code Here


        }
      }

      public void execute() {
        try {
          HTMLParserDetector detector = new HTMLParserDetector();
          detector.setDecode(decode);
          detector.setCharset(charset);
         
          if(html == null || html.trim().isEmpty()) {
            document = webClient.createDocument(refer, url, cache, detector);
          } else {
//            System.out.println(html);
            document = detector.createDocument(html);
          }
          if(charset == null) charset = detector.getCharset();
          removeIFrameSource(document.getRoot());
        } catch(Exception exp) {
          ClientLog.getInstance().setException(null, exp);
        }  
      }
View Full Code Here

    if(data == null) {
      message = "Error: Not html data!";
      return ERROR;
    }
    if(charset == null) {
      HTMLParserDetector detector = new HTMLParserDetector();
      charset = detector.detectCharset(data);
    }
   
    char [] chars = CharsDecoder.decode(charset, data, 0, data.length);
    return post(referer, formName, url, chars, abort);
  }
View Full Code Here

    this.errorDetector = errorDetector;
  }
 
  private boolean checkTimeout(byte [] data) throws Exception {
    if(data == null) return true;
    HTMLParserDetector parser = new HTMLParserDetector();
    if(charset == null) charset = parser.detectCharset(data);
   
    char [] chars = CharsDecoder.decode(charset, data, 0, data.length);
    List<NodeImpl> tokens  = parser.createTokens(chars);
    if(tokens == null) return true;

    for(int i = 0; i < tokens.size(); i++) {
      NodeImpl node = tokens.get(i);
      if(!node.isNode(Name.INPUT)) continue;
View Full Code Here

    if(data == null) {
      message = "Not html data!";
      return ERROR;
    }
    if(charset == null) {
      HTMLParserDetector detector = new HTMLParserDetector();
      charset = detector.detectCharset(data);
    }
   
    char [] chars = CharsDecoder.decode(charset, data, 0, data.length);
    return post(referer, url, chars);
  }
View Full Code Here

    }
   
    HttpResponseReader responseReader = HttpHandlers.getInstance().createReader();
    byte [] data = responseReader.readBody(response);
   
    HTMLParserDetector htmlParser2 = new HTMLParserDetector();
    if(data == null) return false;
    if(charset == null) {
      charset = htmlParser2.detectCharset(data);
    }
   
    char [] chars = CharsDecoder.decode(charset, data, 0, data.length);
    List<NodeImpl> tokens  = htmlParser2.createTokens(chars);
    if(tokens == null) return false;

    int start = searchPasswordField(tokens);
    LoginUtils loginUtil = new LoginUtils();
   
    if(start == -1) {
      throw new UnknownHostException("Not found login form. Please check login address: "+loginUrl);
    }

    for(; start > -1; start--) {
      NodeImpl node = tokens.get(start);
      if(node.isNode(Name.FORM)) break;
    }

    HTMLNode form = null;
    boolean md5 = false;
    List<HTMLNode> inputs = new ArrayList<HTMLNode>();
    String formValue = null;

    for(int i = start; i < tokens.size(); i++) {
      NodeImpl node = tokens.get(i);
      if(node.isNode(Name.FORM)) {
        if(node.isOpen()) {
          if(!md5) {
            String value = new String(node.getValue());
            md5 = value.toLowerCase().indexOf("md5") > -1;
          }
          form = node;
          formValue = new String(form.getValue()).toLowerCase();
        } else {
          break;
        }
      } else if(node.isNode(Name.INPUT)) {
        if(!md5 && formValue != null) {
          md5 = formValue.indexOf("md5") > -1;
        }
        inputs.add(node);
      }
    }
    if(form == null || inputs.size() < 1) return false;
    String address = getAttribute(form, ACTION);
    if(address == null) return false;
   
    List<NameValuePair> params = new ArrayList<NameValuePair>();
    for(int i = 0 ; i < inputs.size(); i++) {
      Attributes attrs = inputs.get(i).getAttributes();
      String name = getAttribute(attrs, NAME_ATTR);
      if(name == null) getAttribute(attrs, ID_ATTR);
      if(name == null) continue;
      String type = getAttribute(attrs, TYPE_ATTR);
      if(type == null) type = "text";
      if(type.equalsIgnoreCase("text")) {
//        System.out.println(name + " : "+ username);
        params.add(new BasicNameValuePair(name, username));
      } else if(type.equalsIgnoreCase(PASSWORD)) {
        if(md5) password = hexMd5(password);
//        System.out.println(name + " : "+ password);
//        params.add(new BasicNameValuePair(name, password));
        //@TODO Hashcode
//        System.out.println(formValue+ " : " + formValue.indexOf("vb_login_md5password"));
        if(formValue.indexOf("vb_login_md5password") > -1) {
          params.add(new BasicNameValuePair(name, ""));
          params.add(new BasicNameValuePair("vb_login_md5password", password));
          params.add(new BasicNameValuePair("vb_login_md5password_utf", password));
        else {
          params.add(new BasicNameValuePair(name, password));
        }
      } else {
        String value = "";
       
        ParamValue paramValues = maps.get(name);
        if(paramValues == null) {
          value = getAttribute(attrs, VALUE);
        } else {
          value = paramValues.getValues()[0];
        }
       
        if(value != null) {
//          System.out.println(name + " : "+ value);
          params.add(new BasicNameValuePair(name, value == null ? "" : value.trim()));
        }
      }
    }
   
    URLUtils urlUtils = new URLUtils();
    address = urlUtils.createURL(loginUrl, address).trim();
    address = urlUtils.getCanonical(address);
   
    referer = loginUrl.toString();
    WebClient webClient = httpMethod.getWebClient();
    HttpHost httpHost = webClient.createHttpHost(address);
    HttpPost httpPost = webClient.createFormPostMethod(address, referer, params);
    HttpResponse response2 = httpMethod.execute(httpHost, httpPost);
   
    /*statusLine = response2.getStatusLine();
    statusCode = statusLine.getStatusCode();
    if(statusCode == HttpStatus.SC_MOVED_PERMANENTLY
        || statusCode == HttpStatus.SC_SEE_OTHER
        || statusCode == HttpStatus.SC_TEMPORARY_REDIRECT) {
      Header header = response2.getFirstHeader("Location");
      if(header != null
          && header.getValue() != null
          && !header.getValue().trim().isEmpty()) {
        System.out.println(header.getValue());
        try {
          response2 = httpMethod.execute(header.getValue(), null);
        } catch (Throwable e) {
          e.printStackTrace();
        }
        System.out.println(" da xong roi ");
      }
    }*/
   
//    System.out.println("status code "+ statusCode);
//    Header [] headers = response2.getAllHeaders();
//    for(Header header : headers) {
//      System.out.println(header.getName() + " : " + header.getValue());
//    }

    byte [] bytes = new byte[0];
//    org.vietspider.common.io.DataWriter writer = new org.vietspider.common.io.DataWriter();
//    java.io.File file = UtilFile.getFile("track/logs/", "login_" + Utils.toFileName(address));

    loginUtil.setPrevLogin(tokens);
    boolean error  = false;
    try {
      bytes = responseReader.readBody(response2);
    } catch (SocketException e) {
      LogService.getInstance().setMessage("WEB", e, e.toString());
      bytes = e.toString().getBytes();
      error = true;
    } catch (Exception e) {
      LogService.getInstance().setThrowable("WEB", e);
      bytes = e.toString().getBytes();
      error = true;
    }
   
    if(webClient.isLog()) {
      java.io.File file = UtilFile.getFile("track/logs/", "login_" + Utils.toFileName(address));
      LogService.getInstance().setMessage("WEB", null, "Login to: " +address);
      new org.vietspider.common.io.DataWriter().save(file, bytes);
    }
   
    if(bytes == null || bytes.length < 1) {
      error = true;
      throw new Exception("No response from server.");
    }

    if(!error) {
      chars = CharsDecoder.decode(charset, bytes, 0, bytes.length);
      tokens  = htmlParser2.createTokens(chars);
      if(tokens == null) return  false;
      loginUtil.setAfterLogin(tokens);
      if(searchPasswordField(tokens) > 0) throw new Exception(loginUtil.getError());
    }

View Full Code Here

    return createDocument(text.toCharArray());
  }

  public HTMLDocument createDocument(byte[] data, String charset) throws Exception {
    if(charset == null) {
      HTMLParserDetector parserDetector = new HTMLParserDetector();
      return parserDetector.createDocument(data);
    }
    char [] chars = CharsDecoder.decode(charset, data, 0, data.length);
    return createDocument(chars);
 
View Full Code Here

    if(file.exists()) return;
    if(level == depth) return ;
    System.out.println("\nstart download "+ address +" level "+level +" depth "+depth +" ...");
    byte [] bytes = download(parent.toString(), address);
    if(bytes == null || bytes.length < 0) return;
    HTMLParserDetector parser = new HTMLParserDetector();
    String charset = parser.detectCharset(bytes);
    char [] chars = CharsDecoder.decode(charset, bytes, 0, bytes.length);

    List<NodeImpl> tokens = parser.createTokens(chars);

    List<Resource> resources = new ArrayList<Resource>();
//    resources.add(new Resource("img", "src"));
//    resources.add(new Resource("link", "href"));
//    resources.add(new Resource("script", "src"));
View Full Code Here

TOP

Related Classes of org.vietspider.html.util.HTMLParserDetector

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.