Package com.zhangwoo.spider.po

Examples of com.zhangwoo.spider.po.UrlRequest


   * 此处接收客户端请求URL信息,返回单个URL
   */
  @Override
  public void messageReceived(IoSession session, Object message)
      throws Exception {
    UrlRequest url = TaskCenter.getUrl();
    session.write(url==null?"":url);
  }
View Full Code Here


  /**
   * 请求单个URL
   */
  public static UrlRequest getUrl() {
    // logger.debug("now need poll , size "+TaskCenter.taskQueue.size());
    UrlRequest urlReq = TaskCenter.taskQueue.poll();
    if (urlReq!=null) {
      UrlState state = new UrlState(urlReq);
      state.setBeginTime(DateUtil.formatDateTime());
      urlState.add(state); // 正在运行的任务
    }
View Full Code Here

        TaskCenter.taskState.put(task, new ArrayList<UrlState>());
        TaskCenter.taskCount.put(task, 0);
        new TaskDao().updateTaskUpdatetime(task);
        logger.info("run task : " + task.getTname() + " url : "
            + task.getTurl());
        UrlRequest urlReq = new UrlRequest(task.getTurl(), task);
        urlReqs.add(urlReq);
      }
    }
    if (urlReqs.size() > 0) {
      new TaskDao().updateTaskNextrun(ts); // 更新任务启动时间
View Full Code Here

    codeProcess();
  }

  public void codeProcess() {
    // 获得request
    UrlRequest urlReq = ClientMessageCenter.getInstance().urlGet();
    if (urlReq == null || StringUtil.isEmpty(urlReq.getUrl()))
      return;

    // *+* 此处首先获取模板,若无模板则无需浪费时间与流量,直接放弃该链接。
    List<AnalyserTemplate> templates = null;
    try {
      templates = this.findAnalyserTemplate(urlReq);
    } catch (Exception e) {
      logger.error(e);
    }
    if (templates == null || templates.size() == 0)
      return;

    logger.info("running url : " + urlReq.getUrl());

    // 发送request
    String html = null;
    for (int i = 0; i < 3; i++) { // 最多发送3次请求
      try {
        html = sendRequest(urlReq);
        if (!StringUtil.isEmpty(html))
          break;
      } catch (Exception e) {
        logger.error("send request url : " + urlReq.getUrl()
            + " error! ", e);
      }
      try {
        Thread.sleep(1000);
      } catch (InterruptedException e) {
        logger.error("thread sleep error! ", e);
      }
    }

    List<Conversation> convsResult = new ArrayList<Conversation>();
    List<UrlRequest> urlReqResult = new ArrayList<UrlRequest>();

    if (!StringUtil.isEmpty(html)) { // html 若无内容  则无需分析
      // 一个个分析模板过程,单个页面可能分析出多组数据
      for (AnalyserTemplate template : templates) {
        if (!template.isMatchTemplate(urlReq, html))
          continue;

        Document docHtml = null;
        try {
          docHtml = XmlUtil.formatToDoc(html);
        } catch (Exception e) {
          logger.error("html fromat to doc error! " + html, e);
        }

        List<Conversation> convsTemp = template.findConversations(
            urlReq, html, docHtml);
        if (convsTemp != null)
          for (Conversation convs : convsTemp) {
            if (convs.getSaveable().equals("1")) {
              convsResult.add(convs);
            }
            if (convs.getRunable().equals("1"))
              urlReqResult.add(new UrlRequest(formatUrl(
                  convs.getSelfLink(), urlReq), urlReq
                  .getTask()));
          }
        urlReqResult.addAll(template.findLinks(urlReq, html, docHtml));
      }

      if (urlReqResult != null && urlReqResult.size() > 0) {
        ClientMessageCenter.getInstance().urlSend(urlReqResult);
        logger.info("url : " + urlReq.getUrl() + " get next url : "
            + urlReqResult.size());
      }
      if (convsResult != null && convsResult.size() > 0) {
        ClientMessageCenter.getInstance().resultSend(convsResult);
        logger.info("url : " + urlReq.getUrl() + " get convsResult : "
            + convsResult.size());
      }
    }
    if ((urlReqResult == null || urlReqResult.size() == 0) && (convsResult == null || convsResult.size() == 0)) {
      logger.info("url : " + urlReq.getUrl() + " get no reuslt!  ");
    }
    ClientMessageCenter.getInstance().stateSend(
        new UrlState(html.length(), urlReqResult.size(), convsResult
            .size(), urlReq));
  }
View Full Code Here

  public List<UrlRequest> findLinks(UrlRequest urlReq, String html,
      Document docHtml) {
    List<UrlRequest> links = new ArrayList<UrlRequest>();
    try {
      UrlRequest urlTemp = null;

      NodeList linksNode = (NodeList) xpath.evaluate("LINKS/LINK",
          this.templateNode, XPathConstants.NODESET);
      if (linksNode != null && linksNode.getLength() > 0) {
        for (int linki = 0; linki < linksNode.getLength(); linki++) {
          String linkRegExp = xpath.evaluate("@regexp",
              linksNode.item(linki));
          String linkStopRegExp = xpath.evaluate("@stopregexp",
              linksNode.item(linki));
          String linkStopXpath = xpath.evaluate("@stopxpath",
              linksNode.item(linki));

          List<String[]> linkUrls = StringUtil.matchAll(html, linkRegExp);
          List<String[]> linkStops = null;
          if(!StringUtil.isEmpty(linkStopRegExp))
            linkStops = StringUtil.matchAll(html, linkStopRegExp);
          else if(!StringUtil.isEmpty(linkStopXpath)){
            linkStops=new ArrayList<String[]>();
            linkStops.add(new String[]{"",xpath.evaluate(linkStopXpath, docHtml).trim()});
          }

          for (int i = 0; i < linkUrls.size(); i++) {
            String linkUrl = linkUrls.get(i)[1];
            String linkStop = null;
            if(linkStops!=null&&linkStops.size()>0){
              if(i>linkStops.size()-1)
                linkStop=linkStops.get(linkStops.size()-1)[1];
              else
                linkStop=linkStops.get(i)[1];
            }
            if (!StringUtil.isEmpty(linkStop)
                && urlReq.getTask() != null
                && !StringUtil.isEmpty(urlReq.getTask()
                    .getUpdatetime())) {
              if (DateUtil.diffDate(DateUtil.SECOND, DateUtil
                  .stringToDate(urlReq.getTask()
                      .getUpdatetime()), DateUtil
                  .stringToDate(linkStop)) < 0) {
                continue;
              }
            }
            if (StringUtil.isEmpty(linkUrl))
              continue;
            urlTemp = (UrlRequest) urlReq.clone();
            urlTemp.setHeader(null);
            urlTemp.setUrl(SpiderThread.formatUrl(linkUrl, this.urlReq));
            links.add(urlTemp);
          }
        }
      }
    } catch (Exception e) {
View Full Code Here

      return links;
    }
   
    int nowPage=Integer.valueOf(StringUtil.match(urlReq.getUrl(),"page=(\\d+)")[1]);
    ++nowPage;
    links.add(new UrlRequest(
        "http://product.dangdang.com/comment/main.php?product_id"+urlReq.getUrl().substring(urlReq.getUrl().indexOf("="))+"&page="+nowPage+"&filtertype=1&type=part",
        urlReq.getTitle(),
        urlReq.getTask()));
    return links;
  }
View Full Code Here

  }

  public List<UrlRequest> findLinks(UrlRequest urlReq, String html,
      Document docHtml) {
    List<UrlRequest> links = new ArrayList<UrlRequest>();
    links.add(new UrlRequest(
        "http://product.dangdang.com/comment/main.php?product_id="+StringUtil.match(urlReq.getUrl(), "product_id=(\\d+)")[1]+"&page=1&filtertype=1&type=part",
        StringUtil.match(html, "<title>(.*)</title>")[1],
        urlReq.getTask()));
    return links;
  }
View Full Code Here

TOP

Related Classes of com.zhangwoo.spider.po.UrlRequest

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.