Package org.eweb4j.spiderman.xml

Examples of org.eweb4j.spiderman.xml.Target


//    System.out.println("parse.next->"+page.getUrl());
    // Nothing to do when the rule defines no next-page model.
    Model mdl = rule.getNextPage();
    if (mdl == null)
      return ;
   
    // Temporary target that reuses the current target's name but carries the next-page model.
    Target tgt = new Target();
    tgt.setName(target.getName());
    tgt.setModel(mdl);
   
    // Parse the Model to obtain the next URL
//    System.out.println("page--!!!!!!----->"+page.getUrl());
    Collection<String> nextUrls = UrlUtils.digUrls(page, task, rule, tgt, listener, finalFields);
//    System.out.println("visitedUrls-->>>>>>>>>>>>!!!!!!!!!!!!!!" + visitedUrls);
//    System.out.println("nextUrls-->>>>>>>>>>>>!!!!!!!!!!!!!!" + nextUrls);
    if (nextUrls == null || nextUrls.isEmpty())
      return ;
    // Only the first dug URL is used as the next page; any others are ignored.
    String nextUrl = new ArrayList<String>(nextUrls).get(0);
    if (nextUrl == null || nextUrl.trim().length() == 0)
      return ;
   
    // Stop if this URL has already been visited.
    if (visitedUrls.contains(nextUrl)){
      return ;
    }
   
    FetchRequest req = new FetchRequest();
    req.setUrl(nextUrl);
    FetchResult fr = task.site.fetcher.fetch(req);
    if (fr == null || fr.getPage() == null)
      return ;
   
    // Record that this url has been visited so it is not visited again next time
    // (only reached when the fetch above returned a page).
    visitedUrls.add(nextUrl);
   
    // Parse the next page
    Task nextTask = new Task(nextUrl, task.url, task.site, 0);
    Model nextModel = new Model();
    // Return when the target's model provides no fields from getIsAlsoParseInNextPageFields().
    List<Field> isAlsoParseInNextPageFields = target.getModel().getIsAlsoParseInNextPageFields();
    if (isAlsoParseInNextPageFields == null || isAlsoParseInNextPageFields.isEmpty())
      return ;
   
    // Re-point tgt at a model holding only those fields before parsing the fetched page.
    nextModel.getField().addAll(isAlsoParseInNextPageFields);
    tgt.setModel(nextModel);
   
    ModelParser parser = new ModelParser(nextTask, tgt, listener);
    Page nextPageResult = fr.getPage();
    List<Map<String, Object>> nextMaps = parser.parse(nextPageResult);
    if (nextMaps == null)
View Full Code Here


  /**
   * Assigns a sort value to the given tasks (excerpt: the else branch and the
   * rest of the method are not shown here).
   *
   * @param tasks the tasks to process
   */
  public synchronized Collection<Task> sortTasks(Collection<Task> tasks) throws Exception {
    // Per-task offset: grows by 0.0001 on every iteration and is added to the sort value.
    double i = 0;
    for (Task task : tasks) {
      i += 0.0001;
      // Check whether the url matches the target's url rules, and whether it came from a source url
      Target tgt = Util.isTargetUrl(task);
      boolean isFromSourceUrl = SourceUrlChecker.checkSourceUrl(site.getTargets().getSourceRules(), task.sourceUrl);
//      System.out.println(tgt+", isFrom->"+isFromSourceUrl+", url->"+task.url+", source->"+task.sourceUrl);
      if (tgt != null && isFromSourceUrl){
        // Sort value built from the string "0.<current millis>" plus the per-task offset
        // (presumably CommonUtil.toDouble parses that string as a double - TODO confirm).
        task.sort = 0 + CommonUtil.toDouble("0."+System.currentTimeMillis()) + i;
      }else{
View Full Code Here

   
    // Excerpt: wraps each new url in a Task whose source url is the current task's url.
    Collection<Task> validTasks = new ArrayList<Task>();
    for (String url : newUrls){
      Task newTask = new Task(url, task.url, site, 10);
      try {
        Target tgt = Util.isTargetUrl(newTask);
        boolean isFromSourceUrl = SourceUrlChecker.checkSourceUrl(site.getTargets().getSourceRules(), newTask.sourceUrl);
        // If it is a target url but did not come from a source url, skip it
        if (tgt != null && !isFromSourceUrl){
          continue;
        }
View Full Code Here

        }
       
        // Stays false unless the check below marks the task as valid.
        boolean isValid = false;
        try {
          // If it is a target url and it came from a sourceUrl, it is valid
          Target tgt = Util.isTargetUrl(task);
          boolean isFromSourceUrl = SourceUrlChecker.checkSourceUrl(site.getTargets().getSourceRules(), task.sourceUrl);
          if (tgt != null && isFromSourceUrl){
            isValid = true;
          }
//          System.out.println("isFromSourceUrl->"+isFromSourceUrl+", isTgt->"+tgt==null+", url->"+task.url);
View Full Code Here

        // Skip this iteration when isSourceUrl is false.
        if (!isSourceUrl)
          continue;
       
        Map<String, Object> finalFields = new HashMap<String,Object>();
       
        // Temporary target named "dig_urls" that carries the dig model.
        Target tgt = new Target();
        tgt.setName("dig_urls");
        tgt.setModel(digModel);
        Collection<String> newUrls = UrlUtils.digUrls(result.getPage(), task, r, tgt, listener, finalFields);       
//        System.out.println("digUrls 得到:"+newUrls.size() + " ----->  " + newUrls);
        // Parse the Model to obtain urls
        // NOTE(review): newUrls is not null-checked here; the other excerpt on this page
        // guards digUrls' result against null - confirm whether it can return null.
        urls.addAll(newUrls);
       
View Full Code Here

//    System.out.println("parse.next->"+page.getUrl());
    // Nothing to do when the rule defines no next-page model.
    Model mdl = rule.getNextPage();
    if (mdl == null)
      return ;
   
    // Temporary target named "dig_urls" that carries the next-page model.
    Target tgt = new Target();
    tgt.setName("dig_urls");
    tgt.setModel(mdl);
   
    // Parse the Model to obtain the next URL
//    System.out.println("page--!!!!!!----->"+page.getUrl());
    Collection<String> nextUrls = UrlUtils.digUrls(page, task, rule, tgt, listener, finalFields);
//    System.out.println("visitedUrls-->>>>>>>>>>>>!!!!!!!!!!!!!!" + visitedUrls);
View Full Code Here

TOP

Related Classes of org.eweb4j.spiderman.xml.Target

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.