Package org.eweb4j.spiderman.xml

Examples of org.eweb4j.spiderman.xml.Model


  }

  // The key to the recursion is the Page
  public void parseNextPage(Rule rule, Target target, Task task, Page page, List<Map<String, Object>> results, Set<String> visitedUrls, Map<String,Object> finalFields) throws Exception{
//    System.out.println("parse.next->"+page.getUrl());
    Model mdl = rule.getNextPage();
    if (mdl == null)
      return ;
   
    Target tgt = new Target();
    tgt.setName(target.getName());
    tgt.setModel(mdl);
   
    //解析Model获得next URL
//    System.out.println("page--!!!!!!----->"+page.getUrl());
    Collection<String> nextUrls = UrlUtils.digUrls(page, task, rule, tgt, listener, finalFields);
//    System.out.println("visitedUrls-->>>>>>>>>>>>!!!!!!!!!!!!!!" + visitedUrls);
//    System.out.println("nextUrls-->>>>>>>>>>>>!!!!!!!!!!!!!!" + nextUrls);
    if (nextUrls == null || nextUrls.isEmpty())
      return ;
    String nextUrl = new ArrayList<String>(nextUrls).get(0);
    if (nextUrl == null || nextUrl.trim().length() == 0)
      return ;
   
    if (visitedUrls.contains(nextUrl)){
      return ;
    }
   
    FetchRequest req = new FetchRequest();
    req.setUrl(nextUrl);
    FetchResult fr = task.site.fetcher.fetch(req);
    if (fr == null || fr.getPage() == null)
      return ;
   
    //记录已经访问过该url,下次不要重复访问它
    visitedUrls.add(nextUrl);
   
    //解析nextPage
    Task nextTask = new Task(nextUrl, task.url, task.site, 0);
    Model nextModel = new Model();
    List<Field> isAlsoParseInNextPageFields = target.getModel().getIsAlsoParseInNextPageFields();
    if (isAlsoParseInNextPageFields == null || isAlsoParseInNextPageFields.isEmpty())
      return ;
   
    nextModel.getField().addAll(isAlsoParseInNextPageFields);
    tgt.setModel(nextModel);
   
    ModelParser parser = new ModelParser(nextTask, tgt, listener);
    Page nextPageResult = fr.getPage();
    List<Map<String, Object>> nextMaps = parser.parse(nextPageResult);
    if (nextMaps == null)
      return ;
   
    for (Map<String, Object> nextMap : nextMaps){
      for (Iterator<Entry<String, Object>> it = nextMap.entrySet().iterator(); it.hasNext();){
        Entry<String, Object> e = it.next();
        String key = e.getKey();
        Object value = e.getValue();
        for (Map<String, Object> result : results){
          if (nextModel.isArrayField(key)){
            List<Object> list = (List<Object>) result.get(key);
            list.addAll((List<Object>)value);
          }else{
            StringBuilder sb = new StringBuilder();
            sb.append(result.get(key)).append(value);
View Full Code Here


      //用来记录分页里已经解析的url
      Set<String> visitedUrls = new HashSet<String>();
      visitedUrls.add(task.url);
     
      for (Rule r : rules.getRule()){
        Model digModel = r.getDigUrls();
        if (digModel == null)
          continue;
        if (!isDig)
          isDig = true;
       
View Full Code Here

  }
 
  // The key to the recursion is the Page
  public void parseNextPage(Rule rule, Task task, Page page, Collection<String> urls, Set<String> visitedUrls, Map<String, Object> finalFields) throws Exception{
//    System.out.println("parse.next->"+page.getUrl());
    Model mdl = rule.getNextPage();
    if (mdl == null)
      return ;
   
    Target tgt = new Target();
    tgt.setName("dig_urls");
View Full Code Here

TOP

Related Classes of org.eweb4j.spiderman.xml.Model

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.