Package spiderman.plugin.impl

Source Code of spiderman.plugin.impl.ParsePointImpl

package spiderman.plugin.impl;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.eweb4j.spiderman.fetcher.FetchRequest;
import org.eweb4j.spiderman.fetcher.FetchResult;
import org.eweb4j.spiderman.fetcher.Page;
import org.eweb4j.spiderman.plugin.ParsePoint;
import org.eweb4j.spiderman.spider.SpiderListener;
import org.eweb4j.spiderman.task.Task;
import org.eweb4j.spiderman.xml.Field;
import org.eweb4j.spiderman.xml.Model;
import org.eweb4j.spiderman.xml.Rule;
import org.eweb4j.spiderman.xml.Site;
import org.eweb4j.spiderman.xml.Target;

import spiderman.plugin.util.ModelParser;
import spiderman.plugin.util.UrlUtils;

public class ParsePointImpl implements ParsePoint{

//  private Task task;
  private SpiderListener listener;
//  private Target target ;
//  private Page page;
 
  public void init(Site site, SpiderListener listener) {
    this.listener = listener;
  }

  public void destroy() {
  }
 
//  public synchronized void context(Task task, Target target, Page page) throws Exception{
//    this.task = task;
//    this.target = target;
//    this.page = page;
//  }
 
  public List<Map<String, Object>> parse(Task task, Target target, Page page, List<Map<String, Object>> models) throws Exception {
    List<Map<String, Object>> results = new ModelParser(task, target, listener).parse(page);
   
    //用来记录分页里已经解析的url
    Set<String> visitedUrls = new HashSet<String>();
    visitedUrls.add(task.url);
   
    List<Rule> rules = target.getUrlRules().getRule();
    for (Rule rule : rules) {
      //顺序递归解析下一页的内容
      Map<String, Object> finalFields = new HashMap<String, Object>();
      parseNextPage(rule, target, task, page, results, visitedUrls, finalFields);
    }
    return results;
  }

  //递归的额关键是 Page
  public void parseNextPage(Rule rule, Target target, Task task, Page page, List<Map<String, Object>> results, Set<String> visitedUrls, Map<String,Object> finalFields) throws Exception{
//    System.out.println("parse.next->"+page.getUrl());
    Model mdl = rule.getNextPage();
    if (mdl == null)
      return ;
   
    Target tgt = new Target();
    tgt.setName(target.getName());
    tgt.setModel(mdl);
   
    //解析Model获得next URL
//    System.out.println("page--!!!!!!----->"+page.getUrl());
    Collection<String> nextUrls = UrlUtils.digUrls(page, task, rule, tgt, listener, finalFields);
//    System.out.println("visitedUrls-->>>>>>>>>>>>!!!!!!!!!!!!!!" + visitedUrls);
//    System.out.println("nextUrls-->>>>>>>>>>>>!!!!!!!!!!!!!!" + nextUrls);
    if (nextUrls == null || nextUrls.isEmpty())
      return ;
    String nextUrl = new ArrayList<String>(nextUrls).get(0);
    if (nextUrl == null || nextUrl.trim().length() == 0)
      return ;
   
    if (visitedUrls.contains(nextUrl)){
      return ;
    }
   
    FetchRequest req = new FetchRequest();
    req.setUrl(nextUrl);
    FetchResult fr = task.site.fetcher.fetch(req);
    if (fr == null || fr.getPage() == null)
      return ;
   
    //记录已经访问过该url,下次不要重复访问它
    visitedUrls.add(nextUrl);
   
    //解析nextPage
    Task nextTask = new Task(nextUrl, task.url, task.site, 0);
    Model nextModel = new Model();
    List<Field> isAlsoParseInNextPageFields = target.getModel().getIsAlsoParseInNextPageFields();
    if (isAlsoParseInNextPageFields == null || isAlsoParseInNextPageFields.isEmpty())
      return ;
   
    nextModel.getField().addAll(isAlsoParseInNextPageFields);
    tgt.setModel(nextModel);
   
    ModelParser parser = new ModelParser(nextTask, tgt, listener);
    Page nextPageResult = fr.getPage();
    List<Map<String, Object>> nextMaps = parser.parse(nextPageResult);
    if (nextMaps == null)
      return ;
   
    for (Map<String, Object> nextMap : nextMaps){
      for (Iterator<Entry<String, Object>> it = nextMap.entrySet().iterator(); it.hasNext();){
        Entry<String, Object> e = it.next();
        String key = e.getKey();
        Object value = e.getValue();
        for (Map<String, Object> result : results){
          if (nextModel.isArrayField(key)){
            List<Object> list = (List<Object>) result.get(key);
            list.addAll((List<Object>)value);
          }else{
            StringBuilder sb = new StringBuilder();
            sb.append(result.get(key)).append(value);
            result.put(key, sb.toString());
          }
        }
      }
    }
   
    parseNextPage(rule, target, nextTask, nextPageResult, results, visitedUrls, finalFields);
  }
}
TOP

Related Classes of spiderman.plugin.impl.ParsePointImpl

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.