Package com.zhangwoo.spider.client.process

Source Code of com.zhangwoo.spider.client.process.SpiderThread

package com.zhangwoo.spider.client.process;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import com.meiya.common.FileUtil;
import com.meiya.common.XmlUtil;
import com.meiya.common.string.StringUtil;
import com.zhangwoo.spider.po.Conversation;
import com.zhangwoo.spider.po.UrlRequest;
import com.zhangwoo.spider.po.UrlState;

/**
* 本类主要职责为:
* <ol>
* <li>从服务端获取请求信息,即获得一个<code>UrlRequest</code></li>
* <li>发送该请求(此处有可能需要选择线路),获得对应源码</li>
* <li>读取解析模板配置,形成模板列表</li>
* <li>执行解析过程,并返回解析产物(包括实体与链接两种)</li>
* <li>日志记录</li>
* <ol>
*
* @author cchen
*
*/
public class SpiderThread extends Thread {
  XPath xpath = XPathFactory.newInstance().newXPath();
  static List<String> userAgents = new ArrayList<String>();
  private Logger logger = Logger.getLogger(SpiderThread.class.getName());

  static {
    userAgents = new ArrayList<String>();
    userAgents
        .add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11");
    userAgents
        .add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER");
    userAgents
        .add("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)");
    userAgents
        .add("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)");
    userAgents
        .add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER");
    userAgents
        .add("Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E) ");
    userAgents
        .add("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400) ");
    userAgents
        .add("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) ");
    userAgents
        .add("Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)");
    userAgents
        .add("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) ");
    userAgents
        .add("Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");
    userAgents
        .add("Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1");
    userAgents
        .add("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1");
    userAgents
        .add("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) ");
    userAgents
        .add("Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E) ");
    userAgents
        .add("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E) ");
    userAgents
        .add("Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0");
    userAgents
        .add("Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0) ");
  }

  @Override
  public void run() {
    codeProcess();
  }

  public void codeProcess() {
    // 获得request
    UrlRequest urlReq = ClientMessageCenter.getInstance().urlGet();
    if (urlReq == null || StringUtil.isEmpty(urlReq.getUrl()))
      return;

    // *+* 此处首先获取模板,若无模板则无需浪费时间与流量,直接放弃该链接。
    List<AnalyserTemplate> templates = null;
    try {
      templates = this.findAnalyserTemplate(urlReq);
    } catch (Exception e) {
      logger.error(e);
    }
    if (templates == null || templates.size() == 0)
      return;

    logger.info("running url : " + urlReq.getUrl());

    // 发送request
    String html = null;
    for (int i = 0; i < 3; i++) { // 最多发送3次请求
      try {
        html = sendRequest(urlReq);
        if (!StringUtil.isEmpty(html))
          break;
      } catch (Exception e) {
        logger.error("send request url : " + urlReq.getUrl()
            + " error! ", e);
      }
      try {
        Thread.sleep(1000);
      } catch (InterruptedException e) {
        logger.error("thread sleep error! ", e);
      }
    }

    List<Conversation> convsResult = new ArrayList<Conversation>();
    List<UrlRequest> urlReqResult = new ArrayList<UrlRequest>();

    if (!StringUtil.isEmpty(html)) { // html 若无内容  则无需分析
      // 一个个分析模板过程,单个页面可能分析出多组数据
      for (AnalyserTemplate template : templates) {
        if (!template.isMatchTemplate(urlReq, html))
          continue;

        Document docHtml = null;
        try {
          docHtml = XmlUtil.formatToDoc(html);
        } catch (Exception e) {
          logger.error("html fromat to doc error! " + html, e);
        }

        List<Conversation> convsTemp = template.findConversations(
            urlReq, html, docHtml);
        if (convsTemp != null)
          for (Conversation convs : convsTemp) {
            if (convs.getSaveable().equals("1")) {
              convsResult.add(convs);
            }
            if (convs.getRunable().equals("1"))
              urlReqResult.add(new UrlRequest(formatUrl(
                  convs.getSelfLink(), urlReq), urlReq
                  .getTask()));
          }
        urlReqResult.addAll(template.findLinks(urlReq, html, docHtml));
      }

      if (urlReqResult != null && urlReqResult.size() > 0) {
        ClientMessageCenter.getInstance().urlSend(urlReqResult);
        logger.info("url : " + urlReq.getUrl() + " get next url : "
            + urlReqResult.size());
      }
      if (convsResult != null && convsResult.size() > 0) {
        ClientMessageCenter.getInstance().resultSend(convsResult);
        logger.info("url : " + urlReq.getUrl() + " get convsResult : "
            + convsResult.size());
      }
    }
    if ((urlReqResult == null || urlReqResult.size() == 0) && (convsResult == null || convsResult.size() == 0)) {
      logger.info("url : " + urlReq.getUrl() + " get no reuslt!  ");
    }
    ClientMessageCenter.getInstance().stateSend(
        new UrlState(html.length(), urlReqResult.size(), convsResult
            .size(), urlReq));
  }

  public static String formatUrl(String selfUrl, UrlRequest urlReq) {
    if (selfUrl == null)
      return null;

    if (selfUrl.startsWith("http://") || selfUrl.startsWith("https://"))
      return selfUrl;

    String headUrl = null;
    int tmpStop = -1;
    if (selfUrl.startsWith("/")) {
      tmpStop = urlReq.getUrl().indexOf("/", 10);
    } else if (selfUrl.startsWith("?")) {
      // 添加参数的方式,需要移除#以及?等其他符号。
      tmpStop = urlReq.getUrl().indexOf("?");
      if(tmpStop>urlReq.getUrl().indexOf("#")){
        urlReq.getUrl().indexOf("#");
      }
      if(tmpStop<=0){tmpStop=urlReq.getUrl().length();}
    } else {
      tmpStop = urlReq.getUrl().lastIndexOf("/") + 1;
    }
    if (tmpStop == -1)
      tmpStop = urlReq.getUrl().length();
    headUrl = urlReq.getUrl().substring(0, tmpStop);
    selfUrl = headUrl + selfUrl;
    return selfUrl;
  }

  public static String sendRequest(UrlRequest urlReq) throws Exception {
//    Thread.sleep(3000);
   
    HttpClient httpclient = HttpClientBuilder.create().build();
    HttpResponse response = null;
    String html = null;

    if (urlReq.getMethod().toLowerCase().endsWith("get")) {
      HttpGet httpGet = new HttpGet(urlReq.getUrl());
      try {
        if (urlReq.getHeader() != null) {
          for (String name : urlReq.getHeader().keySet()) {
            httpGet.addHeader(name, urlReq.getHeader().get(name));
          }
        }
        // 若没有指定User-Agent则使用随机User-Agent
        if (urlReq.getHeader() == null
            || !urlReq.getHeader().containsKey("User-Agent")) {
          httpGet.addHeader("User-Agent", userAgents.get(new Random()
              .nextInt(userAgents.size())));
        }
        httpGet.addHeader("Connection", "close");

        response = httpclient.execute(httpGet);

        HttpEntity entity = response.getEntity();
        html = FileUtil.readStream(entity.getContent());

        EntityUtils.consume(entity);
      } catch (Exception e) {
        throw e;
      } finally {
        httpGet.releaseConnection();
      }
    } else {
      HttpPost httpPost = new HttpPost(urlReq.getUrl());
      try {
        for (String name : urlReq.getHeader().keySet()) {
          httpPost.addHeader(name, urlReq.getHeader().get(name));
        }

        List<NameValuePair> nvps = new ArrayList<NameValuePair>();
        nvps.add(new BasicNameValuePair("UserName", "cchen"));
        nvps.add(new BasicNameValuePair("password", "123456"));
        httpPost.setEntity(new UrlEncodedFormEntity(nvps, Consts.UTF_8));

        response = httpclient.execute(httpPost);

        HttpEntity entity = response.getEntity();
        html = FileUtil.readStream(entity.getContent());

        EntityUtils.consume(entity);
      } catch (Exception e) {
        throw e;
      } finally {
        httpPost.releaseConnection();
      }
    }
    return html;
  }

  private String getUrlDomain(UrlRequest urlReq) {
    return StringUtil.match(urlReq.getUrl(), "http://([^/]*)")[1];
  }

  /**
   * 构建所有解析模板,来源有两种(Xml配置与硬编码)
   *
   * @param urlRequest
   * @return
   * @throws XPathExpressionException
   * @throws SAXException
   * @throws IOException
   */
  private List<AnalyserTemplate> findAnalyserTemplate(UrlRequest urlReq)
      throws XPathExpressionException, IOException, SAXException {
    List<AnalyserTemplate> templates = new ArrayList<AnalyserTemplate>();

    Document doc = XmlUtil.formatToDoc(FileUtil.readStream(
        this.getClass().getResourceAsStream(
            "/plugins/" + getUrlDomain(urlReq) + ".xml"), "utf-8"));

    NodeList nodes = (NodeList) xpath.evaluate("//TEMPLATE", doc,
        XPathConstants.NODESET);
    for (int i = 0; i < nodes.getLength(); i++) {
      // 首先确定是硬编码(存在class属性)还是配置文件
      String classname = xpath.evaluate("@classname", nodes.item(i));

      try {
        if (!StringUtil.isEmpty(classname)) {
          templates.add((AnalyserTemplate) Class.forName(classname)
              .newInstance());
        } else {
          templates
              .add(new XmlAnalyserTemplate(nodes.item(i), urlReq));
        }
      } catch (Exception e) {
        logger.error("analyser template error!  \n node xml is : {}"
            + XmlUtil.printNodeXml(nodes.item(i)), e);
      }
    }

    return templates;
  }

  public static void main(String[] args) {
    new SpiderThread().start();
//    try {
//      String t=new SpiderThread().sendRequest(new UrlRequest("http://www.5nd.com/5nd_1.htm",null));
//      System.out.println(t);
//    } catch (Exception e) {
//      // TODO Auto-generated catch block
//      e.printStackTrace();
//    }
  }

}
TOP

Related Classes of com.zhangwoo.spider.client.process.SpiderThread

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.