Package com.zhangwoo.spider.client.process

Source Code of com.zhangwoo.spider.client.process.XmlAnalyserTemplate

package com.zhangwoo.spider.client.process;

import java.util.ArrayList;
import java.util.List;

import javax.script.Invocable;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import com.meiya.common.DateUtil;
import com.meiya.common.XmlUtil;
import com.meiya.common.string.StringUtil;
import com.zhangwoo.spider.po.Conversation;
import com.zhangwoo.spider.po.UrlRequest;

@SuppressWarnings("restriction")
public class XmlAnalyserTemplate implements AnalyserTemplate {
  XPath xpath = XPathFactory.newInstance().newXPath();
  ScriptEngineManager sem = new ScriptEngineManager();
  ScriptEngine engine = sem.getEngineByName("javascript");

  Node templateNode = null;
  UrlRequest urlReq = null;

  Logger logger = Logger.getLogger(this.getClass().getName());

  public XmlAnalyserTemplate(Node templateNode,UrlRequest urlReq) {
    this.templateNode = templateNode;
    this.urlReq = urlReq;
  }

  public boolean isMatchTemplate(UrlRequest urlReq, String html) {
    try {
      String include = xpath.evaluate("RULE/INDEXOF/text()",
          this.templateNode);
      String regexp = xpath.evaluate("RULE/REGEXP/text()",
          this.templateNode);
      String code = xpath.evaluate("RULE/CODE/text()", this.templateNode);

      // 三种方式任意一种不匹配则认定不适用,直接否决当前模版
      if (!StringUtil.isEmpty(include)) {
        if (urlReq.getUrl().indexOf(include) == -1)
          return false;
      }
      if (!StringUtil.isEmpty(regexp)) {
        if (StringUtil.matchWeak(urlReq.getUrl(), regexp) == null)
          return false;
      }
      if (!StringUtil.isEmpty(code)) {
        engine.eval(code);
        Invocable jsInvoke = (Invocable) engine;
        if (jsInvoke
            .invokeFunction("ismatch",
                new Object[] { urlReq.getUrl(), html })
            .toString().trim().equals("0"))
          return false; // js 代码 返回1 是,0否
      }
    } catch (Exception e) {
      logger.error(
          "isMatchTemplate domain " + urlReq.getUrl()
              + " error ,  template xml "
              + XmlUtil.printNodeXml(templateNode), e);
    }
    return true;
  }

  public List<Conversation> findConversations(UrlRequest urlReq, String html,
      Document docHtml) {
    try {
      Node analyserNode = (Node) xpath.evaluate("ANALYSER",
          this.templateNode, XPathConstants.NODE);
      if (analyserNode != null) {
        String anSave = xpath.evaluate("@save", analyserNode);
        String anXpath = xpath.evaluate("@xpath", analyserNode);
        String anRegexp = xpath.evaluate("@regexp", analyserNode);

        Conversation convsXpath = getXpaths(analyserNode);
        Conversation convsRegExp = getRegExps(analyserNode);
        List<Conversation> convsResults = new ArrayList<Conversation>();

        // reg(可能json) 与 xpath(html标签) 走完全不同的路线
        if (!StringUtil.isEmpty(anXpath)
            && StringUtil.isEmpty(anRegexp)) { // 纯粹XPATH,给出结果一定是NodeList
          NodeList cons = (NodeList) xpath.evaluate(anXpath, docHtml,
              XPathConstants.NODESET);
          Conversation conTemp = new Conversation();
          for (int consi = 0; consi < cons.getLength(); consi++) {
            conTemp = analyserConversation(convsXpath, convsRegExp,
                cons.item(consi), null);
            conTemp.setSaveable(anSave);
            if(StringUtil.isEmpty(conTemp.getSelfLink())){
              conTemp.setSelfLink(urlReq.getUrl());
            }
            conTemp.setTid(urlReq.getTask().getTid());
            convsResults.add(conTemp);
          }
        } else if (!StringUtil.isEmpty(anRegexp)) {
          // List<String[]> cons=StringUtil.matchAll(anRegexp, html);
        }

        return convsResults;
      }
    } catch (Exception e) {
      logger.error(
          "isMatchTemplate domain " + urlReq.getUrl()
              + " error ,  template xml "
              + XmlUtil.printNodeXml(templateNode), e);
    }
    return null;
  }

  public List<UrlRequest> findLinks(UrlRequest urlReq, String html,
      Document docHtml) {
    List<UrlRequest> links = new ArrayList<UrlRequest>();
    try {
      UrlRequest urlTemp = null;

      NodeList linksNode = (NodeList) xpath.evaluate("LINKS/LINK",
          this.templateNode, XPathConstants.NODESET);
      if (linksNode != null && linksNode.getLength() > 0) {
        for (int linki = 0; linki < linksNode.getLength(); linki++) {
          String linkRegExp = xpath.evaluate("@regexp",
              linksNode.item(linki));
          String linkStopRegExp = xpath.evaluate("@stopregexp",
              linksNode.item(linki));
          String linkStopXpath = xpath.evaluate("@stopxpath",
              linksNode.item(linki));

          List<String[]> linkUrls = StringUtil.matchAll(html, linkRegExp);
          List<String[]> linkStops = null;
          if(!StringUtil.isEmpty(linkStopRegExp))
            linkStops = StringUtil.matchAll(html, linkStopRegExp);
          else if(!StringUtil.isEmpty(linkStopXpath)){
            linkStops=new ArrayList<String[]>();
            linkStops.add(new String[]{"",xpath.evaluate(linkStopXpath, docHtml).trim()});
          }

          for (int i = 0; i < linkUrls.size(); i++) {
            String linkUrl = linkUrls.get(i)[1];
            String linkStop = null;
            if(linkStops!=null&&linkStops.size()>0){
              if(i>linkStops.size()-1)
                linkStop=linkStops.get(linkStops.size()-1)[1];
              else
                linkStop=linkStops.get(i)[1];
            }
            if (!StringUtil.isEmpty(linkStop)
                && urlReq.getTask() != null
                && !StringUtil.isEmpty(urlReq.getTask()
                    .getUpdatetime())) {
              if (DateUtil.diffDate(DateUtil.SECOND, DateUtil
                  .stringToDate(urlReq.getTask()
                      .getUpdatetime()), DateUtil
                  .stringToDate(linkStop)) < 0) {
                continue;
              }
            }
            if (StringUtil.isEmpty(linkUrl))
              continue;
            urlTemp = (UrlRequest) urlReq.clone();
            urlTemp.setHeader(null);
            urlTemp.setUrl(SpiderThread.formatUrl(linkUrl, this.urlReq));
            links.add(urlTemp);
          }
        }
      }
    } catch (Exception e) {
      logger.error(
          "isMatchTemplate domain " + urlReq.getUrl()
              + " error ,  template xml "
              + XmlUtil.printNodeXml(templateNode), e);
    }
    return links;
  }
 

  private Conversation getRegExps(Node analyserNode)
      throws XPathExpressionException {
    Conversation convsRegExp = new Conversation();
    convsRegExp.setAuthor(xpath.evaluate("AUTHOR/@regexp", analyserNode));
    convsRegExp.setContent(xpath.evaluate("CONTENT/@regexp", analyserNode));
    convsRegExp.setMainLink(xpath
        .evaluate("MAINLINK/@regexp", analyserNode));
    convsRegExp.setPublishTime(xpath.evaluate("PUBLISHTIME/@regexp",
        analyserNode));
    convsRegExp.setSelfLink(xpath
        .evaluate("SELFLINK/@regexp", analyserNode));
    convsRegExp.setTitle(xpath.evaluate("TITLE/@regexp", analyserNode));
    convsRegExp.setUpdateTime(xpath.evaluate("UPDATETIME/@regexp",
        analyserNode));
    convsRegExp.setIsTopic(xpath.evaluate("ISTOPIC/@regexp", analyserNode));
    convsRegExp.setStopByExp(xpath.evaluate("SELFLINK/@stopByExp",
        analyserNode));
    return convsRegExp;
  }

  private Conversation getXpaths(Node analyserNode)
      throws XPathExpressionException {
    Conversation convsXpath = new Conversation();
    convsXpath.setAuthor(xpath.evaluate("AUTHOR/@xpath", analyserNode).trim());
    convsXpath.setContent(xpath.evaluate("CONTENT/@xpath", analyserNode).trim());
    convsXpath.setMainLink(xpath.evaluate("MAINLINK/@xpath", analyserNode).trim());
    convsXpath.setPublishTime(xpath.evaluate("PUBLISHTIME/@xpath",
        analyserNode).trim());
    convsXpath.setSelfLink(xpath.evaluate("SELFLINK/@xpath", analyserNode).trim());
    convsXpath.setTitle(xpath.evaluate("TITLE/@xpath", analyserNode).trim());
    convsXpath.setUpdateTime(xpath.evaluate("UPDATETIME/@xpath",
        analyserNode).trim());
    convsXpath.setIsTopic(xpath.evaluate("ISTOPIC/@xpath", analyserNode).trim());
    convsXpath.setStopByXpath(xpath.evaluate("SELFLINK/@stopByXpath",
        analyserNode).trim());
    convsXpath
        .setRunable(xpath.evaluate("SELFLINK/@runable", analyserNode).trim());
    return convsXpath;
  }

  /**
   * 根据模版的配置,
   *
   * @param analyserNode
   * @return
   * @throws XPathExpressionException
   */
  private Conversation analyserConversation(Conversation convsXpath,
      Conversation convsRegExp, Node node, String str)
      throws XPathExpressionException {
    Conversation resutl = new Conversation();

    if (node == null) {

    } else if (StringUtil.isEmpty(str)) {
      if (!StringUtil.isEmpty(convsXpath.getAuthor()))
        resutl.setAuthor(xpath.evaluate(convsXpath.getAuthor(), node).trim());
      if (!StringUtil.isEmpty(convsXpath.getContent()))
        resutl.setContent(xpath.evaluate(convsXpath.getContent(), node).trim());
      if (!StringUtil.isEmpty(convsXpath.getIsTopic()))
        resutl.setIsTopic(xpath.evaluate(convsXpath.getIsTopic(), node).trim());
      if (!StringUtil.isEmpty(convsXpath.getMainLink()))
        resutl.setMainLink(xpath.evaluate(convsXpath.getMainLink(),
            node).trim());
      if (!StringUtil.isEmpty(convsXpath.getPublishTime()))
        resutl.setPublishTime(xpath.evaluate(
            convsXpath.getPublishTime(), node).trim());
      if (!StringUtil.isEmpty(convsXpath.getRunable()))
        resutl.setRunable(xpath.evaluate(convsXpath.getRunable(), node).trim());
      if (!StringUtil.isEmpty(convsXpath.getSelfLink()))
        resutl.setSelfLink(SpiderThread.formatUrl(xpath.evaluate(convsXpath.getSelfLink(),
            node).trim(),this.urlReq));
      if (!StringUtil.isEmpty(convsXpath.getTitle()))
        resutl.setTitle(xpath.evaluate(convsXpath.getTitle(), node).trim());
      if (!StringUtil.isEmpty(convsXpath.getUpdateTime()))
        resutl.setUpdateTime(xpath.evaluate(convsXpath.getUpdateTime(),
            node).trim());
      if (!StringUtil.isEmpty(convsXpath.getRunable()))
        resutl.setRunable(xpath.evaluate(convsXpath.getRunable(), node).trim());
    }

    return resutl;
  }
}
TOP

Related Classes of com.zhangwoo.spider.client.process.XmlAnalyserTemplate

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.