Package org.exoplatform.test.crawler

Source Code of org.exoplatform.test.crawler.CrawlerService

/**
* Copyright (C) 2009 eXo Platform SAS.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/

package org.exoplatform.test.crawler;

import org.exoplatform.services.html.HTMLDocument;
import org.exoplatform.services.html.parser.HTMLParser;
import org.exoplatform.services.html.path.NodePath;
import org.exoplatform.services.html.path.NodePathParser;
import org.exoplatform.services.html.path.NodePathUtil;
import org.exoplatform.services.html.util.HyperLinkUtil;

import java.net.URL;
import java.util.List;

/**
* Created by The eXo Platform SARL
* Author : Lai Van Khoi
*          laivankhoi46pm1@yahoo.com
* Dec 1, 2006 
*/
public class CrawlerService extends Thread
{
   private SingleCrawlThread[] childrenThread;

   private boolean complete = false;

   private String url_;

   private String charset_ = "utf-8";

   private NodePath homePath_, childPath_; //Node.

   private List<String> links; //holds all links for downloading.

   private int idx = 0; //the order of the Thread (of URL-Link) is being downloaded.

   private HyperLinkUtil linkUtil = new HyperLinkUtil();

   //-------------------------
   public CrawlerService()
   {
      this.childrenThread = new SingleCrawlThread[3];
      for (int i = 0; i < this.childrenThread.length; i++)
      {
         this.childrenThread[i] = new SingleCrawlThread();
      }
      new Thread(this).start(); //Note.
   }

   //--------------------------
   public void startCrawl(String url, String charset, String homePath, String childPath) throws Exception
   {
      this.url_ = url;
      this.charset_ = charset;
      this.homePath_ = NodePathParser.toPath(homePath);
      this.childPath_ = NodePathParser.toPath(childPath);
      this.childrenThread[0].startDownload(this.url_);
   }

   //--------------------------
   public void run()
   {
      while (true)
      {
         try
         {
            this.processHome();
            this.processLink();
            Thread.sleep(2000);
         }
         catch (Exception exp)
         {
            exp.printStackTrace();
         }
      }
   }

   //-------------------------
   private void processHome() throws Exception
   {
      if (this.links != null && this.links.size() > 0)
         return; //don't hold any link for downloading.

      if (!this.childrenThread[0].isComplete())
         return; //Downloading hasn't still completed. 
      if (this.childrenThread == null)
         return;
      if (this.childrenThread[0] == null)
         return;
      if (this.childrenThread[0].getData() == null)
         return;

      byte[] data = this.childrenThread[0].getData().toByteArray();
      //The whole HTMLdocument.
      HTMLDocument document = HTMLParser.createDocument(data, this.charset_);
      //The(an) only part of HTMLDocument with 'homePath' Node (NodePath).
      document = NodePathUtil.create(document.getRoot(), new NodePath[]{this.homePath_});

      //Create the full url-link for all links to be downloaded.
      this.linkUtil.createFullNormalLink(document.getRoot(), new URL(this.url_));
      //And get all these url-links.
      this.links = this.linkUtil.getSiteLink(document.getRoot());
      this.idx = 0;
   }

   //--------------------------
   private void processLink() throws Exception
   {
      if (this.links == null || this.links.size() < 1)
         return; //Not download.
      if (this.idx >= this.links.size())
         return; //Downloading is already completed.

      //Check each Thread in the childrenThread.
      for (int i = 0; i < this.childrenThread.length; i++)
      {
         if (!this.childrenThread[i].isComplete())
            continue; //Continue when the Thread is still not completed.
         childrenThread[i].saveData();//Save data has downloaded when the Thread is completed.
         if (this.idx >= this.links.size())
         {
            System.out.println("Download completed!");
            this.complete = true;
            return;
         }
         this.childrenThread[i].startDownload(this.links.get(idx), this.idx, this.childPath_, this.charset_);
         this.idx++;
      }
   }

   //----------------------------
   public boolean isComplete()
   {
      return this.complete;
   }
}
TOP

Related Classes of org.exoplatform.test.crawler.CrawlerService

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.