Source Code of ro.vivi.pistruiatul.InternetsCrawler

package ro.vivi.pistruiatul;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.logging.Logger;

import HTTPClient.HTTPConnection;
import HTTPClient.HTTPResponse;
import HTTPClient.ModuleException;

/**
 * Manages downloading of pages from the internet and, optionally, caching
 * them on disk. (A short usage sketch follows this listing.)
 */
public class InternetsCrawler {
  static Logger log = Logger.getLogger("ro.vivi.pistruiatul.InternetsCrawler");

  public static boolean useCache = true;
  private static final String CACHE_DIR = "data.cached/";

  /**
   * A private constructor to make sure this remains a static class.
   */
  private InternetsCrawler() {

  }

  /**
   * Adds a URL to the queue without guaranteeing synchronicity; the consumer
   * handler is called once the page arrives. (The current implementation
   * fetches the page synchronously, using the default ISO-8859-2 charset.)
   * @param host The host.
   * @param path The path.
   * @param consumer Who eventually consumes this page.
   */
  public static void enqueue(String host, String path, PageConsumer consumer) {
    consumer.consume(getPage(host, path, "ISO-8859-2"));
  }

  /**
   * Adds a URL to the queue without guaranteeing synchronicity; the consumer
   * handler is called once the page arrives. (The current implementation
   * fetches the page synchronously.)
   * @param host The host.
   * @param path The path.
   * @param consumer Who eventually consumes this page.
   * @param charset The charset under which to get the page.
   */
  public static void enqueue(String host, String path, PageConsumer consumer,
      String charset) {
    consumer.consume(getPage(host, path, charset));
  }

  /**
   * Does all the magic that HTTPClient needs to fetch the requested page,
   * without the caller having to care about the exceptions.
   * @param host The host.
   * @param path The path.
   * @param charset The charset used to decode the response body.
   * @return The page contents, or an empty string if the download failed.
   */
  private static String getPage(String host, String path, String charset) {
    String cache = getFromCache(host, path);
    if (cache != null) {
      //log.info("Cache hit for " + host + path);
      return cache;
    }

    // Throttle requests so we do not hammer the remote server.
    try {
      Thread.sleep(1000);
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
    }

    String data = "";

    try {
      HTTPConnection con = new HTTPConnection(host);
      HTTPResponse rsp = con.Get(path);
      // Call getText() to block until the whole response body has arrived;
      // the raw bytes are then re-decoded below with the requested charset.
      rsp.getText();
      byte[] bytes = rsp.getData();

      // Create a decoder for the requested charset.
      Charset cset = Charset.forName(charset);
      CharsetDecoder decoder = cset.newDecoder();

      try {
        ByteBuffer bbuf = ByteBuffer.wrap(bytes);

        // Decode the response bytes into a CharBuffer using the requested
        // charset, then turn the result into a String.
        CharBuffer cbuf = decoder.decode(bbuf);
        data = cbuf.toString();

      } catch (CharacterCodingException e) {
        e.printStackTrace();
      }

    } catch (IOException ioe) {
      System.err.println(ioe.toString());
    } catch (ModuleException me) {
      System.err.println("Error handling request: " + me.getMessage());
    } catch (HTTPClient.ParseException e) {
      e.printStackTrace();
    }

    writeToCache(host, path, data);
    return data;
  }

  private static String getFromCache(String host, String path) {
    if (!useCache) return null;
    return getUtfStringFromDisk(CACHE_DIR + getHashKey(host, path));
  }

  public static String getUtfStringFromDisk(String fname) {
    try {
      File file = new File(fname);
      if (file.exists()) {
        BufferedInputStream in =
          new BufferedInputStream(new FileInputStream(file));
        try {
          // available() does not reliably report the file size, so size the
          // buffer from the file length and read until it is full or at EOF.
          byte[] data = new byte[(int) file.length()];
          int offset = 0;
          while (offset < data.length) {
            int read = in.read(data, offset, data.length - offset);
            if (read < 0) break;
            offset += read;
          }
          return new String(data, 0, offset, "UTF-8");
        } finally {
          in.close();
        }
      }
    } catch (FileNotFoundException fnfe) {
      fnfe.printStackTrace();
    } catch (IOException ioe) {
      ioe.printStackTrace();
    }
    return null;
  }


  /**
   * Given a URL and the data that we got from it, write it to a file in the
   * local cache. We will retrieve the data from there the next time we look
   * at this URL, if it has not expired.
   * @param host The host.
   * @param path The path.
   * @param data The page contents to cache.
   */
  private static void writeToCache(String host, String path, String data) {
    try {
      // Make sure the cache directory exists before writing into it.
      new File(CACHE_DIR).mkdirs();
      File file = new File(CACHE_DIR + getHashKey(host, path));
      FileOutputStream fos = new FileOutputStream(file);
      try {
        fos.write(data.getBytes("UTF-8"));
      } finally {
        fos.close();
      }
    } catch (FileNotFoundException fnfe) {
      fnfe.printStackTrace();
    } catch (IOException ioe) {
      ioe.printStackTrace();
    }
  }

  /**
   * Get an MD5 hash key for the URL so that cache file names do not collide,
   * which a plain string hash would not guarantee.
   * @param host The host.
   * @param path The path.
   * @return A decimal string representation of the MD5 digest of host + path.
   */
  private static String getHashKey(String host, String path) {
    String s = host + path;
    String hash = null;
    try {
      MessageDigest m = MessageDigest.getInstance("MD5");
      // Hash the full byte representation; s.length() counts characters, not bytes.
      m.update(s.getBytes());
      hash = new BigInteger(1, m.digest()).toString();
    } catch (NoSuchAlgorithmException nsae) {
      nsae.printStackTrace();
    }
    return hash;
  }

}
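
As a usage illustration (not part of the original project), the sketch below fetches two pages through InternetsCrawler and prints them. It assumes that the project's PageConsumer interface declares a single void consume(String page) method, which is inferred from the calls to consumer.consume(...) above; the host and path values are placeholders.

package ro.vivi.pistruiatul;

public class CrawlerUsageExample {

  public static void main(String[] args) {
    // Fetch a page with the default ISO-8859-2 charset. PageConsumer is
    // assumed to declare void consume(String page), as inferred above.
    InternetsCrawler.enqueue("www.example.com", "/index.html",
        new PageConsumer() {
          public void consume(String page) {
            System.out.println("Got " + page.length() + " characters.");
          }
        });

    // Fetch another page, decoding it with an explicit charset.
    InternetsCrawler.enqueue("www.example.com", "/other.html",
        new PageConsumer() {
          public void consume(String page) {
            System.out.println(page);
          }
        }, "UTF-8");
  }
}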
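
The on-disk cache layout can also be inspected independently of the class: each page is written under data.cached/ in a file named after the MD5 digest of host + path. The following standalone sketch (again illustrative, with placeholder host and path values) recomputes that file name the same way getHashKey does and reports whether a cached copy exists.

import java.io.File;
import java.math.BigInteger;
import java.security.MessageDigest;

public class CacheKeyExample {

  public static void main(String[] args) throws Exception {
    String host = "www.example.com";
    String path = "/index.html";

    // Mirror getHashKey(): MD5 of host + path, rendered through BigInteger.
    MessageDigest md5 = MessageDigest.getInstance("MD5");
    md5.update((host + path).getBytes());
    String key = new BigInteger(1, md5.digest()).toString();

    File cached = new File("data.cached/" + key);
    System.out.println("Cache file: " + cached.getPath()
        + (cached.exists() ? " (hit)" : " (miss)"));
  }
}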