Package net.sf.jpluck.spider

Source Code of net.sf.jpluck.spider.HttpRetriever

package net.sf.jpluck.spider;

import net.sf.jpluck.ClientConfiguration;
import net.sf.jpluck.http.HttpClient;
import net.sf.jpluck.http.HttpResponse;
import net.sf.jpluck.http.TimeoutException;

import java.net.URI;
import java.util.logging.Level;


/**
* HTTP retriever that uses the java.net classes.
*/
class HttpRetriever extends Retriever {
  private String referrer;
  HttpRetriever(Spider spider, String uri, int level, boolean embedded, String referrer) {
    super(spider, uri, level, embedded);
    this.referrer=referrer;
  }

  public void run() {
    if (!spider.isRunning()) {
      return;
    }
    HttpClient httpClient = new HttpClient(ClientConfiguration.getDefault().getDownloadAttempts(),
                         spider.httpTimeout * 1000, spider.jxlDocument.getUserAgent(),
                         ClientConfiguration.getDefault().getAcceptCharset());
    httpClient.setReferrer(referrer);
    try {
      spider.fireRetrievalStarted(uri);

      HttpResponse cached = spider.retrieveFromCache(uri);
      long ifModifiedSince = 0;
      if (cached != null) {
        ifModifiedSince = cached.getDate();
      } else {
        if (Spider.isOffline()) {
          logger.warning("Could not retrieve " + uri + " from HTTP cache.");
          return;
        }
      }

      HttpResponse response;
      if (Spider.isOffline()) {
        response = cached;
        logger.info(response.getStatusCode() + " " + response.getStatusMessage() + ": " + uri);
      } else {
        response = httpClient.doGet(uri, ifModifiedSince, spider.cookieStore);
        logger.info(response.getStatusCode() + " " + response.getStatusMessage() + ": " + uri);
        if (response.getStatusCode() == 304) {
          response = cached;
        }
      }
      if (!spider.isRunning()) {
        return;
      }
      if (response.getStatusCode() == 200) {
        if (response != cached && !Spider.isOffline() && spider.jxlDocument.isUseHTTPCache()) {
          spider.storeInCache(uri, response);
        }

        String redirectionURI = null;
        if (response.isRedirected()) {
          redirectionURI = response.getRedirectionURL();
          logger.fine(uri + ": redirected to " + redirectionURI);
          if (spider.jxlDocument.getUri().equals(uri)) {
            // Correct the starting URI
            logger.fine("Resetting starting URI to " + redirectionURI);
            spider.jxlDocument.setStartingURI(URI.create(redirectionURI));
          }
        }

        ContentType contentType = new ContentType(response.getContentType());

        if (!spider.jxlDocument.isAutoDetectInputEncoding()) {
          contentType = contentType.derive(spider.jxlDocument.getInputEncoding());
        }

        Resource resource = new Resource(uri, redirectionURI, contentType, response.getContent(), level,
                         embedded);
        if (spider.jxlDocument.isAutoDetectInputEncoding()) {
          resource.scanForContentType();
        }
        spider.parse(resource);
      }
    } catch (TimeoutException e) {
      logger.warning("Timed out: " + uri);
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      spider.fireRetrievalCompleted(uri);
      spider.notifyTaskDone();
    }
  }
}
TOP

Related Classes of net.sf.jpluck.spider.HttpRetriever

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.