Package edu.uci.ics.crawler4j.fetcher

Source Code of edu.uci.ics.crawler4j.fetcher.PageFetcher

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package edu.uci.ics.crawler4j.fetcher;

import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
import java.util.zip.GZIPInputStream;

import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpEntity;
import org.apache.http.HttpException;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpResponseInterceptor;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.entity.HttpEntityWrapper;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParamBean;
import org.apache.http.protocol.HttpContext;
import org.apache.log4j.Logger;

import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;

/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class PageFetcher extends Configurable {

  protected static final Logger logger = Logger.getLogger(PageFetcher.class);

  protected ThreadSafeClientConnManager connectionManager;

  protected DefaultHttpClient httpClient;

  protected final Object mutex = new Object();

  protected long lastFetchTime = 0;

  protected IdleConnectionMonitorThread connectionMonitorThread = null;

  public PageFetcher(CrawlConfig config) {
    super(config);

    HttpParams params = new BasicHttpParams();
    HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
    paramsBean.setVersion(HttpVersion.HTTP_1_1);
    paramsBean.setContentCharset("UTF-8");
    paramsBean.setUseExpectContinue(false);

    params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString());
    params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout());
    params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout());

    params.setBooleanParameter("http.protocol.handle-redirects", false);

    SchemeRegistry schemeRegistry = new SchemeRegistry();
    schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));

    if (config.isIncludeHttpsPages()) {
      schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
    }

    connectionManager = new ThreadSafeClientConnManager(schemeRegistry);
    connectionManager.setMaxTotal(config.getMaxTotalConnections());
    connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost());
    httpClient = new DefaultHttpClient(connectionManager, params);

    if (config.getProxyHost() != null) {

      if (config.getProxyUsername() != null) {
        httpClient.getCredentialsProvider().setCredentials(
            new AuthScope(config.getProxyHost(), config.getProxyPort()),
            new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword()));
      }

      HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort());
      httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
        }

        httpClient.addResponseInterceptor(new HttpResponseInterceptor() {

            @Override
            public void process(final HttpResponse response, final HttpContext context) throws HttpException,
                    IOException {
                HttpEntity entity = response.getEntity();
                Header contentEncoding = entity.getContentEncoding();
                if (contentEncoding != null) {
                    HeaderElement[] codecs = contentEncoding.getElements();
                    for (HeaderElement codec : codecs) {
                        if (codec.getName().equalsIgnoreCase("gzip")) {
                            response.setEntity(new GzipDecompressingEntity(response.getEntity()));
                            return;
                        }
                    }
                }
            }

        });

    if (connectionMonitorThread == null) {
      connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager);
    }
    connectionMonitorThread.start();

  }

  public PageFetchResult fetchHeader(WebURL webUrl) {
    PageFetchResult fetchResult = new PageFetchResult();
    String toFetchURL = webUrl.getURL();
    HttpGet get = null;
    try {
      get = new HttpGet(toFetchURL);
      synchronized (mutex) {
        long now = (new Date()).getTime();
        if (now - lastFetchTime < config.getPolitenessDelay()) {
          Thread.sleep(config.getPolitenessDelay() - (now - lastFetchTime));
        }
        lastFetchTime = (new Date()).getTime();
      }
      get.addHeader("Accept-Encoding", "gzip");
      HttpResponse response = httpClient.execute(get);
      fetchResult.setEntity(response.getEntity());

      int statusCode = response.getStatusLine().getStatusCode();
      if (statusCode != HttpStatus.SC_OK) {
        if (statusCode != HttpStatus.SC_NOT_FOUND) {
          if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
            Header header = response.getFirstHeader("Location");
            if (header != null) {
              String movedToUrl = header.getValue();
              movedToUrl = URLCanonicalizer.getCanonicalURL(movedToUrl, toFetchURL);
              fetchResult.setMovedToUrl(movedToUrl);
            }
            fetchResult.setStatusCode(statusCode);
            return fetchResult;
          }
          logger.info("Failed: " + response.getStatusLine().toString() + ", while fetching " + toFetchURL);
        }
        fetchResult.setStatusCode(response.getStatusLine().getStatusCode());
        return fetchResult;
      }

      fetchResult.setFetchedUrl(toFetchURL);
      String uri = get.getURI().toString();
      if (!uri.equals(toFetchURL)) {
        if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) {
          fetchResult.setFetchedUrl(uri);
        }
      }

      if (fetchResult.getEntity() != null) {
        long size = fetchResult.getEntity().getContentLength();
        if (size == -1) {
          Header length = response.getLastHeader("Content-Length");
          if (length == null) {
            length = response.getLastHeader("Content-length");
          }
          if (length != null) {
            size = Integer.parseInt(length.getValue());
          } else {
            size = -1;
          }
        }
        if (size > config.getMaxDownloadSize()) {
          fetchResult.setStatusCode(CustomFetchStatus.PageTooBig);
          return fetchResult;
        }

        fetchResult.setStatusCode(HttpStatus.SC_OK);
        return fetchResult;

      } else {
        get.abort();
      }
    } catch (IOException e) {
      logger.error("Fatal transport error: " + e.getMessage() + " while fetching " + toFetchURL
          + " (link found in doc #" + webUrl.getParentDocid() + ")");
      fetchResult.setStatusCode(CustomFetchStatus.FatalTransportError);
      return fetchResult;
    } catch (IllegalStateException e) {
      // ignoring exceptions that occur because of not registering https
      // and other schemes
    } catch (Exception e) {
      if (e.getMessage() == null) {
        logger.error("Error while fetching " + webUrl.getURL());
      } else {
        logger.error(e.getMessage() + " while fetching " + webUrl.getURL());
      }
    } finally {
      try {
        if (fetchResult.getEntity() == null && get != null) {
          get.abort();
        }
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
    fetchResult.setStatusCode(CustomFetchStatus.UnknownError);
    return fetchResult;
  }

  public synchronized void shutDown() {
    if (connectionMonitorThread != null) {
      connectionManager.shutdown();
      connectionMonitorThread.shutdown();
    }
  }
 
  public HttpClient getHttpClient() {
    return httpClient;
  }

  private static class GzipDecompressingEntity extends HttpEntityWrapper {

    public GzipDecompressingEntity(final HttpEntity entity) {
      super(entity);
    }

    @Override
    public InputStream getContent() throws IOException, IllegalStateException {

      // the wrapped entity's getContent() decides about repeatability
      InputStream wrappedin = wrappedEntity.getContent();

      return new GZIPInputStream(wrappedin);
    }

    @Override
    public long getContentLength() {
      // length of ungzipped content is not known
      return -1;
    }

  }
}
TOP

Related Classes of edu.uci.ics.crawler4j.fetcher.PageFetcher

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.