Source Code of com.googlecode.flaxcrawler.download.DefaultDownloader
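
Before the full listing, a minimal usage sketch (not part of the original source). It relies only on the setters, the download(URL) method and the Page accessors that appear in the listing below; the target URL and the limits chosen here are illustrative assumptions.

import java.net.URL;
import com.googlecode.flaxcrawler.download.DefaultDownloader;
import com.googlecode.flaxcrawler.model.Page;

public class DefaultDownloaderUsage {

    public static void main(String[] args) throws Exception {
        DefaultDownloader downloader = new DefaultDownloader();
        downloader.setUserAgent("FlaxCrawler/1.0");                    // identify the bot
        downloader.setTriesCount(3);                                   // retry failed downloads up to 3 times
        downloader.setDownloadRetryPeriod(1000);                       // wait 1 second between retries
        downloader.setMaxContentLength(1024 * 1024);                   // skip pages larger than 1 MB
        downloader.setAllowedContentTypes(new String[]{"text/html"});  // only HTML pages

        // download() returns null when the page violates the constraints above
        Page page = downloader.download(new URL("http://example.com/"));
        if (page != null && page.getResponseCode() == 200) {
            System.out.println("Downloaded " + page.getUrl()
                    + ", content type: " + page.getHeader("content-type"));
        }
    }
}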

package com.googlecode.flaxcrawler.download;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.Proxy;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.log4j.Logger;
import com.googlecode.flaxcrawler.model.Page;
import java.io.FileNotFoundException;
import java.io.InputStream;
import org.apache.commons.lang.StringUtils;

/**
* Base class for all downloaders
* @author ameshkov
*/
public class DefaultDownloader implements Downloader {

    private Logger log = Logger.getLogger(this.getClass());
    private ProxyController proxyController;
    private int triesCount = 1;
    private long maxContentLength;
    private int connectionTimeout = 30000;
    private int readTimeout = 30000;
    private Map<String, String> headers;
    private String[] allowedContentTypes = new String[]{"text/html"};
    private String userAgent = "";
    private boolean keepAlive = false;
    private boolean headRequest = true;
    private long downloadRetryPeriod = 0;

    /**
     * Sets request headers
     * @param headers
     */
    public void setRequestHeaders(Map<String, String> headers) {
        this.headers = headers;
    }

    /**
     * Sets connection timeout. Default - 30000 ms.
     * @param connectionTimeout
     */
    public void setConnectionTimeout(int connectionTimeout) {
        this.connectionTimeout = connectionTimeout;
    }

    /**
     * Sets read timeout. Default - 30000 ms.
     * @param readTimeout
     */
    public void setReadTimeout(int readTimeout) {
        this.readTimeout = readTimeout;
    }

    /**
     * Sets maximum tries count
     * @param triesCount
     */
    public void setTriesCount(int triesCount) {
        this.triesCount = triesCount;
    }

    /**
     * Sets proxy controller
     * @param proxyController
     */
    public void setProxyController(ProxyController proxyController) {
        this.proxyController = proxyController;
    }

    /**
     * Sets maximum content length
     * @param maxContentLength
     */
    public void setMaxContentLength(long maxContentLength) {
        this.maxContentLength = maxContentLength;
    }

    /**
     * Sets allowed content types. By default only text/html is allowed.
     * @param allowedContentTypes
     */
    public void setAllowedContentTypes(String[] allowedContentTypes) {
        this.allowedContentTypes = allowedContentTypes;
    }

    /**
     * Sets bot user-agent header. Overrides request headers.
     * Default user-agent is FlaxCrawler/1.0
     * @param userAgent
     */
    public void setUserAgent(String userAgent) {
        this.userAgent = userAgent;
    }

    /**
     * Sets the period to wait between download retries (0 by default)
     * @param downloadRetryPeriod
     */
    public void setDownloadRetryPeriod(long downloadRetryPeriod) {
        this.downloadRetryPeriod = downloadRetryPeriod;
    }

    /**
     * Sets whether the downloader should keep connections alive.<br/>
     * To turn HTTP keep-alive off completely you should:<br/>
     * set the system property {@code http.keepAlive=false}, and<br/>
     * set the downloader's keepAlive property to {@code false}.<br/>
     * By default the {@code keepAlive} property is {@code false}.
     * @param keepAlive
     */
    public void setKeepAlive(boolean keepAlive) {
        this.keepAlive = keepAlive;
    }

    /**
     * Sets whether the downloader should first execute a HEAD request against the specified URL
     * @param headRequest
     */
    public void setHeadRequest(boolean headRequest) {
        this.headRequest = headRequest;
    }

    /**
     * Returns downloader's logger
     * @return
     */
    protected Logger getLogger() {
        return log;
    }

    /**
     * Sets default headers for the specified request
     * @param request
     */
    protected void setDefaultHeaders(Request request) {
        request.addHeader("Pragma", "no-cache");
        request.addHeader("Cache-Control", "no-cache");
        request.addHeader("Accept-Encoding", "gzip,defalte");
        request.addHeader("Accept-Language", "ru,en-us;q=0.7,en;q=0.3");
    }

    /**
     * Creates request for this downloader
     * @param url
     * @return
     */
    protected Request createRequest(URL url) {
        // Creating request instance
        Request request = new Request(url);

        // Setting default headers
        setDefaultHeaders(request);
        // Sets user agent
        request.addHeader("User-Agent", userAgent);
        if (keepAlive) {
            // Sets keep-alive header
            request.addHeader("Connection", "keep-alive");
            request.addHeader("Keep-Alive", "300");
        } else {
            request.addHeader("Connection", "close");
        }

        // Setting custom headers
        if (headers != null) {
            request.getHeaders().putAll(headers);
        }
        return request;
    }

    /**
     * Downloads web page. Returns {@code null} if Page cannot be downloaded due to constraints (contentType, maxContentLength)
     * @param url
     * @return
     */
    public Page download(URL url) throws DownloadException {
        Request request = createRequest(url);

        Page page = null;

        for (int i = 0; i < triesCount; i++) {
            try {
                log.debug("Downloading from " + url + ", try number " + (i + 1));

                // Using proxy
                Proxy proxy = proxyController == null ? null : proxyController.getProxy();

                // If head request is needed - executing and checking constraints
                if (headRequest) {
                    // Sending HEAD request using specified proxy
                    Page headPage = headRequest(request, proxy);

                    // This is a redirect
                    if (headPage != null && headPage.getResponseCode() >= 300 && headPage.getResponseCode() < 400) {
                        log.debug("Server redirected our request to " + url);
                        // No need to request it again
                        return headPage;
                    }

                    // Error response code
                    if (headPage.getResponseCode() >= 400) {
                        log.debug("Cannot download " + request.getUrl() + (proxy == null ? "" : " through proxy " + proxy) + ", response code is " + headPage.getResponseCode());
                        if (i == (triesCount - 1)) {
                            return headPage;
                        } else {
                            waitForRetry(request);
                            continue;
                        }
                    }

                    // Checking constraints (content length, content type, etc)
                    if (!checkConstaints(headPage)) {
                        log.info("Request to " + request.getUrl() + " violates this downloader's constraints");
                        return null;
                    }
                }

                // Downloading using the same proxy
                page = download(request, proxy);

                if (page.getResponseCode() < 400) {
                    // There was no error, returning page
                    return page;
                } else {
                    log.info("Return code for " + page.getUrl() + " is " + page.getResponseCode());
                }
            } catch (DownloadException ex) {
                log.info("DownloadException while downloading from " + request.getUrl() + ": " + ex.getMessage() + ", try number " + i);
            }

            waitForRetry(request);
        }

        return page;
    }

    private void waitForRetry(Request request) {
        if (downloadRetryPeriod > 0) {
            try {
                log.info("Waiting for retry period of " + downloadRetryPeriod + " ms for request " + request.getUrl());
                Thread.sleep(downloadRetryPeriod);
            } catch (InterruptedException ex) {
                log.error("Error while sleeping for the retry period", ex);
            }
        }
    }

    /**
     * Checks downloader constraints. Returns true if everything is OK.
     * @param page
     * @return
     */
    protected boolean checkConstaints(Page page) {
        log.debug("Checking constraints for page " + page.getUrl());

        int contentLength = NumberUtils.toInt(page.getHeader("content-length"));

        // If "Content-Length" header is specified - checking maxContentLength
        if (maxContentLength > 0 && contentLength > 0 && contentLength > maxContentLength) {
            log.info(page.getUrl() + " content length exceeded limit, stopping downloading");
            return false;
        }

        String contentType = page.getHeader("content-type");

        // If "Content-Type" header is not specified - something is definitely wrong
        if (contentType == null) {
            log.info(page.getUrl() + " content type is not specified, stopping downloading");
            return false;
        }

        if (allowedContentTypes == null) {
            return true;
        } else {
            for (String allowedContentType : allowedContentTypes) {
                if (contentType.startsWith(allowedContentType)) {
                    return true;
                }
            }
        }

        log.info(page.getUrl() + " content type (" + contentType + ") is not allowed");
        return false;
    }

    /**
     * Creates connection for the specified request
     * @param request
     * @return
     */
    protected HttpURLConnection createConnection(Request request, Proxy proxy) throws IOException {
        log.debug("Opening connection to " + request.getUrl() + (proxy == null ? " not using proxy " : " using proxy " + proxy));

        // Opening connection
        HttpURLConnection connection = (HttpURLConnection) (proxy == null ? request.getUrl().openConnection() : request.getUrl().openConnection(proxy));
        connection.setInstanceFollowRedirects(false);
        connection.setConnectTimeout(connectionTimeout);
        connection.setReadTimeout(readTimeout);

        // Setting request headers
        for (Map.Entry<String, String> entry : request.getHeaders().entrySet()) {
            log.debug("Setting request header " + entry.getKey() + "=" + entry.getValue());
            connection.setRequestProperty(entry.getKey(), entry.getValue());
        }

        return connection;
    }

    /**
     * Makes HEAD request and returns Response headers. Throws {@link DownloadException} if there's any exception downloading
     * this page. Usually it means that proxy is now dead.
     * @param request
     * @param proxy
     * @return {@link Page} object without content but with response code and headers
     */
    protected Page headRequest(Request request, Proxy proxy) throws DownloadException {
        long startTime = System.currentTimeMillis();

        HttpURLConnection connection = null;
        String connectionHeader = null;
        InputStream inputStream = null;

        try {
            connection = createConnection(request, proxy);
            connection.setRequestMethod("HEAD");
            connection.connect();

            // Setting response properties
            int responseCode = connection.getResponseCode();

            // Setting response headers
            Map<String, String> responseHeaders = getResponseHeaders(connection);

            // Getting Connection header from response
            connectionHeader = StringUtils.lowerCase(responseHeaders.get("connection"));

            // Reading input stream
            inputStream = connection.getInputStream();
            if (inputStream != null) {
                byte[] body = IOUtils.toByteArray(inputStream);
                log.debug("Head response body length is " + (body == null ? 0 : body.length));
            }

            long responseTime = System.currentTimeMillis() - startTime;
            return createPage(request, null, responseCode, responseHeaders, null, responseTime);
        } catch (IOException ex) {
            String message = "Error while processing HEAD request to " + request.getUrl() + (proxy == null ? " not using proxy" : " using proxy " + proxy);
            log.info(message, ex);
            throw new DownloadException(message, ex);
        } finally {
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException ex) {
                    log.warn("Error closing stream after HEAD request", ex);
                }
            }

            cleanUpConnection(connection, connectionHeader);
        }
    }

    /**
     * Collects response headers from the open connection
     * @param connection
     * @return
     */
    protected Map<String, String> getResponseHeaders(HttpURLConnection connection) {
        Map<String, String> responseHeaders = new HashMap<String, String>();
        // Setting response headers
        for (String header : connection.getHeaderFields().keySet()) {
            String headerValue = connection.getHeaderField(header);
            log.debug("Response header for " + connection.getURL() + " " + header + "=" + headerValue);
            responseHeaders.put(header == null ? null : header.toLowerCase(), headerValue);
        }

        return responseHeaders;
    }

    /**
     * Returns response content
     * @param gzipEncoding
     * @param connection
     * @return
     */
    protected byte[] getContent(boolean gzipEncoding, HttpURLConnection connection) throws IOException {
        InputStream inputStream = connection.getInputStream();
        byte[] content = null;

        try {
            // Reading response content
            if (gzipEncoding) {
                // Content is gzipped
                content = IOUtils.toByteArray(new GZIPInputStream(inputStream));
            } else {
                // Content is plain
                content = IOUtils.toByteArray(inputStream);
            }
        } finally {
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException ex) {
                    log.warn("Error closing input stream for request " + connection.getURL(), ex);
                }
            }
        }

        return content;
    }

    /**
     * Cleans up the connection: reads and closes the error stream, then disconnects.
     * @param connection
     * @param connectionHeader
     */
    protected void cleanUpConnection(HttpURLConnection connection, String connectionHeader) {
        // Handling error stream
        InputStream errorStream = connection.getErrorStream();
        try {
            if (errorStream != null) {
                String errorMessage = IOUtils.toString(errorStream);
                log.warn("Server sent an error message for connection " + connection.getURL() + ":\r\n" + errorMessage);
            }
        } catch (IOException ex) {
            log.warn("Exception while processing error stream for connection to " + connection.getURL(), ex);
        } finally {
            if (errorStream != null) {
                try {
                    errorStream.close();
                } catch (IOException ex) {
                    log.warn("Error closing error stream for connection to " + connection.getURL(), ex);
                }
            }
        }

        log.debug("Disconnecting connection  after request to " + connection.getURL());
        connection.disconnect();
    }

    /**
     * Downloads page content
     * @param request
     * @param proxy
     * @return
     */
    protected Page download(Request request, Proxy proxy) throws DownloadException {
        HttpURLConnection connection = null;
        byte[] content = null;
        Map<String, String> responseHeaders = new HashMap<String, String>();
        int responseCode = 0;
        String encoding = null;
        String connectionHeader = null;
        long startTime = System.currentTimeMillis();

        try {
            // Creating connection
            connection = createConnection(request, proxy);
            connection.setRequestMethod("GET");
            connection.connect();

            // Setting response properties
            responseCode = connection.getResponseCode();

            // Setting response headers
            responseHeaders = getResponseHeaders(connection);

            // Getting response header "Connection" value
            connectionHeader = responseHeaders.get("connection");

            // Checking the "Content-Encoding" header to see whether the response is gzipped
            String contentEncoding = responseHeaders.get("content-encoding");
            content = getContent("gzip".equals(contentEncoding), connection);

            if (content == null) {
                throw new DownloadException("Content is empty for " + request.getUrl() + " downloaded through proxy " + proxy);
            }

            // Trying to get charset from the "Content-Type" header
            encoding = getCharset(responseHeaders.get("content-type"));

            if (encoding == null) {
                // Trying to get charset from meta tag
                encoding = getCharsetFromMeta(content);
            }

            if (encoding == null) {
                // Falling back to the default charset
                encoding = "UTF-8";
            }
        } catch (SocketTimeoutException ex) {
            log.warn("Timeout exception for url " + request.getUrl() + (proxy == null ? " not using proxy" : " using proxy " + proxy));
            // Setting response code to 408
            responseCode = HttpURLConnection.HTTP_CLIENT_TIMEOUT;
        } catch (FileNotFoundException ex) {
            log.warn("FileNotFoundException for url " + request.getUrl() + (proxy == null ? " not using proxy" : " using proxy " + proxy), ex);
            // Setting response code to 404
            responseCode = HttpURLConnection.HTTP_NOT_FOUND;
        } catch (IOException ex) {
            log.warn("Error while requesting url " + request.getUrl() + (proxy == null ? " not using proxy" : " using proxy " + proxy), ex);
            // Setting response code to 503
            responseCode = HttpURLConnection.HTTP_UNAVAILABLE;
        } finally {
            cleanUpConnection(connection, connectionHeader);
        }

        long responseTime = System.currentTimeMillis() - startTime;
        return createPage(request, content, responseCode, responseHeaders, encoding, responseTime);
    }

    /**
     * Creates {@link Page} instance
     * @param request
     * @param content
     * @param responseCode
     * @param responseHeaders
     * @param encoding
     * @param responseTime
     * @return
     */
    protected Page createPage(Request request, byte[] content, int responseCode, Map<String, String> responseHeaders, String encoding, long responseTime) {
        log.debug("Response code from " + request.getUrl() + " is " + responseCode);
        Page page = new Page(request.getUrl(), responseHeaders, responseCode, encoding, responseTime, content);
        return page;
    }

    /**
     * Tries to get the charset from the "Content-Type" header value
     * @param contentType
     * @return
     */
    protected String getCharset(String contentType) {
        String charset = null;

        // Parsing Content-Type header first
        if (contentType != null) {
            String[] parts = contentType.split(";");

            for (int i = 1; i < parts.length && charset == null; i++) {
                final String t = parts[i].trim();
                final int index = t.toLowerCase().indexOf("charset=");
                if (index != -1) {
                    // Encoding found successfully, returning
                    charset = t.substring(index + 8);
                    charset = StringUtils.split(charset, ",;")[0];
                    return charset;
                }
            }
        }

        return charset;
    }

    /**
     * Tries to get charset from {@code meta} tag.
     * Very simple implementation.
     * @param content
     * @return
     */
    protected String getCharsetFromMeta(byte[] content) {
        try {
            String utf8string = new String(content, "UTF-8");

            Pattern metaRegexp = Pattern.compile("<meta\\s*[^>]*\\s*content=(\"|')?text/html;\\s+charset=([^\"';]+)(\"|'|;)?[^>]*>", Pattern.CASE_INSENSITIVE);
            Matcher matcher = metaRegexp.matcher(utf8string);

            if (matcher.find()) {
                return matcher.group(2);
            } else {
                return null;
            }
        } catch (Exception ex) {
            return null;
        }
    }
}
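
A hypothetical customization sketch (not part of the original project). Because setDefaultHeaders() and createRequest() are protected, a subclass can adjust the default request headers without touching the download logic; the class name and the header value below are illustrative assumptions.

package com.googlecode.flaxcrawler.download;

public class EnglishOnlyDownloader extends DefaultDownloader {

    @Override
    protected void setDefaultHeaders(Request request) {
        // Keep the standard defaults set by DefaultDownloader...
        super.setDefaultHeaders(request);
        // ...then override the Accept-Language default with an English-only preference
        // (assuming Request.addHeader replaces an existing value for the same header)
        request.addHeader("Accept-Language", "en-US,en;q=0.8");
    }
}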