/*
* Copyright 2009-2013 Scale Unlimited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package bixo.config;

import bixo.fetcher.FetchRequest;

// TODO Re-work as FetchJobPolicy, once that supports the fetch portion of the job.
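/**
 * A {@link FetcherPolicy} that adapts the delay between fetch requests to the time
 * remaining before the crawl end time, stretching requests out when there is time to
 * spare and compressing them (down to the policy's minimum crawl delay) when there
 * is not.
 *
 * <p>A minimal usage sketch (the one-hour end time and 30-second minimum delay below
 * are illustrative values, not defaults):
 *
 * <pre>
 * long crawlEndTime = System.currentTimeMillis() + 60 * 60 * 1000L;
 * AdaptiveFetcherPolicy policy = new AdaptiveFetcherPolicy(crawlEndTime, 30 * 1000L);
 * FetchRequest request = policy.getFetchRequest(System.currentTimeMillis(), policy.getCrawlDelay(), 100);
 * </pre>
 */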
@SuppressWarnings("serial")
public class AdaptiveFetcherPolicy extends FetcherPolicy {

    private static final int MAX_REQUESTS_PER_CONNECTION = 100;

    // Interval between batched fetch requests, in milliseconds (five minutes).
    private static final long DEFAULT_FETCH_INTERVAL = 5 * 60 * 1000L;
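
    /**
     * Creates a policy that uses the default minimum response rate, maximum content
     * size, and maximum redirects.
     *
     * @param crawlEndTime absolute time at which the crawl should end, in milliseconds
     *        (same clock as the {@code now} value later passed to
     *        {@link #getFetchRequest(long, long, int)}); must not be
     *        {@link FetcherPolicy#NO_CRAWL_END_TIME}
     * @param crawlDelay minimum delay between requests, in milliseconds
     */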
    public AdaptiveFetcherPolicy(long crawlEndTime, long crawlDelay) {
        super(DEFAULT_MIN_RESPONSE_RATE, DEFAULT_MAX_CONTENT_SIZE, crawlEndTime, crawlDelay, DEFAULT_MAX_REDIRECTS);

        if (crawlEndTime == FetcherPolicy.NO_CRAWL_END_TIME) {
            throw new IllegalArgumentException("crawlEndTime must be set");
        }
    }
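
    /**
     * Creates a policy with explicit response-rate and content-size limits. Unlike the
     * two-argument constructor, this one does not reject
     * {@link FetcherPolicy#NO_CRAWL_END_TIME}, but the adaptive calculation still
     * assumes a real crawl end time has been set.
     */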
    public AdaptiveFetcherPolicy(int minResponseRate, int maxContentSize, long crawlEndTime, long crawlDelay) {
        super(minResponseRate, maxContentSize, crawlEndTime, crawlDelay, DEFAULT_MAX_REDIRECTS);
    }
    @Override
    public int getMaxRequestsPerConnection() {
        return MAX_REQUESTS_PER_CONNECTION;
    }
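
    /**
     * Returns how many URLs to fetch in the next batch, and the earliest time at which
     * the batch after it should start. The effective delay between requests is the
     * remaining crawl time spread across {@code maxUrls}, clamped between this policy's
     * (minimum) crawl delay and the default crawl delay.
     *
     * @param now current time, in milliseconds
     * @param crawlDelay ignored; this policy derives the delay itself
     * @param maxUrls upper bound on the number of URLs to fetch
     */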
    public FetchRequest getFetchRequest(long now, long crawlDelay, int maxUrls) {
        // We want to fetch maxUrls in the remaining time, but the minimum crawl delay
        // might constrain us. Note that the crawlDelay parameter is ignored; this
        // policy derives the delay from its own settings and the time remaining.
        if ((getCrawlDelay() == 0) || (maxUrls == 0)) {
            // No delay (or nothing to fetch), so request as many URLs as a single
            // connection allows, starting immediately.
            return new FetchRequest(Math.min(maxUrls, getMaxRequestsPerConnection()), now);
        }
        // Even if we're at the end of the crawl, we still want to do our calculation
        // using our default fetch interval, so that we keep crawling something even if
        // we run over. We rely on an external mechanism to prune any remaining URLs so
        // that they get properly aborted.
        long fetchInterval = Math.max(DEFAULT_FETCH_INTERVAL, getCrawlEndTime() - now);

        // Spread the remaining interval across the remaining URLs, then clamp the
        // result between our (minimum) crawl delay and the default crawl delay.
        long customCrawlDelay = Math.max(getCrawlDelay(), Math.min(DEFAULT_CRAWL_DELAY, fetchInterval / maxUrls));
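        // Illustrative numbers (not defaults): with an hour left, fetchInterval is
        // 3,600,000ms; with maxUrls = 720 the spread-out delay is 5,000ms. If our
        // minimum crawl delay is 10,000ms (and DEFAULT_CRAWL_DELAY is at least
        // 5,000ms), we clamp up to customCrawlDelay = 10,000ms.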
        // Figure out how many URLs we can fetch in the default fetch interval (five
        // minutes) or the remaining time, whichever is less.
        int numUrls = Math.min((int) (Math.min(DEFAULT_FETCH_INTERVAL, fetchInterval) / customCrawlDelay), maxUrls);
        long nextFetchTime = now + (numUrls * customCrawlDelay);
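        // Continuing the illustrative numbers above: numUrls = min(300,000ms / 10,000ms, 720)
        // = 30, so nextFetchTime lands five minutes (30 * 10,000ms) from now.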
        return new FetchRequest(numUrls, nextFetchTime);
    }
}