Source Code of bixo.pipes.FetchPipe

/*
 * Copyright 2009-2013 Scale Unlimited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package bixo.pipes;


import java.net.MalformedURLException;
import java.security.InvalidParameterException;
import java.util.HashMap;
import java.util.Map;


import bixo.config.BaseFetchJobPolicy;
import bixo.config.BixoPlatform;
import bixo.config.DefaultFetchJobPolicy;
import bixo.datum.FetchSetDatum;
import bixo.datum.FetchedDatum;
import bixo.datum.GroupedUrlDatum;
import bixo.datum.ScoredUrlDatum;
import bixo.datum.StatusDatum;
import bixo.datum.UrlDatum;
import bixo.datum.UrlStatus;
import bixo.exceptions.BaseFetchException;
import bixo.fetcher.BaseFetcher;
import bixo.operations.BaseGroupGenerator;
import bixo.operations.BaseScoreGenerator;
import bixo.operations.FetchBuffer;
import bixo.operations.FilterAndScoreByUrlAndRobots;
import bixo.operations.GroupFunction;
import bixo.operations.MakeFetchSetsBuffer;
import bixo.robots.BaseRobotsParser;
import bixo.robots.RobotUtils;
import bixo.robots.SimpleRobotRulesParser;
import bixo.utils.GroupingKey;
import bixo.utils.UrlUtils;
import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.SubAssembly;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;


import com.scaleunlimited.cascading.BaseSplitter;
import com.scaleunlimited.cascading.NullContext;
import com.scaleunlimited.cascading.NullSinkTap;
import com.scaleunlimited.cascading.SplitterAssembly;




@SuppressWarnings("serial")
public class FetchPipe extends SubAssembly {
        
    // Pipe that outputs FetchedDatum tuples, for URLs that were fetched.
    public static final String CONTENT_PIPE_NAME = "FetchPipe-content";
    
    // Pipe that outputs StatusDatum tuples, for all URLs being processed.
    public static final String STATUS_PIPE_NAME = "FetchPipe-status";
    
    /**
     * Generate key using protocol+host+port, which is what we need in order
     * to safely fetch robots.txt files.
     *
     */
    private static class GroupByDomain extends BaseGroupGenerator {
        
        @Override
        public String getGroupingKey(UrlDatum urlDatum) {
            String urlAsString = urlDatum.getUrl();
            
            try {
                return UrlUtils.makeProtocolAndDomain(urlAsString);
            } catch (MalformedURLException e) {
                throw new RuntimeException("Invalid URL: " + urlAsString);
            }
        }
    }


    private static class SplitIntoSpecialAndRegularKeys extends BaseSplitter {


        @Override
        public String getLHSName() {
            return "special grouping key";
        }


        @Override
        public boolean isLHS(TupleEntry tuple) {
            ScoredUrlDatum datum = new ScoredUrlDatum(tuple);
            return GroupingKey.isSpecialKey(datum.getGroupKey());
        }
    }
    
    private static class FilterErrorsFunction extends BaseOperation<NullContext> implements Function<NullContext> {
        private int _fieldPos;
        private int[] _fieldsToCopy;
        
        // Only output FetchedDatum tuples for input where we were able to fetch the URL.
        public FilterErrorsFunction() {
            super(FetchedDatum.FIELDS.size() + 1, FetchedDatum.FIELDS);
            int baseFieldCount = FetchedDatum.FIELDS.size();
            
            // Location of extra field added during fetch, that contains fetch error
            _fieldPos = baseFieldCount;
            
            // Create array used to extract the fields we need that correspond to
            // the FetchedDatum w/o the exception tacked on the end.
            _fieldsToCopy = new int[baseFieldCount];
            for (int i = 0; i < _fieldsToCopy.length; i++) {
                _fieldsToCopy[i] = i;
            }
        }


        @SuppressWarnings("rawtypes")
        @Override
        public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
            Tuple t = funcCall.getArguments().getTuple();
            
            // Get the status to decide if it's a good fetch
            Object status = t.getObject(_fieldPos);
            if ((status instanceof String) && (UrlStatus.valueOf((String)status) == UrlStatus.FETCHED)) {
                funcCall.getOutputCollector().add(BixoPlatform.clone(t.get(_fieldsToCopy), process));
            }
        }
    }


    private static class MakeStatusFunction extends BaseOperation<NullContext> implements Function<NullContext> {
        private int _fieldPos;
        
        // Output an appropriate StatusDatum based on whether we were able to fetch
        // the URL or not.
        public MakeStatusFunction() {
            super(StatusDatum.FIELDS);
            
            // Location of extra field added during fetch, that contains fetch status
            _fieldPos = FetchedDatum.FIELDS.size();
        }


        @SuppressWarnings("rawtypes")
        @Override
        public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
            TupleEntry entry = funcCall.getArguments();
            FetchedDatum fd = new FetchedDatum(entry);
            
            // Get the fetch status that we hang on the end of the tuple,
            // after all of the FetchedDatum fields.
            Object result = entry.getObject(_fieldPos);
            StatusDatum status;
            
            // Note: Here we share the payload of the FetchedDatum with the
            // StatusDatum we're about to emit, but since we let go after we
            // emit, there shouldn't be an issue with this sharing.
            if (result instanceof String) {
                UrlStatus urlStatus = UrlStatus.valueOf((String)result);
                if (urlStatus == UrlStatus.FETCHED) {
                    status = new StatusDatum(fd.getUrl(), fd.getHeaders(), fd.getHostAddress(), fd.getPayload());
                } else {
                    status = new StatusDatum(fd.getUrl(), urlStatus, fd.getPayload());
                }
            } else if (result instanceof BaseFetchException) {
                status = new StatusDatum(fd.getUrl(), (BaseFetchException)result, fd.getPayload());
            } else {
                throw new RuntimeException("Unknown type for fetch status field: " + result.getClass());
            }
            
            funcCall.getOutputCollector().add(BixoPlatform.clone(status.getTuple(), process));
        }
    }


    private static class MakeSkippedStatus extends BaseOperation<NullContext> implements Function<NullContext> {
        
        // Output an appropriate StatusDatum based on the grouping key (which must be special)
        public MakeSkippedStatus() {
            super(StatusDatum.FIELDS);
        }


        @SuppressWarnings("rawtypes")
        @Override
        public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
            ScoredUrlDatum sd = new ScoredUrlDatum(funcCall.getArguments());
            
            String key = sd.getGroupKey();
            if (!GroupingKey.isSpecialKey(key)) {
                throw new RuntimeException("Can't make skipped status for regular grouping key: " + key);
            }
            
            // Note: Here we share the payload of the ScoredUrlDatum with the
            // StatusDatum we're about to emit, but since we let go after we
            // emit, there shouldn't be an issue with this sharing.
            StatusDatum status = new StatusDatum(sd.getUrl(), GroupingKey.makeUrlStatusFromKey(key), sd.getPayload());
            status.setPayload(sd);
            
            funcCall.getOutputCollector().add(BixoPlatform.clone(status.getTuple(), process));
        }
    }


    /**
     * Generate an assembly that will fetch all of the UrlDatum tuples coming out of urlProvider.
     * 
     * We assume that these UrlDatums have been validated, and thus we'll only have valid URLs.
     * 
     * @param urlProvider
     * @param scorer
     * @param fetcher
     * @param numReducers
     */
    
    public FetchPipe(Pipe urlProvider, BaseScoreGenerator scorer, BaseFetcher fetcher, int numReducers) {
        this(urlProvider, scorer, fetcher, RobotUtils.createFetcher(fetcher),
                        new SimpleRobotRulesParser(),
                        new DefaultFetchJobPolicy(fetcher.getFetcherPolicy()),
                        numReducers);
    }
    
    public FetchPipe(Pipe urlProvider, BaseScoreGenerator scorer, BaseFetcher fetcher, BaseFetcher robotsFetcher, BaseRobotsParser parser,
                    BaseFetchJobPolicy fetchJobPolicy, int numReducers) {
        super(urlProvider);
        Pipe robotsPipe = new Each(urlProvider, new GroupFunction(new GroupByDomain()));
        robotsPipe = new GroupBy("Grouping URLs by IP/delay", robotsPipe, GroupedUrlDatum.getGroupingField());
        robotsPipe = new Every(robotsPipe, new FilterAndScoreByUrlAndRobots(robotsFetcher, parser, scorer), Fields.RESULTS);
        
        // Split into records for URLs that are special (not fetchable) and regular
        SplitterAssembly splitter = new SplitterAssembly(robotsPipe, new SplitIntoSpecialAndRegularKeys());
        
        // Now generate sets of URLs to fetch. We'll wind up with all URLs for the same server & the same crawl delay,
        // ordered by score, getting passed per list to the PreFetchBuffer. This will generate PreFetchDatums that contain a key
        // based on the hash of the IP address (with a range of values == number of reducers), plus a list of URLs and a target
        // crawl time.
        Pipe prefetchPipe = new GroupBy("Distributing URL sets", splitter.getRHSPipe(), GroupedUrlDatum.getGroupingField(), ScoredUrlDatum.getSortingField(), true);
        
        prefetchPipe = new Every(prefetchPipe, new MakeFetchSetsBuffer(fetchJobPolicy, numReducers), Fields.RESULTS);
        Pipe fetchPipe = new GroupBy("Fetching URL sets", prefetchPipe, FetchSetDatum.getGroupingField(), FetchSetDatum.getSortingField());
        fetchPipe = new Every(fetchPipe, new FetchBuffer(fetcher), Fields.RESULTS);


        Pipe fetchedContent = new Pipe(CONTENT_PIPE_NAME, new Each(fetchPipe, new FilterErrorsFunction()));
        
        Pipe fetchedStatus = new Pipe("fetched status", new Each(fetchPipe, new MakeStatusFunction()));
        
        // We need to merge URLs from the LHS of the splitter (never fetched) so that our status pipe
        // gets status for every URL we put into this sub-assembly.
        Pipe skippedStatus = new Pipe("skipped status", new Each(splitter.getLHSPipe(), new MakeSkippedStatus()));
        
        // TODO KKr You're already setting the group name here (so that the
        // tail pipe gets the same name), so I wasn't able to pass in a
        // group name here for BaseTool.nameFlowSteps to use for the job name.
        Pipe joinedStatus = new GroupBy(STATUS_PIPE_NAME, Pipe.pipes(skippedStatus, fetchedStatus), new Fields(StatusDatum.URL_FN));


        setTails(fetchedContent, joinedStatus);
    }


    public Pipe getContentTailPipe() {
        return getTailPipe(CONTENT_PIPE_NAME);
    }
    
    public Pipe getStatusTailPipe() {
        return getTailPipe(STATUS_PIPE_NAME);
    }
    
    private Pipe getTailPipe(String pipeName) {
        String[] pipeNames = getTailNames();
        for (int i = 0; i < pipeNames.length; i++) {
            if (pipeName.equals(pipeNames[i])) {
                return getTails()[i];
            }
        }
        
        throw new InvalidParameterException("Invalid pipe name: " + pipeName);
    }


    /**
     * Utility routine that helps create the Cascading map needed when there are
     * multiple tails (like with this subassembly) and you need to build the Flow
     * 
     * @param statusSink Tap where status will be sent (can be null)
     * @param fetchedSink Tap where fetched content will be sent (can be null)
     * @return Map usable in FlowConnector.connect() call.
     */
    @SuppressWarnings("rawtypes")
    public static Map<String, Tap> makeSinkMap(Tap statusSink, Tap fetchedSink) {
        HashMap<String, Tap> result = new HashMap<String, Tap>(2);
        
        if (statusSink == null) {
            statusSink = new NullSinkTap(StatusDatum.FIELDS);
        }
        
        if (fetchedSink == null) {
            fetchedSink = new NullSinkTap(FetchedDatum.FIELDS);
        }
        
        result.put(STATUS_PIPE_NAME, statusSink);
        result.put(CONTENT_PIPE_NAME, fetchedSink);
        
        return result;
    }
}
Source Code of bixo.pipes.FetchPipe

Related Classes of bixo.pipes.FetchPipe