Package bixo.pipes

Source Code of bixo.pipes.AbstractFetchPipeTest

/*
* Copyright 2009-2013 Scale Unlimited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package bixo.pipes;

import java.io.IOException;
import java.io.OutputStream;

import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.http.HttpStatus;
import org.eclipse.jetty.http.HttpException;
import org.eclipse.jetty.server.Request;
import org.eclipse.jetty.server.Response;
import org.eclipse.jetty.server.handler.AbstractHandler;
import org.junit.Assert;

import bixo.config.BaseFetchJobPolicy;
import bixo.config.BixoPlatform;
import bixo.config.DefaultFetchJobPolicy;
import bixo.config.FetcherPolicy;
import bixo.config.FetcherPolicy.RedirectMode;
import bixo.datum.FetchedDatum;
import bixo.datum.HttpHeaders;
import bixo.datum.ScoredUrlDatum;
import bixo.datum.StatusDatum;
import bixo.datum.UrlDatum;
import bixo.datum.UrlStatus;
import bixo.exceptions.AbortedFetchException;
import bixo.exceptions.AbortedFetchReason;
import bixo.exceptions.BaseFetchException;
import bixo.exceptions.HttpFetchException;
import bixo.exceptions.IOFetchException;
import bixo.exceptions.RedirectFetchException;
import bixo.exceptions.UrlFetchException;
import bixo.fetcher.BaseFetcher;
import bixo.fetcher.RandomResponseHandler;
import bixo.fetcher.SimpleHttpFetcher;
import bixo.fetcher.simulation.FakeHttpFetcher;
import bixo.fetcher.simulation.TestWebServer;
import bixo.operations.BaseGroupGenerator;
import bixo.operations.BaseScoreGenerator;
import bixo.operations.FixedScoreGenerator;
import bixo.robots.BaseRobotsParser;
import bixo.robots.SimpleRobotRulesParser;
import bixo.utils.ConfigUtils;
import bixo.utils.GroupingKey;
import cascading.CascadingTestCase;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.flow.FlowDef;
import cascading.pipe.Pipe;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;

import com.scaleunlimited.cascading.BasePath;
import com.scaleunlimited.cascading.BasePlatform;
import com.scaleunlimited.cascading.NullSinkTap;
import com.scaleunlimited.cascading.Payload;


// Long-running test
@SuppressWarnings({ "serial", "rawtypes", "unchecked" })
public abstract class AbstractFetchPipeTest extends CascadingTestCase {
   
    private static final String BASE_INPUT_PATH = "build/test/FetchPipeTest/";
    private static final String BASE_OUTPUT_PATH = "build/test/FetchPipeTest/";

    private class RedirectResponseHandler extends AbstractHandler {
       
        public static final String REDIRECT_TARGET_URL = "http://localhost:8089/redirect";
        private boolean _permanent;
       
        public RedirectResponseHandler(boolean permanent) {
            super();
            _permanent = permanent;
        }
       
        @Override
        public void handle(String pathInContext, Request baseRequest, HttpServletRequest request, HttpServletResponse response) throws HttpException, IOException {
            if (_permanent) {
                // Can't use sendRedirect, as that forces it to be a temp redirect.
                if (response instanceof  Response) {
                    Response jettyResponse = (Response) response;
                    jettyResponse.setStatus(HttpStatus.SC_MOVED_PERMANENTLY);
                    jettyResponse.setHeader("Location", REDIRECT_TARGET_URL);
                }
                if (request instanceof Request) {
                    Request jettyRequest = (Request) request;
                    jettyRequest.setHandled(true);
                }
            } else {
                response.sendRedirect(REDIRECT_TARGET_URL);
            }
        }
    }

    protected void testHeadersInStatus(BasePlatform platform) throws Exception {
        Tap in = makeInputData(platform, "testHeadersInStatus", 1, 1);

        Pipe pipe = new Pipe("urlSource");
        BaseFetcher fetcher = new FakeHttpFetcher(false, 1);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);
       
       
        BasePath outputPath = makeOutputPath(platform, "testHeadersInStatus");
        BasePath statusPath = platform.makePath(outputPath, "status");
        Tap status = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);
       
        // Finally we can run it.
        FlowDef flowDef = new FlowDef();
        flowDef.setName("testHeadersInStatus");
        flowDef.addSource(pipe, in);
        flowDef.addTailSink(fetchPipe.getStatusTailPipe(), status);
        flowDef.addTailSink(fetchPipe.getContentTailPipe(), new NullSinkTap());
       
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(flowDef);
        flow.writeDOT("build/test/FetchPipeLRTest/testHeadersInStatus/flow.dot");
        flow.complete();
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        Assert.assertTrue(tupleEntryIterator.hasNext());
        StatusDatum sd = new StatusDatum(tupleEntryIterator.next());
        Assert.assertEquals(UrlStatus.FETCHED, sd.getStatus());
        HttpHeaders headers = sd.getHeaders();
        Assert.assertNotNull(headers);
        Assert.assertTrue(headers.getNames().size() > 0);
    }
   
    protected void testFetchPipe(BixoPlatform platform) throws Exception {
        // System.setProperty("bixo.root.level", "TRACE");
        final int numPages = 10;
        final int port = 8089;
       
        Tap in = makeInputData(platform, "testFetchPipe", "localhost:" + port, numPages, new Payload());

        Pipe pipe = new Pipe("urlSource");
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseFetcher fetcher = new SimpleHttpFetcher(ConfigUtils.BIXO_TEST_AGENT);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);
       
        String output = "build/test/FetchPipeTest/testFetchPipe";
        BasePath outputPath = platform.makePath(output);
        BasePath statusPath = platform.makePath(outputPath, "status");
        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap status = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);

        // Finally we can run it.
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
        TestWebServer webServer = null;
       
        try {
            webServer = new TestWebServer(new NoRobotsResponseHandler(), port);
            flow.complete();
        } finally {
            webServer.stop();
        }
       
        // Verify numPages fetched and numPages status entries were saved.
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
       
        int totalEntries = 0;
        boolean[] fetchedPages = new boolean[numPages];
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            totalEntries += 1;

            // Verify we can convert properly
            FetchedDatum datum = new FetchedDatum(entry);
            String url = datum.getUrl();
            Assert.assertNotNull(url);
           
            // Verify that we got one of each page
            int idOffset = url.indexOf(".html") - 1;
            int pageId = Integer.parseInt(url.substring(idOffset, idOffset + 1));
            Assert.assertFalse(fetchedPages[pageId]);
            fetchedPages[pageId] = true;
        }
       
        Assert.assertEquals(numPages, totalEntries);
        tupleEntryIterator.close();
       
        validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        totalEntries = 0;
        fetchedPages = new boolean[numPages];
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            totalEntries += 1;

            // Verify we can convert properly
            StatusDatum sd = new StatusDatum(entry);
            Assert.assertEquals(UrlStatus.FETCHED, sd.getStatus());
           
           
            // Verify that we got one of each page
            String url = sd.getUrl();
            Assert.assertNotNull(url);
            int idOffset = url.indexOf(".html") - 1;
            int pageId = Integer.parseInt(url.substring(idOffset, idOffset + 1));
            Assert.assertFalse(fetchedPages[pageId]);
            fetchedPages[pageId] = true;
        }
       
        Assert.assertEquals(numPages, totalEntries);
    }
   
    protected void testRedirectException(BixoPlatform platform) throws Exception {
        // System.setProperty("bixo.root.level", "TRACE");
       
        final int numPages = 1;
        final int port = 8089;
       
        Payload payload = new Payload();
        payload.put("payload-field-1", 1);
        Tap in = makeInputData(platform, "testRedirectException", "localhost:" + port, numPages, payload);

        Pipe pipe = new Pipe("urlSource");
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        FetcherPolicy policy = new FetcherPolicy();
        policy.setRedirectMode(RedirectMode.FOLLOW_TEMP);
        BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);
       
        String output = "build/test/FetchPipeTest/testRedirectException";
        BasePath outputPath = platform.makePath(output);
        BasePath statusPath = platform.makePath(outputPath, "status");
        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap status = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);

        // Finally we can run it.
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
        TestWebServer webServer = null;
       
        try {
            webServer = new TestWebServer(new RedirectResponseHandler(true), port);
            flow.complete();
        } finally {
            webServer.stop();
        }
       
        // Verify numPages fetched and numPages status entries were saved.
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        Assert.assertFalse(tupleEntryIterator.hasNext());
        tupleEntryIterator.close();
       
        validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        int totalEntries = 0;
        boolean[] fetchedPages = new boolean[numPages];
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            totalEntries += 1;

            // Verify we can convert properly
            StatusDatum sd = new StatusDatum(entry);
            Assert.assertTrue(sd.getException() instanceof RedirectFetchException);
            RedirectFetchException redirectException =
                (RedirectFetchException)(sd.getException());
            Assert.assertEquals(RedirectResponseHandler.REDIRECT_TARGET_URL,
                                redirectException.getRedirectedUrl());
            Assert.assertEquals(payload.get("payload-field-1"),
                                sd.getPayloadValue("payload-field-1"));
           
            // Verify that we got one of each page
            String url = sd.getUrl();
            Assert.assertNotNull(url);
            int idOffset = url.indexOf(".html") - 1;
            int pageId = Integer.parseInt(url.substring(idOffset, idOffset + 1));
            Assert.assertFalse(fetchedPages[pageId]);
            fetchedPages[pageId] = true;
        }
       
        Assert.assertEquals(numPages, totalEntries);
    }
   
    protected void testTerminatingFetchPipe(BixoPlatform platform) throws Exception {
        // System.setProperty("bixo.root.level", "TRACE");
       
        final int numPages = 10;
        final int port = 8089;
       
        Tap in = makeInputData(platform, "testTerminatingFetchPipe", "localhost:" + port, numPages, null);

        Pipe pipe = new Pipe("urlSource");
        BaseScoreGenerator scorer = new FixedScoreGenerator();
       
        FetcherPolicy policy = new FetcherPolicy();
        policy.setCrawlEndTime(System.currentTimeMillis() + 50000);
        // Assume we should only need 10ms for fetching all 10 URLs.
        policy.setRequestTimeout(10);
       
        BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);
       
        BasePath outputPath = makeOutputPath(platform, "testTerminatingFetchPipe");
        BasePath statusPath = platform.makePath(outputPath, "status");
        Tap status = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);

        // Finally we can run it.
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, status, fetchPipe.getStatusTailPipe());
        TestWebServer webServer = null;
       
        try {
            final int numBytes = 10000;
           
            // Pick a time way longer than the FetcherPolicy.getRequestTimeout().
            final long numMilliseconds = 100 * 1000L;
            webServer = new TestWebServer(new NoRobotsResponseHandler(numBytes, numMilliseconds), port);
            flow.complete();
        } finally {
            webServer.stop();
        }
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        int totalEntries = 0;
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            totalEntries += 1;

            // Verify we can convert properly
            StatusDatum sd = new StatusDatum(entry);
            Assert.assertEquals(UrlStatus.SKIPPED_INTERRUPTED, sd.getStatus());
        }
       

        // TODO CSc - re-enable this test, when termination really works.
        // Assert.assertEquals(numPages, totalEntries);
    }
   
    protected void testPayloads(BixoPlatform platform) throws Exception {
        Payload payload = new Payload();
        payload.put("key", "value");
        Tap in = makeInputData(platform, "testPayloads", 1, 1, payload);

        Pipe pipe = new Pipe("urlSource");
        BaseFetcher fetcher = new FakeHttpFetcher(false, 10);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);
       
        String output = "build/test/FetchPipeTest/dual";
        BasePath outputPath = platform.makePath(output);
        BasePath statusPath = platform.makePath(outputPath, "status");
        Tap status = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);
        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);

        // Finally we can run it.
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
        flow.complete();
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
       
        int totalEntries = 0;
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            totalEntries += 1;
           
            FetchedDatum datum = new FetchedDatum(entry);
            String payloadValue = (String)datum.getPayloadValue("key");
            Assert.assertNotNull(payloadValue);
            Assert.assertEquals("value", payloadValue);
        }
       
        Assert.assertEquals(1, totalEntries);
        tupleEntryIterator.close();
       
        validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        totalEntries = 0;
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            totalEntries += 1;

            // Verify we can convert properly
            StatusDatum sd = new StatusDatum(entry);
            Assert.assertEquals(UrlStatus.FETCHED, sd.getStatus());
            String payloadValue = (String)sd.getPayloadValue("key");
            Assert.assertNotNull(payloadValue);
            Assert.assertEquals("value", payloadValue);
        }
       
        Assert.assertEquals(1, totalEntries);
    }
   
    protected void testSkippingURLsByScore(BixoPlatform platform) throws Exception {
        // Create four pages, for domain0/page0, domain0/page1, domain1/page0, domain1/page1
        Tap in = makeInputData(platform, "testSkippingURLsByScore", 2, 2);

        Pipe pipe = new Pipe("urlSource");
        BaseFetcher fetcher = new FakeHttpFetcher(false, 1);
        BaseScoreGenerator scorer = new SkippedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);
       
        BasePath outputPath = makeOutputPath(platform, "testSkippingURLsByScore");
        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);
       
        // Finally we can run it.
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(null, content), fetchPipe);
        flow.complete();
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        Assert.assertTrue(tupleEntryIterator.hasNext());
        TupleEntry te = tupleEntryIterator.next();
        String url = te.getString(FetchedDatum.URL_FN);
        Assert.assertTrue(url.contains("bixo-test-domain-1.com/page-1.html"));
       
        // Should only be one resulting page (for domain 1, page 1).
        Assert.assertFalse(tupleEntryIterator.hasNext());
    }
   
    private static class SkippedScoreGenerator extends BaseScoreGenerator {

        /* Skip everything from the first domain, and then conditionally skip
         * URLs based on pattern, so some are rejected and some aren't.
         * (non-Javadoc)
         * @see bixo.operations.BaseScoreGenerator#generateScore(java.lang.String, java.lang.String, java.lang.String)
         */
        @Override
        public double generateScore(String domain, String pld, String url) {
            if (domain.equals("bixo-test-domain-0.com")) {
                return BaseScoreGenerator.SKIP_SCORE;
            } else if (url == null) {
                return 1.0;
            } else if (url.contains("page-0.html")) {
                return BaseScoreGenerator.SKIP_SCORE;
            } else {
                return 1.0;
            }
        }
    }
   
    protected void testDurationLimitSimple(BixoPlatform platform) throws Exception {
        // Pretend like we have 10 URLs from the same domain
        Tap in = makeInputData(platform, "testDurationLimitSimple", 1, 10);

        // Create the fetch pipe we'll use to process these fake URLs
        Pipe pipe = new Pipe("urlSource");
       
        // This will force all URLs to get skipped because of the crawl end time limit.
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlEndTime(0);
        BaseFetcher fetcher = new FakeHttpFetcher(false, 1, defaultPolicy);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy(defaultPolicy);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);

        // Create the output
        BasePath outputPath = makeOutputPath(platform, "testDurationLimitSimple");
        BasePath statusPath = platform.makePath(outputPath, "status");
        Tap statusSink = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);
        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap contentSink = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);

        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(statusSink, contentSink), fetchPipe);
        flow.complete();
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        Assert.assertFalse(tupleEntryIterator.hasNext());
       
        validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
       
        int numEntries = 0;
        while (tupleEntryIterator.hasNext()) {
            numEntries += 1;
            TupleEntry entry = tupleEntryIterator.next();
            StatusDatum status = new StatusDatum(entry);
            Assert.assertEquals(UrlStatus.SKIPPED_TIME_LIMIT, status.getStatus());
        }
       
        Assert.assertEquals(10, numEntries);
    }
   
    protected void testMaxUrlsPerServer(BixoPlatform platform) throws Exception {
        // Pretend like we have 2 URLs from the same domain
        final int sourceUrls = 2;
        Tap in = makeInputData(platform, "testMaxUrlsPerServer", 1, sourceUrls);

        // Create the fetch pipe we'll use to process these fake URLs
        Pipe pipe = new Pipe("urlSource");
       
        // This will limit us to one URL.
        final int maxUrls = 1;
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        BaseFetcher fetcher = new FakeHttpFetcher(false, 1, defaultPolicy);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy(defaultPolicy.getMaxRequestsPerConnection(), maxUrls, BaseFetchJobPolicy.DEFAULT_CRAWL_DELAY);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);

        // Create the output
        BasePath outputPath = makeOutputPath(platform, "testMaxUrlsPerServer");
        BasePath statusPath = platform.makePath(outputPath, "status");
        Tap statusSink = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);
        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap contentSink = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);

        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(statusSink, contentSink), fetchPipe);
        flow.complete();
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        Assert.assertTrue(tupleEntryIterator.hasNext());
        tupleEntryIterator.next();
        Assert.assertFalse(tupleEntryIterator.hasNext());

        validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
       
        int numSkippedEntries = 0;
        int numFetchedEntries = 0;
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            StatusDatum status = new StatusDatum(entry);
            if (status.getStatus() == UrlStatus.SKIPPED_PER_SERVER_LIMIT) {
                numSkippedEntries += 1;
            } else if (status.getStatus() == UrlStatus.FETCHED) {
                numFetchedEntries += 1;
            } else {
                Assert.fail("Unexpected status: " + status.getStatus());
            }
        }
       
        Assert.assertEquals(numFetchedEntries, maxUrls);
        Assert.assertEquals(numSkippedEntries, sourceUrls - maxUrls);
    }
   
    // TODO KKr- re-enable this test when we know how to make it work for
    // the new fetcher architecture.
    /**
    @Test
    public void testPassingAllStatus() throws Exception {
        // Pretend like we have 10 URLs from one domain, to match the
        // 10 cases we need to test.
        Lfs in = makeInputData(1, 10);

        // Create the fetch pipe we'll use to process these fake URLs
        Pipe pipe = new Pipe("urlSource");
       
        // We need to skip things for all the SKIPPED/ABORTED/ERROR reasons in
        // UrlStatus, plus one of the HTTP reasons. Note that we don't do
        // SKIPPED_TIME_LIMIT, since that's hard to test in the middle of testing
        // everything else.
//        SKIPPED_BLOCKED,            // Blocked by robots.txt
//        SKIPPED_UNKNOWN_HOST,       // Hostname couldn't be resolved to IP address
//        SKIPPED_INVALID_URL,        // URL invalid
//        SKIPPED_DEFERRED,           // Deferred because robots.txt couldn't be processed.
//        SKIPPED_BY_SCORER,          // Skipped explicitly by scorer
//        SKIPPED_BY_SCORE,           // Skipped because score wasn't high enough
//        ABORTED_SLOW_RESPONSE,
//        ABORTED_INVALID_MIMETYPE
//        HTTP_NOT_FOUND,
//        ERROR_INVALID_URL,
//        ERROR_IOEXCEPTION,

        FetchPipe fetchPipe = new FetchPipe(pipe, new CustomGrouper(), new CustomScorer(), new CustomFetcher());

        // Create the output
        String outputPath = DEFAULT_OUTPUT_PATH;
        Tap statusSink = new Lfs(new SequenceFile(StatusDatum.FIELDS), outputPath + "/status", true);
        Tap contentSink = new Lfs(new SequenceFile(FetchedDatum.FIELDS), outputPath + "/content", true);

        FlowConnector flowConnector = new FlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(statusSink, contentSink), fetchPipe);
        flow.complete();
       
        Lfs validate = new Lfs(new SequenceFile(FetchedDatum.FIELDS), outputPath + "/content");
        TupleEntryIterator tupleEntryIterator = validate.openForRead(new JobConf());
        Assert.assertFalse(tupleEntryIterator.hasNext());
       
        validate = new Lfs(new SequenceFile(StatusDatum.FIELDS), outputPath + "/status");
        tupleEntryIterator = validate.openForRead(new JobConf());
       
        int numStatus = UrlStatus.values().length;
        boolean returnedStatus[] = new boolean[numStatus];
       
        Fields metaDataFields = new Fields();
        int numEntries = 0;
        while (tupleEntryIterator.hasNext()) {
            numEntries += 1;
            TupleEntry entry = tupleEntryIterator.next();
            StatusDatum status = new StatusDatum(entry);
            int ordinal = status.getStatus().ordinal();
            Assert.assertFalse(returnedStatus[ordinal]);
            returnedStatus[ordinal] = true;
        }
       
        Assert.assertEquals(10, numEntries);
    }
    */
   
    /**
    @SuppressWarnings("serial")
    private static class RandomScoreGenerator implements IScoreGenerator {

        private double _minScore;
        private double _maxScore;
        private Random _rand;
       
        public RandomScoreGenerator(double minScore, double maxScore) {
            _minScore = minScore;
            _maxScore = maxScore;
            _rand = new Random();
        }
     * @throws Exception
       
        @Override
        public double generateScore(GroupedUrlDatum urlTuple) {
            double range = _maxScore - _minScore;
           
            return _minScore + (_rand.nextDouble() * range);
        }
    }
    **/
   
    private Tap makeInputData(BasePlatform platform, String testname, int numDomains, int numPages) throws Exception {
        return makeInputData(platform, testname, numDomains, numPages, null);
    }
   
    private Tap makeInputData(BasePlatform platform, String testname, int numDomains, int numPages, Payload payload) throws Exception {
        String platformName = platform.getClass().getSimpleName();
        BasePath defaultPath = platform.makePath(BASE_INPUT_PATH + testname + "/" + platformName + "/in");
        Tap in = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), defaultPath, SinkMode.REPLACE);
        TupleEntryCollector write = in.openForWrite(platform.makeFlowProcess());
        for (int i = 0; i < numDomains; i++) {
            for (int j = 0; j < numPages; j++) {
                // Use special domain name pattern so code deep inside of operations "knows" not
                // to try to resolve host names to IP addresses.
                write.add(makeTuple("bixo-test-domain-" + i + ".com", j, payload));
            }
        }
       
        write.close();
        return in;
    }
   
    private Tap makeInputData(BasePlatform platform, String testname, String domain, int numPages, Payload payload) throws Exception {
        String platformName = platform.getClass().getSimpleName();
        BasePath defaultPath = platform.makePath(BASE_INPUT_PATH + testname + "/" + platformName + "/in");
        Tap in = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), defaultPath, SinkMode.REPLACE);
        TupleEntryCollector write = in.openForWrite(platform.makeFlowProcess());
        for (int j = 0; j < numPages; j++) {
            write.add(makeTuple(domain, j, payload));
        }

        write.close();
        return in;
    }
   
    private BasePath makeOutputPath(BasePlatform platform, String testname) throws Exception {
        String platformName = platform.getClass().getSimpleName();
        return platform.makePath(BASE_OUTPUT_PATH + testname + "/" + platformName + "/out");
    }


    private Tuple makeTuple(String domain, int pageNumber, Payload payload) {
        UrlDatum url = new UrlDatum("http://" + domain + "/page-" + pageNumber + ".html?size=10");
        url.setPayload(payload);
        return url.getTuple();
    }
   
    private static class NoRobotsResponseHandler extends RandomResponseHandler {

        public NoRobotsResponseHandler() {
            super(1000, 10);
        }
       
        public NoRobotsResponseHandler(int length, long duration) {
            super(length, duration);
        }
       
        @Override
        public void handle(String pathInContext, Request baseRequest, HttpServletRequest request, HttpServletResponse response) throws HttpException, IOException {
            if (pathInContext.endsWith("/robots.txt")) {
                throw new HttpException(HttpStatus.SC_NOT_FOUND, "No robots.txt");
            } else {
                super.handle(pathInContext, baseRequest, request, response);
            }
        }
    }
   
    @SuppressWarnings("unused")
    private static class NoRobotsHtmlResponseHandler extends AbstractHandler {

        @Override
        public void handle(String pathInContext, Request baseRequest, HttpServletRequest request, HttpServletResponse response) throws HttpException, IOException {
            if (pathInContext.endsWith("/robots.txt")) {
                throw new HttpException(HttpStatus.SC_NOT_FOUND, "No robots.txt");
            } else {
                final String template = "<htm><head><title>%s</title></head><body></body></html>";
               
                String htmlResponse = String.format(template, pathInContext.substring(1));
                byte[] content = htmlResponse.getBytes("UTF-8");
                response.setContentLength(content.length);
                response.setContentType("text/html; charset=UTF-8");
                response.setStatus(200);
               
                OutputStream os = response.getOutputStream();
                os.write(content);
                os.flush();
            }
        }
    }
   
    /***********************************************************************
     * Lots of ugly custom classes to support serializable "mocking" for a
     * particular test case. Mockito mocks aren't serializable,
     * or at least I couldn't see an easy way to make this work.
     */

    @SuppressWarnings({"unused" })
    private static class CustomGrouper extends BaseGroupGenerator {

        @Override
        public String getGroupingKey(UrlDatum urlDatum) {
            String url = urlDatum.getUrl();
            if (url.contains("page-0")) {
                return GroupingKey.BLOCKED_GROUPING_KEY;
            } else if (url.contains("page-1")) {
                return GroupingKey.UNKNOWN_HOST_GROUPING_KEY;
            } else if (url.contains("page-2")) {
                return GroupingKey.INVALID_URL_GROUPING_KEY;
            } else if (url.contains("page-3")) {
                return GroupingKey.DEFERRED_GROUPING_KEY;
            } else if (url.contains("page-4")) {
                return GroupingKey.SKIPPED_GROUPING_KEY;
            } else {
                return GroupingKey.makeGroupingKey("domain-0.com", 30000);
            }
        }
    };
   
    /**
    @SuppressWarnings("serial")
    private static class CustomScorer implements IScoreGenerator {

        @Override
        public double generateScore(GroupedUrlDatum urlDatum) {
            String url = urlDatum.getUrl();
            if (url.contains("page-5")) {
                return 0.0;
            } else {
                return 10.0;
            }
        }
    };
    **/
   
    private static class MaxUrlFetcherPolicy extends FetcherPolicy {
        private int _maxUrls;
       
        public MaxUrlFetcherPolicy(int maxUrls) {
            super();

            _maxUrls = maxUrls;
        }
       
        @Override
        public int getMaxUrls() {
            return _maxUrls;
        }
    }
   
    @SuppressWarnings({"unused" })
    private static class CustomFetcher extends BaseFetcher {

        public CustomFetcher() {
            super(1, new MaxUrlFetcherPolicy(4), ConfigUtils.BIXO_TEST_AGENT);
        }
       
        @Override
        public FetchedDatum get(ScoredUrlDatum scoredUrl) throws BaseFetchException {
            String url = scoredUrl.getUrl();
            if (url.contains("page-6")) {
                throw new AbortedFetchException(url, AbortedFetchReason.SLOW_RESPONSE_RATE);
            } else if (url.contains("page-7")) {
                throw new HttpFetchException(url, "msg", HttpStatus.SC_GONE, new HttpHeaders());
            else if (url.contains("page-8")) {
                throw new IOFetchException(url, new IOException());
            } else if (url.contains("page-9")) {
                throw new UrlFetchException(url, "msg");
            } else {
                throw new RuntimeException("Unexpected page");
            }
        }

      @Override
      public void abort() {
          // Do nothing
      }

    };


   

}
TOP

Related Classes of bixo.pipes.AbstractFetchPipeTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.