Examples of FetchedDatum


Examples of bixo.datum.FetchedDatum

    public static BaseRobotRules getRobotRules(BaseFetcher fetcher, BaseRobotsParser parser, URL robotsUrl) {
       
        try {
            String urlToFetch = robotsUrl.toExternalForm();
            ScoredUrlDatum scoredUrl = new ScoredUrlDatum(urlToFetch);
            FetchedDatum result = fetcher.get(scoredUrl);

            // HACK! DANGER! Some sites will redirect the request to the top-level domain
            // page, without returning a 404. So look for a response which has a redirect,
            // and the fetched content is not plain text, and assume it's one of these...
            // which is the same as not having a robots.txt file.
           
            String contentType = result.getContentType();
            boolean isPlainText = (contentType != null) && (contentType.startsWith("text/plain"));
            if ((result.getNumRedirects() > 0) && !isPlainText) {
                return parser.failedFetch(HttpStatus.SC_GONE);
            }
           
            return parser.parseContent(urlToFetch, result.getContentBytes(), result.getContentType(),
                            fetcher.getUserAgent().getAgentName());
        } catch (HttpFetchException e) {
            return parser.failedFetch(e.getHttpStatus());
        } catch (IOFetchException e) {
            return parser.failedFetch(HttpStatus.SC_INTERNAL_SERVER_ERROR);
View Full Code Here

Examples of bixo.datum.FetchedDatum

        return _flowProcess;
    }
   
    private void skipUrls(List<ScoredUrlDatum> urls, UrlStatus status, String traceMsg) {
        for (ScoredUrlDatum datum : urls) {
            FetchedDatum result = new FetchedDatum(datum);
            Tuple tuple = result.getTuple();
            tuple.add(status.toString());
            _collector.add(BixoPlatform.clone(tuple, _flowProcess));
        }

        _flowProcess.increment(FetchCounters.URLS_SKIPPED, urls.size());
View Full Code Here

Examples of bixo.datum.FetchedDatum

            // TODO KKr - when fetching the last item, send a Connection: close
            // header to let the server know it doesn't need to keep the socket open.
            Iterator<ScoredUrlDatum> iter = _items.iterator();
            while (!Thread.interrupted() && iter.hasNext()) {
                ScoredUrlDatum item = iter.next();
                FetchedDatum result = new FetchedDatum(item);
               
                // We use status as an extra field on the end of FetchedDatum that lets
                // us generate a full status pipe, and also a content pipe that only has
                // entries which were fetched. By keying off the type (string == OK,
                // BaseFetchException == bad) the FetchPipe can do this magic.
                Comparable status = null;

                long fetchStartTime = System.currentTimeMillis();
               
                try {
                    process.increment(FetchCounters.URLS_FETCHING, 1);
                    result = _httpFetcher.get(item);
                    long deltaTime = System.currentTimeMillis() - fetchStartTime;

                    process.increment(FetchCounters.FETCHED_TIME, (int)deltaTime);
                    process.increment(FetchCounters.URLS_FETCHED, 1);
                    process.increment(FetchCounters.FETCHED_BYTES, result.getContentLength());
                    process.setStatus(Level.SLF4J_TRACE, "Fetched " + result);

                    status = UrlStatus.FETCHED.toString();
                   
                    // TODO - check keep-alive response (if present), and close the connection/delay
                    // for some amount of time if we exceed this limit.
                } catch (AbortedFetchException e) {
                    LOGGER.info("Aborted while fetching " + item.getUrl() + " due to " + e.getAbortReason());
                    if (e.getAbortReason() == AbortedFetchReason.INTERRUPTED) {
                        process.increment(FetchCounters.URLS_SKIPPED, 1);
                       
                        // Make sure our loop terminates.
                        Thread.currentThread().interrupt();
                    } else {
                        process.increment(FetchCounters.URLS_FAILED, 1);
                    }
                   
                    status = (Comparable)e;
                } catch (BaseFetchException e) {
                    LOGGER.info("Fetch exception while fetching " + item.getUrl(), e);
                    process.increment(FetchCounters.URLS_FAILED, 1);

                    // We can do this because each of the concrete subclasses of BaseFetchException implements
                    // WritableComparable
                    status = (Comparable)e;
                } catch (Exception e) {
                    LOGGER.warn("Unexpected exception while fetching " + item.getUrl(), e);

                    process.increment(FetchCounters.URLS_FAILED, 1);
                    status = new IOFetchException(item.getUrl(), new IOException(e));
                } finally {
                    process.decrement(FetchCounters.URLS_FETCHING, 1);

                    Tuple tuple = result.getTuple();
                    tuple.add(status);
                    _fetchMgr.collect(tuple);

                    // Figure out how long it's been since the start of the request.
                    long fetchInterval = System.currentTimeMillis() - fetchStartTime;

                    // We want to avoid fetching faster than a max acceptable rate. Note that we always do
                    // this, even if there's not another page, so that this setting will have impact even
                    // if the next fetch set is ready right away.
                    if (fetchInterval < minPageFetchInterval) {
                        long delay = minPageFetchInterval - fetchInterval;
                        LOGGER.trace(String.format("FetchTask: sleeping for %dms", delay));

                        try {
                            Thread.sleep(delay);
                        } catch (InterruptedException e) {
                            LOGGER.warn("FetchTask interrupted!");
                            Thread.currentThread().interrupt();
                            continue;
                        }
                    }
                }
            }
           
            // While we still have entries, we need to write them out to avoid losing them.
            while (iter.hasNext()) {
                ScoredUrlDatum item = iter.next();
                FetchedDatum result = new FetchedDatum(item);
                process.increment(FetchCounters.URLS_SKIPPED, 1);
                AbortedFetchException status = new AbortedFetchException(item.getUrl(), AbortedFetchReason.INTERRUPTED);
               
                Tuple tuple = result.getTuple();
                tuple.add(status);
               _fetchMgr.collect(tuple);
            }
        } catch (Throwable t) {
            LOGGER.error("Exception while fetching", t);
View Full Code Here

Examples of bixo.datum.FetchedDatum

        @SuppressWarnings("rawtypes")
        @Override
        public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
            TupleEntry entry = funcCall.getArguments();
            FetchedDatum fd = new FetchedDatum(entry);
           
            // Get the fetch status that we hang on the end of the tuple,
            // after all of the FetchedDatum fields.
            Object result = entry.getObject(_fieldPos);
            StatusDatum status;
           
            // Note: Here we share the payload of the FetchedDatum with the
            // StatusDatum we're about to emit, but since we let go after we
            // emit, there shouldn't be an issue with this sharing.
            if (result instanceof String) {
                UrlStatus urlStatus = UrlStatus.valueOf((String)result);
                if (urlStatus == UrlStatus.FETCHED) {
                    status = new StatusDatum(fd.getUrl(), fd.getHeaders(), fd.getHostAddress(), fd.getPayload());
                } else {
                    status = new StatusDatum(fd.getUrl(), urlStatus, fd.getPayload());
                }
            } else if (result instanceof BaseFetchException) {
                status = new StatusDatum(fd.getUrl(), (BaseFetchException)result, fd.getPayload());
            } else {
                throw new RuntimeException("Unknown type for fetch status field: " + result.getClass());
            }
           
            funcCall.getOutputCollector().add(BixoPlatform.clone(status.getTuple(), process));
View Full Code Here

Examples of bixo.datum.FetchedDatum

       
        // Set the location to a fixed value, so that when we're processing entries from
        // the URL DB that might have been set using fake content, we know to ignore the
        // refetch time if we're doing a real fetch.
        headers.add(HttpHeaderNames.CONTENT_LOCATION, FAKE_CONTENT_LOCATION);
        FetchedDatum result = new FetchedDatum(url, url, System.currentTimeMillis(), headers, new ContentBytes(content), "text/html", 100000);
        result.setPayload(payload);
        return result;
    }
View Full Code Here

Examples of bixo.datum.FetchedDatum

            return false;
        }
       
        @Override
        public void operate(FlowProcess flowProcess, FunctionCall<NullContext> functionCall) {
            FetchedDatum fetchedDatum = new FetchedDatum(functionCall.getArguments());
           
            try {
                ParsedDatum parseResult = _parser.parse(fetchedDatum);
                _flowProcess.increment(ParserCounters.DOCUMENTS_PARSED, 1);
                functionCall.getOutputCollector().add(BixoPlatform.clone(parseResult.getTuple(), flowProcess));
            } catch (Exception e) {
                LOGGER.warn("Error processing " + fetchedDatum.getUrl(), e);
                _flowProcess.increment(ParserCounters.DOCUMENTS_FAILED, 1);
                // TODO KKr - don't lose datums for documents that couldn't be parsed
            }
        }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle Inc. Contact coftware#gmail.com.