Package com.googlecode.flaxcrawler.model

Examples of com.googlecode.flaxcrawler.model.Page
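
The snippets below are taken from flaxcrawler's downloader, crawler, and test sources and show how a Page is produced, parsed, and inspected. As an orientation, here is a minimal sketch assembled from those snippets (class wrappers and imports are omitted, just as in the excerpts that follow; every class and accessor name used here appears in the snippets further down):

        // Wire up a crawler as in the DefaultCrawler test below
        DefaultCrawler crawler = new DefaultCrawler();
        crawler.setDownloaderController(new DefaultDownloaderController());
        crawler.setParserController(new DefaultParserController());

        // Crawl a single task; the result is a Page holding the response and extracted links
        CrawlerTask crawlerTask = new CrawlerTask("http://www.wikipedia.org/", 0);
        Page page = crawler.crawl(crawlerTask);

        // Everything the later snippets read from a Page
        System.out.println("Response code: " + page.getResponseCode());
        System.out.println("Response time: " + page.getResponseTime());
        System.out.println("Content length: " + page.getContent().length);
        System.out.println("Content charset: " + page.getCharset());
        System.out.println("Links found: " + page.getLinks().size());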


    /**
     * Downloads the page at the specified URL, retrying up to triesCount times
     * and routing requests through a proxy when a proxy controller is configured.
     *
     * @param url the URL to download
     * @return the downloaded {@link Page}; for redirects and persistent error
     *         responses the result of the HEAD request is returned instead
     * @throws DownloadException if the page could not be downloaded
     */
    public Page download(URL url) throws DownloadException {
        Request request = createRequest(url);

        Page page = null;

        for (int i = 0; i < triesCount; i++) {
            try {
                log.debug("Downloading from " + url + ", try number " + (i + 1));

                // Using proxy
                Proxy proxy = proxyController == null ? null : proxyController.getProxy();

                // If head request is needed - executing and checking constraints
                if (headRequest) {
                    // Sending HEAD request using specified proxy
                    Page headPage = headRequest(request, proxy);

                    // This is a redirect
                    if (headPage != null && headPage.getResponseCode() >= 300 && headPage.getResponseCode() < 400) {
                        log.debug("Server redirected our request to " + url);
                        // No need to request it again
                        return headPage;
                    }

                    // Error response code
                    // Error response code (guard against a null HEAD response to avoid an NPE)
                    if (headPage != null && headPage.getResponseCode() >= 400) {
                        log.debug("Cannot download " + request.getUrl() + (proxy == null ? "" : " through proxy " + proxy) + ", response code is " + headPage.getResponseCode());
                        if (i == (triesCount - 1)) {
                            return headPage;
                        } else {
                            waitForRetry(request);
                            continue;
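
When an attempt fails with an error code, the loop above calls waitForRetry(request) before trying again; its body is not part of this excerpt. Purely as an illustration of the idea (the fixed one-second delay is an assumption, not flaxcrawler's actual behaviour), a sketch could look like this:

    // Hypothetical sketch only - not the actual flaxcrawler implementation of waitForRetry.
    private void waitForRetry(Request request) {
        long retryDelayMs = 1000; // assumed politeness delay between attempts
        try {
            log.debug("Waiting " + retryDelayMs + " ms before retrying " + request.getUrl());
            Thread.sleep(retryDelayMs);
        } catch (InterruptedException e) {
            // Restore the interrupt flag and give up waiting
            Thread.currentThread().interrupt();
        }
    }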


    /**
     * Assembles a {@link Page} from the downloaded response data.
     *
     * @param request         the request that was executed
     * @param content         raw response body
     * @param responseCode    HTTP response code returned by the server
     * @param responseHeaders response headers returned by the server
     * @param encoding        content encoding of the response
     * @param responseTime    time taken to receive the response
     * @return the created {@link Page}
     */
    protected Page createPage(Request request, byte[] content, int responseCode, Map<String, String> responseHeaders, String encoding, long responseTime) {
        log.debug("Response code from " + request.getUrl() + " is " + responseCode);
        Page page = new Page(request.getUrl(), responseHeaders, responseCode, encoding, responseTime, content);
        return page;
    }
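
createPage simply forwards its arguments to the six-argument Page constructor, so a Page can also be built directly, which is handy in tests. A minimal sketch assuming only the constructor call visible above (the URL, header values, and timings are made-up sample data, and the first constructor argument - shown as request.getUrl() above - is assumed to be a java.net.URL):

        // Build a Page by hand, mirroring the constructor call in createPage above
        URL url = new URL("http://www.wikipedia.org/");
        byte[] content = "<html><body>sample</body></html>".getBytes("UTF-8");

        Map<String, String> responseHeaders = new HashMap<String, String>();
        responseHeaders.put("content-type", "text/html; charset=UTF-8");

        Page page = new Page(url, responseHeaders, 200, "UTF-8", 42, content);

        assertNotNull(page);
        assertEquals(200, page.getResponseCode());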

                deferCrawlerTask(crawlerTask);
                return;
            }

            try {
                Page page = crawler.crawl(crawlerTask);
                processPage(page, crawlerTask);
            } finally {
                log.debug("Stopping processing task " + crawlerTask.getUrl());
            }
        }
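
deferCrawlerTask is how the worker above puts a task aside when it cannot be crawled yet; its implementation is outside this excerpt. A purely hypothetical sketch of the idea, using a plain java.util.Queue instead of whatever scheduling flaxcrawler actually uses:

    // Hypothetical illustration only - flaxcrawler's real deferral mechanism is not shown here.
    private final Queue<CrawlerTask> deferredTasks = new ConcurrentLinkedQueue<CrawlerTask>();

    private void deferCrawlerTask(CrawlerTask crawlerTask) {
        log.debug("Deferring task " + crawlerTask.getUrl());
        deferredTasks.add(crawlerTask);
    }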

        DefaultCrawler crawler = new DefaultCrawler();
        crawler.setDownloaderController(new DefaultDownloaderController());
        crawler.setParserController(new DefaultParserController());

        CrawlerTask crawlerTask = new CrawlerTask("http://www.wikipedia.org/", 0);
        Page page = crawler.crawl(crawlerTask);

        assertNotNull(page);
        assertTrue(page.getLinks().size() > 0);
    }

        DefaultDownloader downloader = new DefaultDownloader();
        downloader.setProxyController(proxyController);
        downloader.setTriesCount(3);

        Page page = downloader.download(new URL("http://vipzone.ws"));
        assertNotNull(page);

        DefaultParser parser = new DefaultParser();
        parser.parse(page);

        assertNotNull(page.getLinks());

        for (URL url : page.getLinks()) {
            System.out.println(url.toString());
        }
    }
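
Page.getLinks() yields java.net.URL objects (that is what the loop above prints), so standard URL accessors can be used to post-process them, for example to keep only links on the same host as the downloaded page. A small sketch reusing the page variable from the test above (startUrl just repeats the URL passed to download):

        // Keep only links that stay on the same host as the downloaded page
        URL startUrl = new URL("http://vipzone.ws");
        List<URL> internalLinks = new ArrayList<URL>();

        for (URL link : page.getLinks()) {
            if (startUrl.getHost().equalsIgnoreCase(link.getHost())) {
                internalLinks.add(link);
            }
        }

        System.out.println("Internal links: " + internalLinks.size());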

        URL url = new URL(crawlerTask.getUrl());
        log.debug("Getting downloader for " + url + "...");
        Downloader downloader = downloaderController.getDownloader(url);
        log.debug("Downloading from " + url + "...");

        Page page = null;

        try {
            page = downloader.download(url);

            if (page != null) {
                crawlerTask.setTimeDownloaded(new Date());

                if (page.getResponseCode() == HttpURLConnection.HTTP_OK) {
                    log.debug(url + " was downloaded successfully. Download time " + page.getResponseTime());
                    Parser parser = parserController.getParser(page);
                    log.debug("Parsing " + url);
                    parser.parse(page);
                    crawlerTask.setTimeParsed(new Date());
                    log.debug(url + " has been parsed. " + page.getLinks().size() + " links were found");
                }
            }
        } finally {
            afterCrawl(crawlerTask, page);
        }
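
Note that the crawl above only parses the page when the response code is HttpURLConnection.HTTP_OK; redirects and error responses leave the Page unparsed, so callers should check the code before relying on getLinks(). A small sketch of such a guard, using only names that appear in the snippet above:

        // Only a 200 response was parsed above, so guard before touching the links
        if (page != null && page.getResponseCode() == HttpURLConnection.HTTP_OK) {
            for (URL link : page.getLinks()) {
                System.out.println("Found link: " + link);
            }
        } else if (page != null) {
            log.debug("Skipping response code " + page.getResponseCode() + " for " + crawlerTask.getUrl());
        }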

        downloader.setLoginUrl("http://qiq.ru/?action=login");
        downloader.setPostData("login=lex9889&pass=qweasdzxc");
        downloader.setProxyController(proxyController);
        downloader.setTriesCount(3);

        Page page = downloader.download(new URL("http://qiq.ru/"));
        assertNotNull(page);

        System.out.println("Response time: " + page.getResponseTime());
        System.out.println("Content length: " + page.getContent().length);
        System.out.println("Response code: " + page.getResponseCode());
        System.out.println("Content charset: " + page.getCharset());
        System.out.println("Content encoding: " + page.getContentEncoding());

    }
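
getContent() returns the raw response body as a byte array and getCharset() reports the charset detected for it, so turning the body into text is straightforward. A small sketch, assuming getCharset() returns the charset name as a String, that getContent() already holds the uncompressed body, and falling back to UTF-8 when no charset was detected:

        // Decode the downloaded body into text using the charset reported by the page
        String charset = page.getCharset() != null ? page.getCharset() : "UTF-8";
        String html = new String(page.getContent(), charset);
        System.out.println("First 200 chars: " + html.substring(0, Math.min(200, html.length())));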

        DefaultDownloader downloader = new DefaultDownloader();
        downloader.setProxyController(proxyController);
        downloader.setTriesCount(3);

        Page page = downloader.download(new URL("http://www.wikipedia.org"));
        assertNotNull(page);

        System.out.println("Response time: " + page.getResponseTime());
        System.out.println("Content length: " + page.getContent().length);
        System.out.println("Response code: " + page.getResponseCode());
        System.out.println("Content charset: " + page.getCharset());
        System.out.println("Content encoding: " + page.getContentEncoding());
    }
