Examples of com.flaptor.hounder.crawler.modules.FetchDocument

Package com.flaptor.hounder.crawler.modules

Examples of com.flaptor.hounder.crawler.modules.FetchDocument

com.flaptor.hounder.crawler.modules.FetchDocument
@author Flaptor Development Team


        Iterator<FetchDocument> iter = fetchdata.iterator();


        // Feed the thread pool queue.
        while (iter.hasNext() && Crawler.running()) {
            FetchDocument doc = iter.next();
            Runnable processorJob = new ProcessorJob(doc, oldPageDB, newPageDB);
            pool.execute(processorJob);
        }


        // So the fetchdata ended, or the crawler is no longer running.

View Full Code Here

                    // At this point, we don't have the fetched data, but we do
                    // have the original and the new url, so we store the mapping.
                    redirects.put(rec.newurl, rec.origurl);
                } else {
                    // This is a non-redirected page.
                    fetchdata.addDoc(new FetchDocument(page, rec.origurl, rec.content, rec.header, success(rec), recoverable(rec), internalError(rec), true));
                }
            }
        }
        // Now go through the redirects.
        for (SegmentRecord rec : unknownPages) {
            FetchDocument doc = null;
            if (redirects.containsKey(rec.newurl)) {
                rec.origurl = redirects.get(rec.newurl);
                page = fetchlist.getPage(rec.origurl);
                if (null != page) {
                    // Override URL with fetched URL if the fetcher is configured to do so
                    if (!keepUrl) {
                        try {
                            page.setUrl(rec.newurl);
                        } catch (MalformedURLException e) {
                            logger.debug("Malformed redirect url. Keeping original url.",e);
                        }
                    }
                    // finally we could reconstruct the redirect and can now store the page.
                    doc = new FetchDocument(page, rec.origurl, rec.content, rec.header, success(rec), recoverable(rec), internalError(rec), true);
                }
                if (null != doc) {
                    fetchdata.addDoc(doc);
                } else {
                    logger.error("Unknown page fetched. This is a bug in Nutch9Fetcher.");

View Full Code Here

                    Map<String,String> header = new HashMap<String,String>(); // this info is lost, it should be stored in the cache along with the page contents.
                    boolean success = true;
                    boolean recoverable = true;
                    boolean internalError = false;
                    boolean changed = false;
                    doc = new FetchDocument(page, url, content, header, success, recoverable, internalError, changed);
                }
            }
        }

View Full Code Here

            return (null != doc);
        }




        public synchronized FetchDocument next () {
            FetchDocument ret = doc;
            if (null == doc) {  
                throw new NoSuchElementException("No more pages in the pagedb and cache");
            }
            advance();
            return ret;

View Full Code Here

        cbv = new ConstantBoostValue(config);
    }


    @TestInfo(testType = TestInfo.TestType.UNIT)
    public void testHasValue() throws Exception {
        FetchDocument doc = new FetchDocument(new Page("",1f));
        assertTrue(cbv.hasValue(doc));
    }

View Full Code Here

        assertTrue(cbv.hasValue(doc));
    }


    @TestInfo(testType = TestInfo.TestType.UNIT)
    public void testGetValue() throws Exception {
        FetchDocument doc = new FetchDocument(new Page("",1f));
        assertTrue(new Double(1000).equals(cbv.getValue(doc)));
    }

View Full Code Here

                text = TestUtils.randomText(5,50);
                title = TestUtils.randomText(2,5);
                content = text.getBytes();
                header.put("length",String.valueOf(content.length));
            }
            FetchDocument doc = new FetchDocument(page, url, text, title, links, content, header, success, recoverable, internalError, changed);
            fetchdata.addDoc(doc);
        }
        return fetchdata;
    }

View Full Code Here

    }


    @TestInfo(testType = TestInfo.TestType.UNIT)
    public void testApplyBoost() throws MalformedURLException {
        Page page = new Page("http://test.flaptor.com",1f);
        FetchDocument doc = new FetchDocument(page);
        kbm.applyBoost(doc,times);


        String text = doc.getIndexableAttribute(field).toString();
        int found = 0;
        for (String value : text.split(" ")) {
            if (value.equals(keyword)){
                found++;
            }

View Full Code Here

TOP

Related Classes of com.flaptor.hounder.crawler.modules.FetchDocument

com.flaptor.hounder.crawler.FetchdataProcessor

com.flaptor.hounder.crawler.modules.boost.ConstantBoostValueTest

com.flaptor.hounder.crawler.modules.boost.KeywordBoostMethodTest

com.flaptor.hounder.crawler.Nutch9Fetcher

com.flaptor.hounder.crawler.PageCache$PageCacheIterator

com.flaptor.hounder.crawler.pagedb.Link

com.flaptor.hounder.crawler.SimFetcher

com.flaptor.util.Config

com.flaptor.util.parser.HtmlParser

com.flaptor.util.parser.IParser

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.