Package: com.flaptor.hounder.crawler.pagedb

Usage examples of com.flaptor.hounder.crawler.pagedb.PageDB.open():


        Crawler crawler = new Crawler();
        Config config = Config.getConfig("crawler.properties");
        UrlPatterns hotspots = new UrlPatterns(config.getString("hotspot.file"));
        PageFilter pageFilter = new PageFilter(0, new int[]{3}, 0L, 0L, 0L);
        PageDB destPageDB = new PageDB(name+".test");
        destPageDB.open(PageDB.WRITE+PageDB.UNSORTED);
        PageDB pdb = new PageDB(name);
        pdb.open(PageDB.READ);
        long count = 0;
        long start = System.currentTimeMillis();
        for (Page p : pdb) {
View Full Code Here


        UrlPatterns hotspots = new UrlPatterns(config.getString("hotspot.file"));
        PageFilter pageFilter = new PageFilter(0, new int[]{3}, 0L, 0L, 0L);
        PageDB destPageDB = new PageDB(name+".test");
        destPageDB.open(PageDB.WRITE+PageDB.UNSORTED);
        PageDB pdb = new PageDB(name);
        pdb.open(PageDB.READ);
        long count = 0;
        long start = System.currentTimeMillis();
        for (Page p : pdb) {
            String url = p.getUrl();
            pageFilter.shouldWrite(destPageDB, p);
View Full Code Here

            logger.error("Can't redistribute a non-distributed PageDB");
        } else {
            try {
                PageDB oldPageDB = new PageDB(pagedbDir);
                DPageDB newPageDB = new DPageDB(pagedbDir+".new");
                oldPageDB.open(PageDB.READ);
                newPageDB.open(DPageDB.WRITE + DPageDB.UNSORTED);
                long total = oldPageDB.getSize();
                long done = 0;
                for (Page page : oldPageDB) {
                    newPageDB.addPage(page);
View Full Code Here

                // There is a pagedb file, and the factory is doing nothing.
                if (injectedPageDB.exists() && (null == factory)) {
                    try {
                        PageDB pageDB = new PageDB(injectedPagedbDir);
                        pageDB.open(PageDB.READ);
                        factory = new FetchlistFactory(pageDB,tmpPageDB,progress);

                        FetchList fetchlist = factory.getNextFetchlist();
                        while (null != fetchlist && running() && !cycleFinished) {
                            injectedFetchlistQueue.enqueueNoBlock(fetchlist);
View Full Code Here

        return size;
    }

    private Set<String> pageDBlist () throws Exception {
        PageDB db = new PageDB(tmpDir+"/testdb");
        db.open(PageDB.READ);
        Set<String> res = new HashSet<String>();
        for (Page page : db) {
            String url = page.getUrl();
            String[] part = url.split("[0-9]");
            int start = part[0].length();
View Full Code Here

            server = new WebServer(8085);
            server.addResourceHandler("/", tmpDir+"/web");
            server.start();

            PageDB db = new PageDB(tmpDir+"/testdb");
            db.open(PageDB.WRITE);
            db.addPage(in);
            db.close();

            crawler = new Crawler();
View Full Code Here

            do {
                tries++;

                crawler.crawl(1);

                db.open(PageDB.READ);
                Iterator<Page> pages = db.iterator();
                assertTrue("The crawler lost or discarded the test page", pages.hasNext());
                out = pages.next();
                assertFalse("The crawler has more than the test page", pages.hasNext());
                db.close();
View Full Code Here

            server = new WebServer(8087);
            server.addResourceHandler("/", tmpDir+"/web");
            server.start();

            PageDB db = new PageDB(tmpDir+"/testdb");
            db.open(PageDB.WRITE);
            db.addPage(in);
            db.close();

            crawler = new Crawler();
View Full Code Here

            crawler = new Crawler();

            crawler.crawl(2);

            db.open(PageDB.READ);
            Iterator<Page> pages = db.iterator();
            assertTrue("The crawler lost or discarded all test pages", pages.hasNext());
            one = pages.next();
            assertTrue("The crawler lost or discarded the second test page", pages.hasNext());
            two = pages.next();
View Full Code Here

        Page page1 = PageTest.randomPage();
        page1.setUrl("http://example.com/test0=0");
        IRemotePageCatcher stubCatcher= distributor.getCatcher(page1);
        stubCatcher.addPage(page1);
        PageDB db = localCatcher.getCatch();
        db.open(PageDB.READ);
        Iterator<Page> pages = db.iterator();
        assertTrue("The page sent through rmi did not survive the adventure.", pages.hasNext());
        Page page2 = pages.next();
        assertTrue("The page has been changed by the trip through rmi:\n  1: "+page1+"\n  2: "+page2, page1.equals(page2));
        assertFalse("Sent one page through rmi and more than one came out the other end.", pages.hasNext());
View Full Code Here

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware@gmail.com.