Package com.flaptor.hounder.crawler.pagedb

Examples of com.flaptor.hounder.crawler.pagedb.PageDB
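
Before the extracted snippets, here is a minimal sketch of the read/copy pattern most of them share. It is not taken from the Hounder sources: the class name and directory names are placeholders, and Page is assumed to live in the same pagedb package; the PageDB calls and open-mode flags are the ones that appear in the snippets below.

    import com.flaptor.hounder.crawler.pagedb.Page;
    import com.flaptor.hounder.crawler.pagedb.PageDB;

    public class PageDBCopyExample {
        public static void main(String[] args) throws Exception {
            PageDB source = new PageDB("pagedb");        // existing pagedb directory (placeholder)
            PageDB dest = new PageDB("pagedb.copy");     // destination directory (placeholder)
            source.open(PageDB.READ);
            dest.open(PageDB.WRITE + PageDB.UNSORTED);   // write mode, no sorting on close
            for (Page page : source) {                   // a PageDB can be iterated page by page
                dest.addPage(page);
            }
            source.close();
            dest.close();
        }
    }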


    /** Copies every page of the named PageDB into a new "&lt;name&gt;.test" db, exercising the page filter and reporting throughput. */
    public void test(String name) throws Exception {
        Crawler crawler = new Crawler();
        Config config = Config.getConfig("crawler.properties");
        UrlPatterns hotspots = new UrlPatterns(config.getString("hotspot.file"));
        PageFilter pageFilter = new PageFilter(0, new int[]{3}, 0L, 0L, 0L);
        PageDB destPageDB = new PageDB(name+".test");
        destPageDB.open(PageDB.WRITE+PageDB.UNSORTED);
        PageDB pdb = new PageDB(name);
        pdb.open(PageDB.READ);
        long count = 0;
        long start = System.currentTimeMillis();
        for (Page p : pdb) {
            String url = p.getUrl();
            pageFilter.shouldWrite(destPageDB, p); // note: the filter's verdict is not used here; the page is added regardless
//            hotspots.match(url);
//            Crawler.urlFilter(url);
            destPageDB.addPage(p);
            if (++count % 100000 == 0) {
                long now = System.currentTimeMillis();
                System.out.println("TRIMMING: "+count+" pages at "+((1000*count)/(now-start))+" pages/sec");
            }
        }
        pdb.close();
        destPageDB.close();
    }
View Full Code Here


            // Refresh: re-process every page already stored in the file cache, without generating a new pagedb.
            if (null != fileCache) {
                try {
                    ModulesManager.getInstance().unloadModule(cacheModule);
                    hasCache = true;
                    long seen = 0;
                    PageDB pagedb = new PageDB(pagedbDir);
                    long total = pagedb.getSize();
                    declareStartCycle(pagedb);
                    PageCache pageCache = new PageCache(pagedb, fileCache);
                    FetchdataProcessor processor = new FetchdataProcessor();
                    processor.processFetchdata(pageCache, pagedb, new NoPageDB());
                    pagedb.close();
                    declareEndCycle(pagedb); // Note: normally this would be the newPageDB, but in a refresh there is no such thing.
                    ModulesManager.getInstance().applyCommand("optimize");
                } catch (IOException e) {
                    logger.error(e,e);
                }
View Full Code Here

    public void redistribute () {
        if (!distributed) {
            logger.error("Can't redistribute a non-distributed PageDB");
        } else {
            try {
                PageDB oldPageDB = new PageDB(pagedbDir);
                DPageDB newPageDB = new DPageDB(pagedbDir+".new");
                oldPageDB.open(PageDB.READ);
                newPageDB.open(DPageDB.WRITE + DPageDB.UNSORTED);
                long total = oldPageDB.getSize();
                long done = 0;
                for (Page page : oldPageDB) {
                    newPageDB.addPage(page);
                    if (++done % 10000 == 0) {
                        logger.info("Redistributed "+done+" of "+total+" pages.");
                    }
                }
                oldPageDB.close();
                newPageDB.close();
                String oldName = oldPageDB.getDir();
                PageDB tmpPageDB = new PageDB(pagedbDir+".tmp");
                tmpPageDB.deleteDir(false);
                if (attempt(oldPageDB.rename(tmpPageDB.getDir()), "renaming pagedb -> pagedb.tmp")) {
                    if (attempt(newPageDB.rename(oldName), "renaming pagedb.new -> pagedb")) {
                        if (attempt(tmpPageDB.deleteDir(false), "deleting pagedb.tmp")) {
                            logger.info("Done redistributing.");
                        }
                    }
                }
            } catch (IOException e) {
View Full Code Here
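
The pagedb / pagedb.tmp / pagedb.new directory rotation in redistribute() reappears at the end of the crawl cycle further down. Stripped of the attempt()/logging wrapper, the sequence is roughly the following sketch; it is an assumed standalone form, the directory names are placeholders, and only the rename/deleteDir/getDir calls visible in these snippets are used.

    private void swapPageDBs() throws Exception {
        PageDB current = new PageDB("pagedb");       // the db being replaced (placeholder path)
        PageDB fresh = new PageDB("pagedb.new");     // the db that replaces it (placeholder path)
        PageDB tmp = new PageDB("pagedb.tmp");       // scratch name used during the swap
        String currentName = current.getDir();
        tmp.deleteDir(false);                        // make sure the scratch name is free
        if (current.rename(tmp.getDir())) {          // pagedb -> pagedb.tmp
            if (fresh.rename(currentName)) {         // pagedb.new -> pagedb
                tmp.deleteDir(false);                // drop the old data
            }
        }
    }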

     * Delete all temporary files and directories.
     * Useful for cleanup after testing.
     */
    public void cleanup() {
        CrawlerProgress.cleanup();
        PageDB tmpPageDB;
        if (distributed) {
            tmpPageDB = new DPageDB(pagedbDir + ".tmp", pageCatcher);
        } else {
            tmpPageDB = new PageDB(pagedbDir + ".tmp");
        }
        PageDB newPageDB = new PageDB(pagedbDir + ".new");
        tmpPageDB.deleteDir(true);
        newPageDB.deleteDir(true);
    }
View Full Code Here

                // start queues
                if (fetchlistQueue.isClosed()) fetchlistQueue.reset();
                if (fetchdataQueue.isClosed()) fetchdataQueue.reset();
               
                // run crawl cycle
                PageDB newPageDB = runSingleCrawlCycle(processor, createNewPageDB);

                logger.info("Finished crawl cycle");

                cyclesCounter++;
                if (cycles == 0 || (cycles > 0 && cycles == cyclesCounter)) {
                    stopCrawler();
                }

                // TODO: this should be done by the Hounder indexer, not the crawler.
                if (running()) {
                    if (shouldOptimizeAfterCycle(newPageDB.getCycles())) {
                        processor.optimizeIndex();
                    }
                }
            }
View Full Code Here

        // start the fetch server
        FetchServer fetchserver = new FetchServer(fetcher, this);
        fetchserver.start();

        // prepare the pagedbs
        PageDB oldPageDB = new PageDB(pagedbDir);
        PageDB tmpPageDB;
        PageDB newPageDB;
        if (createNewPageDB) {
            if (distributed) {
                tmpPageDB = new DPageDB(pagedbDir + ".tmp", pageCatcher);
//                if (starting) {
//                    logger.info("Waiting for other nodes to start...");
//                    ((DPageDB)tmpPageDB).synch();
//                    logger.info("All nodes started");
//                    starting = false;
//                }
            } else {
                tmpPageDB = new PageDB(pagedbDir + ".tmp");
            }
            newPageDB = new PageDB(pagedbDir + ".new");
        } else {
            tmpPageDB = new NoPageDB();
            newPageDB = new NoPageDB();
        }
        // delete leftover new pagedb
        newPageDB.deleteDir(false);
        // open the old pagedb for reading
        oldPageDB.open(PageDB.READ);

        // Crawl recovery attempt
        boolean skipFetch = false;
        long skip = 0;
       
        progress = CrawlerProgress.restartCrawlerProgress();
        if (null != progress) {
            // the previous cycle was interrupted
            switch (progress.stage()) {
                case CrawlerProgress.START:
                case CrawlerProgress.STOP:
                    logger.info("Last crawler state is either before starting or after finishing, will start next cycle.");
                    progress = null;
                    break;
                case CrawlerProgress.FETCH:
                    if (progress.cycle() == oldPageDB.getCycles() + 1) {
                        skip = progress.processed();
                        logger.info("Crawler was interrupted while fetching at cycle "+progress.cycle()+", will continue current cycle skipping "+skip+" docs.");
                        tmpPageDB.open(PageDB.WRITE + PageDB.APPEND);
                    } else {
                        logger.info("Last crawler report inconsistent with pagedb state, will restart.");
                        progress = null;
                    }
                    break;
                case CrawlerProgress.SORT:
                    // fall through
                case CrawlerProgress.MERGE:
                    logger.info("Crawler was interrupted while sorting at cycle "+progress.cycle()+", will continue current cycle.");
                    tmpPageDB.open(PageDB.WRITE + PageDB.APPEND); // this will force a sort upon closing
                    // fall through
                case CrawlerProgress.TRIM:
                    if (progress.stage() == CrawlerProgress.TRIM) {
                        logger.info("Crawler was interrupted while trimming at cycle "+progress.cycle()+", will continue current cycle.");
                    }
                    skipFetch = true;
                    break;
                default:
                    logger.error("Unknown crawler state report, will restart.");
                    progress = null;
                    break;
            }
        }
        if (null == progress) {
            // there was no interrupted previous cycle or it was inconsistent
            tmpPageDB.deleteDir(false);
            tmpPageDB.open(PageDB.WRITE);
            tmpPageDB.setNextCycleOf(oldPageDB);
            progress = new CrawlerProgress(tmpPageDB.getCycles());
        }
       
        tmpPageDB.setProgressHandler(progress);
        if (!skipFetch) {
            if (0 == skip) {
                progress.startFetch(oldPageDB.getSize(), oldPageDB.getFetchedSize());
                logger.info("Starting crawl cycle " + (oldPageDB.getCycles()+1));
            } else {
                logger.info("Continuing crawl cycle " + (oldPageDB.getCycles()+1));
            }
            declareStartCycle(oldPageDB);

            FetchlistQueueMonitor fetchlistFeeder = new FetchlistQueueMonitor(oldPageDB, tmpPageDB, skip);
            InjectedFetchlistQueueMonitor injectedFetchlistFeeder = new InjectedFetchlistQueueMonitor(tmpPageDB);
            FetchdataQueueMonitor fetchdataConsumer = new FetchdataQueueMonitor(oldPageDB, tmpPageDB, processor);
            cycleFinished = false;
            fetchlistFeeder.start();
            injectedFetchlistFeeder.start();
            fetchdataConsumer.start();

            // This is where the main thread spends its time while the crawl cycle takes place.
            // Wait until the fetchlist and fetchdata threads are done
            synchronized(cycleFinishedMonitor) {
                while (running() && !cycleFinished) {
                    logger.debug("Waiting: running="+running()+" cycleFinished="+cycleFinished+" fetchList="+fetchlistQueue.size()+" injectedFetchList="+injectedFetchlistQueue.size()+" fetchData="+fetchdataQueue.size());
                    cycleFinishedMonitor.wait(60000); // wake up every minute or when the cycle finishes
                }
            }
            logger.debug("Waiting no more: running="+running()+" cycleFinished="+cycleFinished+" fetchList="+fetchlistQueue.size()+" injectedFetchList="+injectedFetchlistQueue.size()+" fetchData="+fetchdataQueue.size());
        }
       
        if (running()) {
            progress.report();
            logger.debug("Closing old and temporary pagedbs");
            oldPageDB.close();
            tmpPageDB.close();
            logger.debug("Old and temporary pagedbs closed");

            if (createNewPageDB) {
                // dedup & trim
                new PageDBTrimmer().trimPageDB(tmpPageDB, newPageDB, progress);

                // check the trimmed pagedb size
                if (protectAgainstEmptyPageDB && newPageDB.getSize() == 0) {
                    logger.error("The new PageDB is empty, will stop the crawler before replacing the old PageDB. Please check the hotspots, modules and other settings before restarting.");
                    stopCrawler();
                }
               
                if (running()) {
                    boolean ok = false;
                    String oldName = oldPageDB.getDir();
                    if (attempt(tmpPageDB.deleteDir(false), "deleting pagedb.tmp")) {
                        if (attempt(oldPageDB.rename(tmpPageDB.getDir()), "renaming pagedb -> pagedb.tmp")) {
                            if (attempt(newPageDB.rename(oldName), "renaming pagedb.new -> pagedb")) {
                                if (attempt(tmpPageDB.deleteDir(false), "deleting pagedb.tmp (2)")) {
                                    ok = true;
                                }
                            }
                        }
View Full Code Here

                }

                // There is an injected pagedb and no fetchlist factory is consuming it yet.
                if (injectedPageDB.exists() && (null == factory)) {
                    try {
                        PageDB pageDB = new PageDB(injectedPagedbDir);
                        pageDB.open(PageDB.READ);
                        factory = new FetchlistFactory(pageDB,tmpPageDB,progress);

                        FetchList fetchlist = factory.getNextFetchlist();
                        while (null != fetchlist && running() && !cycleFinished) {
                            injectedFetchlistQueue.enqueueNoBlock(fetchlist);
                            fetchlist = factory.getNextFetchlist();
                        }

                        // if the pageDB is exhausted, we can delete it
                        if (null == fetchlist) {
                            // End with this pageDB
                            logger.debug("Injected pagedb exhausted. Deleting it");
                            pageDB.close();
                            pageDB.deleteDir(true);
                            // Discard previous factory
                            factory = null;
                        }
                    } catch (IOException e) {
                        logger.error(e,e);
View Full Code Here

        } else if ("delete".equals(command.toString())) {
            FetchDocument doc = ((CommandWithDoc)command).getDoc();
            Page page = doc.getPage();
            deleteFromIndex(page);
        } else if ("startCycle".equals(command.toString())) {
            PageDB pagedb = ((CommandWithPageDB)command).getPageDB();
            scoreThreshold = new float[11];
            for (int i = 0; i < 11; i++) {
                scoreThreshold[i] = pagedb.getScoreThreshold(i*10);
            }
            antiScoreThreshold = new float[11];
            for (int i = 0; i < 11; i++) {
                antiScoreThreshold[i] = pagedb.getAntiScoreThreshold(i*10);
            }           
        }
    }
View Full Code Here

        preparePageDB (web, startPage);
    }


    private long pageDBSize () throws IOException {
        PageDB db = new PageDB(tmpDir+"/testdb");
        long size = db.getSize();
        db.close();
        return size;
    }
View Full Code Here

    private Set<String> pageDBlist () throws Exception {
        PageDB db = new PageDB(tmpDir+"/testdb");
        db.open(PageDB.READ);
        Set<String> res = new HashSet<String>();
        for (Page page : db) {
            String url = page.getUrl();
            // extract the numeric id embedded in the url (from the first digit through the last digit)
            String[] part = url.split("[0-9]");
            int start = part[0].length();
            int end = url.length() - part[part.length-1].length();
            String id = url.substring(start,end);
            res.add(id);
        }
        db.close();
        return res;
    }
View Full Code Here
