Package cn.edu.hfut.dmic.webcollector.model

Examples of cn.edu.hfut.dmic.webcollector.model.CrawlDatum


     * @return the next crawl task that matches the regex rules, or null if no task matches the rules
     */
    @Override
    public CrawlDatum next() {
        while (true) {
            CrawlDatum crawldatum = generator.next();
            if (crawldatum == null) {
                return null;
            }
            String url = crawldatum.getUrl();
            int state=0;
            for (String nregex : negative) {
                if (Pattern.matches(nregex, url)) {
                   state=1;
                   break;
View Full Code Here


        File fetchFile = new File(getSegmentPath(), "fetch/info.avro");
        File parseFile = new File(getSegmentPath(), "parse_data/info.avro");

        DbReader<CrawlDatum> reader = new DbReader<CrawlDatum>(CrawlDatum.class, crawldbFile);
        while (reader.hasNext()) {
            CrawlDatum datum = reader.readNext();
            addToRedis(datum);
        }

        if (fetchFile.exists()) {
            reader = new DbReader<CrawlDatum>(CrawlDatum.class, fetchFile);
            while (reader.hasNext()) {
                CrawlDatum datum = reader.readNext();
                addToRedis(datum);
            }
        }

        reader.close();
        if (parseFile.exists()) {
            DbReader<ParseData> parseReader = new DbReader<ParseData>(ParseData.class, parseFile);
            while (parseReader.hasNext()) {
                ParseData parseData = parseReader.readNext();
                if (parseData.getLinks() == null) {
                    continue;
                }
                for (Link link : parseData.getLinks()) {
                    CrawlDatum datum = new CrawlDatum();
                    datum.setUrl(link.getUrl());
                    datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
                    datum.setFetchTime(CrawlDatum.FETCHTIME_UNDEFINED);
                    addToRedis(datum);
                }
            }
            parseReader.close();

        }

        DbWriter<CrawlDatum> writer = new DbWriter<CrawlDatum>(CrawlDatum.class, new File(getCrawlPath(), Config.current_info_path));

        Set set = jedis.hkeys(getCrawlPath());

        Iterator ite = set.iterator();
        while (ite.hasNext()) {
            String key = ite.next().toString();
            String value = jedis.hget(getCrawlPath(), key);
            int status = Integer.valueOf(value.charAt(0) + "");
            long fetchTime = Long.valueOf(value.substring(1));

            CrawlDatum datum = new CrawlDatum();
            datum.setUrl(key);
            datum.setStatus(status);
            datum.setFetchTime(fetchTime);
            writer.write(datum);
            if (writeCount.incrementAndGet() % 5000 == 0) {
                LogUtils.getLogger().info(writeCount.get() + " crawlDatum write to crawldb");
            }
            //LogUtils.getLogger().info("write "+datum.getUrl());
View Full Code Here

        //RedisHelper helper=new RedisHelper("test", "127.0.0.1",6379);
        //for(int i=0;i<100;i++)
        //    helper.inject("http://www.baidu.com"+i, false);
//helper.deleteTable();
        RedisGenerator g=new RedisGenerator("test", "127.0.0.1",6379);
        CrawlDatum d=null;
        while((d=g.next())!=null){
            System.out.println(d.getUrl());
        }
    }
View Full Code Here

        if(value==null)
            return null;
        int status = Integer.valueOf(value.charAt(0) + "");
        long fetchTime = Long.valueOf(value.substring(1));

        CrawlDatum datum = new CrawlDatum();
        datum.setUrl(key);
        datum.setStatus(status);
        datum.setFetchTime(fetchTime);
        return datum;
    }
View Full Code Here

        if (parseData == null || parseData.getLinks() == null) {
            return;
        }
        for (Link link : parseData.getLinks()) {
            if (jedis.hget(tableName+suffix_parse, link.getUrl()) == null) {
                CrawlDatum datum = new CrawlDatum();
                datum.setUrl(link.getUrl());
                datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
                datum.setFetchTime(CrawlDatum.FETCHTIME_UNDEFINED);
                jedis.hset(tableName+suffix_parse, datum.getUrl(), datum.getStatus() + "" + datum.getFetchTime());

            }
        }
    }
View Full Code Here

TOP

Related Classes of cn.edu.hfut.dmic.webcollector.model.CrawlDatum

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by ORACLE Inc. Contact coftware#gmail.com.