Package cn.edu.hfut.dmic.webcollector.model

Examples of cn.edu.hfut.dmic.webcollector.model.CrawlDatum


                    }
                    continue;
                }
                // Pull up to `feed` tasks from the generator into the fetch queue.
                // `hasMore` goes false as soon as the generator is exhausted.
                while (feed > 0 && hasMore) {

                    CrawlDatum datum = generator.next();
                    hasMore = (datum != null);

                    if (hasMore) {
                        // Wrap the crawl task and enqueue it; one unit of feed budget consumed.
                        queue.addFetchItem(new FetchItem(datum));
                        feed--;
View Full Code Here


                            }
                        }

                        // Record when this request started (used elsewhere for politeness/timeout checks).
                        lastRequestStart.set(System.currentTimeMillis());

                        // Build a fresh CrawlDatum describing the URL we are about to fetch.
                        CrawlDatum crawldatum = new CrawlDatum();
                        String url = item.datum.getUrl();
                        crawldatum.setUrl(url);

                        Request request = requestFactory.createRequest(url);
                        Response response = null;

                        // Attempt the request up to (retry + 1) times; stop on first success.
                        for (int i = 0; i <= retry; i++) {
                            if (i > 0) {
                                LogUtils.getLogger().info("retry " + i + "th " + url);
                            }
                            try {
                                response = request.getResponse(crawldatum);
                                break;
                            } catch (Exception ex) {
                                // NOTE(review): exception is swallowed so the loop can retry,
                                // but the failure cause is never logged — consider logging `ex`.
                            }
                        }

                        // NOTE(review): status is set to FETCHED even when every attempt failed
                        // (response == null); the failure branch below only reports the failure.
                        crawldatum.setStatus(CrawlDatum.STATUS_DB_FETCHED);
                        crawldatum.setFetchTime(System.currentTimeMillis());

                        Page page = new Page();
                        page.setUrl(url);
                        page.setFetchTime(crawldatum.getFetchTime());

                        if (response == null) {
                            // All retries exhausted: notify the handler and skip further processing.
                            LogUtils.getLogger().info("failed " + url);
                            HandlerUtils.sendMessage(handler, new Message(Fetcher.FETCH_FAILED, page), true);
                            continue;
View Full Code Here

        if(inject_file.exists())
            writer=new DbWriter<CrawlDatum>(CrawlDatum.class,inject_file,append);
        else
            writer=new DbWriter<CrawlDatum>(CrawlDatum.class,inject_file,false);
        for(String url:urls){
            CrawlDatum crawldatum=new CrawlDatum();
            crawldatum.setUrl(url);
            crawldatum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
            writer.write(crawldatum)
            LogUtils.getLogger().info("inject "+url);
        }
        writer.close();
       
View Full Code Here

        iterator=data.iterator();
    }
   
    public void addUrls(Collection<String> urls){
        for(String url:urls){
            CrawlDatum crawldatum=new CrawlDatum();
            crawldatum.setUrl(url);
            crawldatum.setFetchTime(CrawlDatum.FETCHTIME_UNDEFINED);
            crawldatum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
            data.add(crawldatum);
        }
        iterator=data.iterator();
    }
View Full Code Here

        iterator=data.iterator();
    }
   
    public void addUrl(String url){
      
        CrawlDatum crawldatum=new CrawlDatum();
        crawldatum.setUrl(url);
        crawldatum.setFetchTime(CrawlDatum.FETCHTIME_UNDEFINED);
        crawldatum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
        data.add(crawldatum);
        iterator=data.iterator();
    }
View Full Code Here

        // No more records in the underlying DB: signal end of iteration.
        if (!dbreader.hasNext()) {
            return null;
        }

        CrawlDatum crawldatum = dbreader.readNext();

        if (crawldatum == null) {
            return null;
        }

        // Filtered URLs are skipped by recursing to the next record.
        // NOTE(review): recursion depth equals the number of consecutive
        // filtered records — a long filtered run could overflow the stack.
        if (shouldFilter(crawldatum.getUrl())) {
            return next();
        }
        return crawldatum;
    }
View Full Code Here

        // Counters: total records, fetched records, unfetched records.
        int sum=0;
        int sum_fetched=0;
        int sum_unfetched=0;
       
       
        CrawlDatum crawldatum=null;

        // Scan the whole DB, printing each URL and tallying by status.
        System.out.println("start read:");
        while(reader.hasNext()){
            crawldatum=reader.readNext();
            System.out.println(crawldatum.getUrl());
            sum++;
            switch(crawldatum.getStatus()){
                case CrawlDatum.STATUS_DB_FETCHED:
                    sum_fetched++;
                    break;
                case CrawlDatum.STATUS_DB_UNFETCHED:
                    sum_unfetched++;
View Full Code Here

        // Merge three sources into one task list (datums_origin), deduplicated by URL:
        //   1. the current crawl DB (reader_current),
        //   2. the fetch results of this segment (reader_fetch),
        //   3. new links discovered during parsing (reader_parse).
        DbReader<CrawlDatum> reader_fetch = new DbReader<CrawlDatum>(CrawlDatum.class, file_fetch);

        // Maps a URL to its index in datums_origin, for in-place replacement.
        HashMap<String, Integer> indexmap = new HashMap<String, Integer>();

        ArrayList<CrawlDatum> datums_origin = new ArrayList<CrawlDatum>();
        CrawlDatum datum = null;
        // 1) Load every record from the current DB.
        while (reader_current.hasNext()) {
            datum = reader_current.readNext();
            datums_origin.add(datum);
            indexmap.put(datum.getUrl(), datums_origin.size() - 1);
        }

        // 2) Overlay fetch results: a FETCHED record replaces the existing entry
        //    for the same URL; an UNFETCHED duplicate is ignored.
        while (reader_fetch.hasNext()) {
            datum = reader_fetch.readNext();
            if (indexmap.containsKey(datum.getUrl())) {
                if (datum.getStatus() == CrawlDatum.STATUS_DB_UNFETCHED) {
                    continue;
                } else {
                    int preindex = indexmap.get(datum.getUrl());
                    datums_origin.set(preindex, datum);
                    // NOTE(review): this put is redundant — it re-stores the
                    // same (url, preindex) mapping that is already present.
                    indexmap.put(datum.getUrl(), preindex);
                }

            } else {
                datums_origin.add(datum);
                indexmap.put(datum.getUrl(), datums_origin.size() - 1);
            }

        }
        reader_fetch.close();

        // 3) Append newly discovered links as UNFETCHED tasks, skipping URLs
        //    already known from steps 1-2.
        File file_parse = new File(getSegmentPath(), "parse_data/info.avro");
        if (file_parse.exists()) {
            DbReader<ParseData> reader_parse = new DbReader<ParseData>(ParseData.class, file_parse);
            ParseData parseresult = null;
            while (reader_parse.hasNext()) {
                parseresult = reader_parse.readNext();
                for (Link link : parseresult.getLinks()) {
                    datum = new CrawlDatum();
                    datum.setUrl(link.getUrl());
                    datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
                    if (indexmap.containsKey(datum.getUrl())) {
                        continue;
                    } else {
                        datums_origin.add(datum);
                        indexmap.put(datum.getUrl(), datums_origin.size() - 1);
                    }
                }
            }
            reader_parse.close();
        }
View Full Code Here

     * @return the next crawl task whose fetch interval requirement is met, or null if no task qualifies
     */
    @Override
    public CrawlDatum next() {
        // Loop until we find a task eligible for (re)fetching, or run out of tasks.
        while(true){
        CrawlDatum crawldatum=generator.next();
       
         // Underlying generator exhausted.
         if(crawldatum==null){
            return null;
        }
        
       
        // Never-fetched tasks are always eligible.
        if(crawldatum.getStatus()==CrawlDatum.STATUS_DB_UNFETCHED){
            return crawldatum;
        }
        // interval == -1 means "never refetch": skip already-fetched tasks.
        if(Config.interval==-1){
            continue;
        }
      
        // Skip tasks whose refetch interval has not yet elapsed.
        // NOTE(review): `Long` here autoboxes needlessly; primitive long would do.
        Long lasttime=crawldatum.getFetchTime();
        if(lasttime+Config.interval>System.currentTimeMillis()){
            continue;
        }
        return crawldatum;
        }
View Full Code Here

     */

    @Override
    public CrawlDatum next() {
        // Deduplicating wrapper: skip URLs already seen in `hashset`,
        // recording new ones via addUrl before they are returned.
        while(true){
        CrawlDatum crawldatum=generator.next();
        if(crawldatum==null){
            return null;
        }
        String url=crawldatum.getUrl();
        if(hashset.contains(url)){
            continue;
        }
        else{
            addUrl(url);
View Full Code Here

TOP

Related Classes of cn.edu.hfut.dmic.webcollector.model.CrawlDatum

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.