Package org.apache.nutch.scoring

Examples of org.apache.nutch.scoring.ScoringFilters


    currentJob = new NutchJob(getConf(), "update-table");
    if (crawlId != null) {
      currentJob.getConfiguration().set(Nutch.CRAWL_ID_KEY, crawlId);
    }
    //job.setBoolean(ALL, updateAll);
    ScoringFilters scoringFilters = new ScoringFilters(getConf());
    HashSet<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS);
    fields.addAll(scoringFilters.getFields());
   
    // Partition by {url}, sort by {url,score} and group by {url}.
    // This ensures that the inlinks are sorted by score when they enter
    // the reducer.
   
View Full Code Here


      urlNormalizers = new URLNormalizers(context.getConfiguration(),
          URLNormalizers.SCOPE_INJECT);
      interval = context.getConfiguration().getInt("db.fetch.interval.default",
          2592000);
      filters = new URLFilters(context.getConfiguration());
      scfilters = new ScoringFilters(context.getConfiguration());
      scoreInjected = context.getConfiguration().getFloat("db.score.injected",
          1.0f);
      curTime = context.getConfiguration().getLong("injector.current.time",
          System.currentTimeMillis());
    }
View Full Code Here

    normalizers =
      new URLNormalizers(conf, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
    filter = conf.getBoolean(GeneratorJob.GENERATOR_FILTER, true);
    normalise = conf.getBoolean(GeneratorJob.GENERATOR_NORMALISE, true);
    schedule = FetchScheduleFactory.getFetchSchedule(conf);
    scoringFilters = new ScoringFilters(conf);
  }
View Full Code Here

    Configuration conf = context.getConfiguration();
    retryMax = conf.getInt("db.fetch.retry.max", 3);
    additionsAllowed = conf.getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
    maxInterval = conf.getInt("db.fetch.interval.max", 0 );
    schedule = FetchScheduleFactory.getFetchSchedule(conf);
    scoringFilters = new ScoringFilters(conf);
    maxLinks = conf.getInt("db.update.max.inlinks", 10000);
  }
View Full Code Here

    }
  }

  @Override
  public void setup(Context context) {
    scoringFilters = new ScoringFilters(context.getConfiguration());
    pageWritable = new WebPageWritable(context.getConfiguration(), null);
  }
View Full Code Here

    public FetcherThread(Configuration conf) {
      this.setDaemon(true);                       // don't hang JVM on exit
      this.setName("FetcherThread");              // use an informative name
      this.conf = conf;
      this.urlFilters = new URLFilters(conf);
      this.scfilters = new ScoringFilters(conf);
      this.parseUtil = new ParseUtil(conf);
      this.protocolFactory = new ProtocolFactory(conf);
      this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
      this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
      this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", false);
View Full Code Here

      // signature and link crawldatums.

      this.urlNormalizers =
        new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
      this.filters = new URLFilters(job);
      this.scfilters = new ScoringFilters(job);

      final float interval =
        job.getFloat("db.default.fetch.interval", 30f);
      final boolean ignoreExternalLinks =
        job.getBoolean("db.ignore.external.links", false);
View Full Code Here

    public void configure(JobConf job) {
      this.jobConf = job;
      urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
      interval = jobConf.getFloat("db.default.fetch.interval", 30f);
      filters = new URLFilters(jobConf);
      scfilters = new ScoringFilters(jobConf);
      scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
      curTime = job.getLong("injector.current.time", System.currentTimeMillis());
    }
View Full Code Here

  }
 
  public void configure(JobConf job) {
    setConf(job);
    this.filters = new IndexingFilters(getConf());
    this.scfilters = new ScoringFilters(getConf());   
    this.collectionType = job.get(Global.COLLECTION_TYPE);  
  }
View Full Code Here

    @Override
    public void configure(JobConf job) {
      super.configure(job);
      defaultInterval = job.getInt("db.fetch.interval.default", 0);
      scfilters = new ScoringFilters(job);
      if (job.getBoolean(FILTER_KEY, false)) {
        filters = new URLFilters(job);
      }
      if (job.getBoolean(NORMALIZE_KEY, false)) {
        normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
View Full Code Here

TOP

Related Classes of org.apache.nutch.scoring.ScoringFilters

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.