Examples of ScoringFilters


Examples of org.apache.nutch.scoring.ScoringFilters

  public RecordWriter getRecordWriter(FileSystem fs, JobConf job,
                                      String name, Progressable progress) throws IOException {

    this.urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
    this.filters = new URLFilters(job);
    this.scfilters = new ScoringFilters(job);
    final float interval = job.getFloat("db.default.fetch.interval", 30f);
    final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
   
    Path text =
      new Path(new Path(job.getOutputPath(), ParseText.DIR_NAME), name);
View Full Code Here

Examples of org.apache.nutch.scoring.ScoringFilters

    super(conf);
  }

  public void configure(JobConf job) {
    setConf(job);
    this.scfilters = new ScoringFilters(job);
  }
View Full Code Here

Examples of org.apache.nutch.scoring.ScoringFilters

    private Text url = new Text();

    @Override
    public void configure(JobConf job) {
      super.configure(job);
      scfilters = new ScoringFilters(job);
      if (job.getBoolean(FILTER_KEY, false)) {
        filters = new URLFilters(job);
      }
      if (job.getBoolean(NORMALIZE_KEY, false)) {
        normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
View Full Code Here

Examples of org.apache.nutch.scoring.ScoringFilters

  // Scoring filters; null until configure(JobConf) assigns an instance.
  private ScoringFilters scfilters = null;
  // Set in configure(JobConf) from CrawlDb.CRAWLDB_ADDITIONS_ALLOWED (true when unset).
  private boolean additionsAllowed;

  public void configure(JobConf job) {
    retryMax = job.getInt("db.fetch.retry.max", 3);
    scfilters = new ScoringFilters(job);
    additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
  }
View Full Code Here

Examples of org.apache.nutch.scoring.ScoringFilters

    public FetcherThread(Configuration conf) {
      this.setDaemon(true);                       // don't hang JVM on exit
      this.setName("FetcherThread");              // use an informative name
      this.conf = conf;
      this.urlFilters = new URLFilters(conf);
      this.scfilters = new ScoringFilters(conf);
      this.parseUtil = new ParseUtil(conf);
      this.protocolFactory = new ProtocolFactory(conf);
      this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
    }
View Full Code Here

Examples of org.apache.nutch.scoring.ScoringFilters

      limit = job.getLong(CRAWL_TOP_N,Long.MAX_VALUE)/job.getNumReduceTasks();
      maxPerHost = job.getInt(GENERATE_MAX_PER_HOST, -1);
      byIP = job.getBoolean(GENERATE_MAX_PER_HOST_BY_IP, false);
      filters = new URLFilters(job);
      normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
      scfilters = new ScoringFilters(job);
      hostPartitioner.configure(job);
      filter = job.getBoolean(CRAWL_GENERATE_FILTER, true);
      genDelay = job.getLong(CRAWL_GEN_DELAY, 7L) * 3600L * 24L * 1000L;
      long time = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);
      if (time > 0) genTime.set(time);
View Full Code Here

Examples of org.apache.nutch.scoring.ScoringFilters

  }
 
  /**
   * Configures this instance for a job: passes the job to {@code setConf} and
   * then builds the indexing and scoring filters from {@code getConf()}.
   *
   * @param job the job configuration handed to {@code setConf}
   */
  public void configure(JobConf job) {
    setConf(job);
    // Both filter sets are built from getConf() rather than from job directly,
    // so these lines must stay after setConf(job).
    // NOTE(review): presumably getConf() returns the configuration stored by
    // setConf — confirm against the enclosing class.
    this.filters = new IndexingFilters(getConf());
    this.scfilters = new ScoringFilters(getConf());
  }
View Full Code Here

Examples of org.apache.nutch.scoring.ScoringFilters

    public FetcherThread(Configuration conf) {
      this.setDaemon(true);                       // don't hang JVM on exit
      this.setName("FetcherThread");              // use an informative name
      this.conf = conf;
      this.urlFilters = new URLFilters(conf);
      this.scfilters = new ScoringFilters(conf);
      this.parseUtil = new ParseUtil(conf);
      this.protocolFactory = new ProtocolFactory(conf);
      this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
      this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
      // backward-compatible default setting
View Full Code Here

Examples of org.apache.nutch.scoring.ScoringFilters

    public void configure(JobConf job) {
      this.jobConf = job;
      urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
      interval = jobConf.getFloat("db.default.fetch.interval", 30f);
      filters = new URLFilters(jobConf);
      scfilters = new ScoringFilters(jobConf);
      scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
      curTime = job.getLong("injector.current.time", System.currentTimeMillis());
    }
View Full Code Here

Examples of org.apache.nutch.scoring.ScoringFilters

  private CrawlDatum fetch = new CrawlDatum();
  private CrawlDatum old = new CrawlDatum();

  public void configure(JobConf job) {
    retryMax = job.getInt("db.fetch.retry.max", 3);
    scfilters = new ScoringFilters(job);
    additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
    int oldMaxInterval = job.getInt("db.max.fetch.interval", 0);
    maxInterval = job.getInt("db.fetch.interval.max", 0 );
    if (oldMaxInterval > 0 && maxInterval == 0) maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY;
    schedule = FetchScheduleFactory.getFetchSchedule(job);
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.