Package org.apache.nutch.net

Examples of org.apache.nutch.net.URLFilters


   */
  public void setConf(Configuration conf) {
    this.conf = conf;
    this.parserFactory = new ParserFactory(conf);
    this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
    this.filters = new URLFilters(conf);
    this.defaultEncoding =
      conf.get("parser.character.encoding.default", "windows-1252");
  }
View Full Code Here


    protected void setup(Context context) throws IOException, InterruptedException {
      urlNormalizers = new URLNormalizers(context.getConfiguration(),
        URLNormalizers.SCOPE_INJECT);
      interval = context.getConfiguration().getInt("db.fetch.interval.default",
        2592000);
      filters = new URLFilters(context.getConfiguration());
      scfilters = new ScoringFilters(context.getConfiguration());
      scoreInjected = context.getConfiguration().getFloat("db.score.injected",
        1.0f);
      curTime = context.getConfiguration().getLong("injector.current.time",
        System.currentTimeMillis());
View Full Code Here

      maxCount = job.getInt(GENERATOR_MAX_COUNT, -1);
      if (maxCount==-1){
        byDomain = false;
      }
      if (GENERATOR_COUNT_VALUE_DOMAIN.equals(job.get(GENERATOR_COUNT_MODE))) byDomain = true;
      filters = new URLFilters(job);
      normalise = job.getBoolean(GENERATOR_NORMALISE, true);
      if (normalise) normalizers = new URLNormalizers(job,
          URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
      scfilters = new ScoringFilters(job);
      partitioner.configure(job);
View Full Code Here

      if (normalize) {
        urlNormalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
      }

      if (filter) {
        filters = new URLFilters(conf);
      }
    }
View Full Code Here

TOP

Related Classes of org.apache.nutch.net.URLFilters

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.