Examples of URLFilters


Examples of org.apache.nutch.net.URLFilters

      maxCount = job.getInt(GENERATOR_MAX_COUNT, -1);
      if (maxCount==-1){
        byDomain = false;
      }
      if (GENERATOR_COUNT_VALUE_DOMAIN.equals(job.get(GENERATOR_COUNT_MODE))) byDomain = true;
      filters = new URLFilters(job);
      normalise = job.getBoolean(GENERATOR_NORMALISE, true);
      if (normalise) normalizers = new URLNormalizers(job,
          URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
      scfilters = new ScoringFilters(job);
      partitioner.configure(job);
View Full Code Here

Examples of org.apache.nutch.net.URLFilters

 
  public void setConf(Configuration conf) {
    super.setConf(conf);
    if (conf == null) return;
    if (conf.getBoolean("segment.merger.filter", false))
      filters = new URLFilters(conf);
    sliceSize = conf.getLong("segment.merger.slice", -1);
    if ((sliceSize > 0) && (LOG.isInfoEnabled())) {
      LOG.info("Slice size: " + sliceSize + " URLs.");
    }
  }
View Full Code Here

Examples of org.apache.nutch.net.URLFilters

    public void configure(JobConf job) {
      curTime = job.getLong("crawl.gen.curTime", System.currentTimeMillis());
      limit = job.getLong("crawl.topN",Long.MAX_VALUE)/job.getNumReduceTasks();
      maxPerHost = job.getInt("generate.max.per.host", -1);
      byIP = job.getBoolean("generate.max.per.host.by.ip", false);
      filters = new URLFilters(job);
      scfilters = new ScoringFilters(job);
    }
View Full Code Here

Examples of org.apache.nutch.net.URLFilters

   
    public void configure(JobConf job) {
      super.configure(job);
      _maxInlinks = job.getInt("db.max.inlinks", 10000);
      if (job.getBoolean("linkdb.merger.urlfilters", false)) {
        filters = new URLFilters(job);
      }
    }
View Full Code Here

Examples of org.apache.nutch.net.URLFilters

    public void close() throws IOException {}

    public void configure(JobConf conf) {
      if (conf.getBoolean("crawldb.merger.urlfilters", false))
        filters = new URLFilters(conf);
    }
View Full Code Here

Examples of org.apache.nutch.net.URLFilters

 
  public void setConf(Configuration conf) {
    super.setConf(conf);
    if (conf == null) return;
    if (conf.getBoolean("segment.merger.filter", false)) {
      filters = new URLFilters(conf);
      mergeFilters = new SegmentMergeFilters(conf);
    }
    if (conf.getBoolean("segment.merger.normalizer", false))
      normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
    sliceSize = conf.getLong("segment.merger.slice", -1);
View Full Code Here

Examples of org.apache.nutch.net.URLFilters

   */
  public void setConf(Configuration conf) {
    this.conf = conf;
    this.parserFactory = new ParserFactory(conf);
    this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
    this.filters = new URLFilters(conf);
    this.defaultEncoding =
      conf.get("parser.character.encoding.default", "windows-1252");
  }
View Full Code Here

Examples of org.apache.nutch.net.URLFilters

 
  public void configure(JobConf job) {
    filter = job.getBoolean(URL_FILTERING, false);
    normalize = job.getBoolean(URL_NORMALIZING, false);
    if (filter) {
      filters = new URLFilters(job);
    }
    if (normalize) {
      scope = job.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_LINKDB);
      normalizers = new URLNormalizers(job, scope);
    }
View Full Code Here

Examples of org.apache.nutch.net.URLFilters

  public void configure(JobConf job) {
    urlFiltering = job.getBoolean(URL_FILTERING, false);
    urlNormalizers = job.getBoolean(URL_NORMALIZING, false);
    if (urlFiltering) {
      filters = new URLFilters(job);
    }
    if (urlNormalizers) {
      scope = job.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_CRAWLDB);
      normalizers = new URLNormalizers(job, scope);
    }
View Full Code Here

Examples of org.apache.nutch.net.URLFilters

 
  public void configure(JobConf job) {
    maxAnchorLength = job.getInt("db.max.anchor.length", 100);
    ignoreInternalLinks = job.getBoolean("db.ignore.internal.links", true);
    if (job.getBoolean(LinkDbFilter.URL_FILTERING, false)) {
      urlFilters = new URLFilters(job);
    }
    if (job.getBoolean(LinkDbFilter.URL_NORMALIZING, false)) {
      urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_LINKDB);
    }
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.