Examples of org.apache.nutch.scoring.ScoringFilters

org.apache.nutch.scoring.ScoringFilters
Creates and caches plugins that implement {@link ScoringFilter}. @author Andrzej Bialecki

  private int maxInterval;
  private FetchSchedule schedule;


  public void configure(JobConf job) {
    retryMax = job.getInt("db.fetch.retry.max", 3);
    scfilters = new ScoringFilters(job);
    additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
    int oldMaxInterval = job.getInt("db.max.fetch.interval", 0);
    maxInterval = job.getInt("db.fetch.interval.max", 0 );
    if (oldMaxInterval > 0 && maxInterval == 0) maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY;
    schedule = FetchScheduleFactory.getFetchSchedule(job);

View Full Code Here

  private URLFilters urlFilters;


  public void configure(JobConf job) {
    setConf(job);
    this.filters = new IndexingFilters(getConf());
    this.scfilters = new ScoringFilters(getConf());
    this.delete = job.getBoolean(INDEXER_DELETE, false);
    this.deleteRobotsNoIndex = job.getBoolean(INDEXER_DELETE_ROBOTS_NOINDEX, false);
    this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);


    normalize = job.getBoolean(URL_NORMALIZING, false);

View Full Code Here

  public void testCrawlDbStatTransitionInject() {
    LOG.info("Test CrawlDatum states in Injector after inject");
    Configuration conf = CrawlDBTestUtil.createConfiguration();
    CrawlDbUpdateUtil<Injector.InjectReducer> inject = new CrawlDbUpdateUtil<Injector.InjectReducer>(
        new Injector.InjectReducer(), conf);
    ScoringFilters scfilters = new ScoringFilters(conf);
    for (String sched : schedules) {
      LOG.info("Testing inject with " + sched);
      conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl."+sched);
      FetchSchedule schedule = FetchScheduleFactory
          .getFetchSchedule(new JobConf(conf));
      List<CrawlDatum> values = new ArrayList<CrawlDatum>();
      for (int i = 0; i < fetchDbStatusPairs.length; i++) {
        byte fromDbStatus = fetchDbStatusPairs[i][1];
        byte toDbStatus = fromDbStatus;
        if (fromDbStatus == -1) {
          toDbStatus = STATUS_DB_UNFETCHED;
        } else {
          CrawlDatum fromDb = new CrawlDatum();
          fromDb.setStatus(fromDbStatus);
          schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fromDb);
          values.add(fromDb);
        }
        LOG.info("inject "
            + (fromDbStatus == -1 ? "<not in CrawlDb>" : CrawlDatum
                .getStatusName(fromDbStatus)) + " + "
            + getStatusName(STATUS_INJECTED) + " => "
            + getStatusName(toDbStatus));
        CrawlDatum injected = new CrawlDatum(STATUS_INJECTED,
            conf.getInt("db.fetch.interval.default", 2592000), 0.1f);
        schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, injected);
        try {
          scfilters.injectedScore(CrawlDbUpdateUtil.dummyURL, injected);
        } catch (ScoringFilterException e) {
          LOG.error(StringUtils.stringifyException(e));
        }
        values.add(injected);
        List<CrawlDatum> res = inject.update(values);

View Full Code Here

    public FetcherThread(Configuration conf) {
      this.setDaemon(true);                       // don't hang JVM on exit
      this.setName("FetcherThread");              // use an informative name
      this.conf = conf;
      this.urlFilters = new URLFilters(conf);
      this.scfilters = new ScoringFilters(conf);
      this.parseUtil = new ParseUtil(conf);
      this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
      this.protocolFactory = new ProtocolFactory(conf);
      this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
      this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;

View Full Code Here

    public FetcherThread(Configuration conf) {
      this.setDaemon(true);                       // don't hang JVM on exit
      this.setName("FetcherThread");              // use an informative name
      this.conf = conf;
      this.urlFilters = new URLFilters(conf);
      this.scfilters = new ScoringFilters(conf);
      this.parseUtil = new ParseUtil(conf);
      this.protocolFactory = new ProtocolFactory(conf);
      this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
    }

View Full Code Here

    public void configure(JobConf job) {
      this.jobConf = job;
      urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
      interval = jobConf.getInt("db.fetch.interval.default", 2592000);
      filters = new URLFilters(jobConf);
      scfilters = new ScoringFilters(jobConf);
      scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
      curTime = job
          .getLong("injector.current.time", System.currentTimeMillis());
    }

View Full Code Here


    if(job.getBoolean("parse.normalize.urls", true)) {
      normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
    }


    this.scfilters = new ScoringFilters(job);
    final int interval = job.getInt("db.fetch.interval.default", 2592000);
    final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
    int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
    final boolean isParsing = job.getBoolean("fetcher.parse", true);
    final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE

View Full Code Here

      if (GENERATOR_COUNT_VALUE_DOMAIN.equals(job.get(GENERATOR_COUNT_MODE))) byDomain = true;
      filters = new URLFilters(job);
      normalise = job.getBoolean(GENERATOR_NORMALISE, true);
      if (normalise) normalizers = new URLNormalizers(job,
          URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
      scfilters = new ScoringFilters(job);
      partitioner.configure(job);
      filter = job.getBoolean(GENERATOR_FILTER, true);
      genDelay = job.getLong(GENERATOR_DELAY, 7L) * 3600L * 24L * 1000L;
      long time = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);
      if (time > 0) genTime.set(time);

View Full Code Here

    super(conf);
  }


  public void configure(JobConf job) {
    setConf(job);
    this.scfilters = new ScoringFilters(job);
    skipTruncated=job.getBoolean(SKIP_TRUNCATED, true);
  }

View Full Code Here


    if (ParseSegment.isTruncated(content)) {
      LOG.warn("Content is truncated, parse may fail!");
    }


    ScoringFilters scfilters = new ScoringFilters(conf);
    // call the scoring filters
    try {
      scfilters.passScoreBeforeParsing(turl, cd, content);
    } catch (Exception e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Couldn't pass score, url " + turl.toString() + " (" + e + ")");
      }
    }    
    
    ParseResult parseResult = new ParseUtil(conf).parse(content);


    if (parseResult == null) {
      LOG.error("Problem with parse - check log");
      return (-1);
    }


    // Calculate the signature
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parseResult.get(new Text(url)));
    
    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
      LOG.info("signature: " + StringUtil.toHexString(signature));
    }


    // call the scoring filters
    try {
      scfilters.passScoreAfterParsing(turl, content, parseResult.get(turl));
    } catch (Exception e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Couldn't pass score, url " + turl + " (" + e + ")");
      }
    }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.nutch.scoring.ScoringFilters

org.apache.nutch.crawl.CrawlDbReducer

org.apache.nutch.crawl.DbUpdateMapper

org.apache.nutch.crawl.DbUpdateReducer

org.apache.nutch.crawl.DbUpdaterJob

org.apache.nutch.crawl.Generator$Selector

org.apache.nutch.crawl.GeneratorMapper

org.apache.nutch.crawl.Injector$InjectMapper

org.apache.nutch.crawl.InjectorJob$UrlMapper

org.apache.nutch.crawl.TestCrawlDbStates

org.apache.nutch.fetcher.Fetcher$FetcherThread

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact: coftware#gmail.com.