Source Code of org.apache.nutch.fetcher.OldFetcher$FetcherThread

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.nutch.fetcher;


import java.io.IOException;
import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.Map.Entry;


// Slf4j Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;


import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.*;
import org.apache.nutch.protocol.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.*;




/** The fetcher. Most of the work is done by plugins. */
public class OldFetcher extends Configured implements Tool, MapRunnable<WritableComparable, Writable, Text, NutchWritable> { 


  public static final Logger LOG = LoggerFactory.getLogger(OldFetcher.class);
  
  public static final int PERM_REFRESH_TIME = 5;


  public static final String CONTENT_REDIR = "content";


  public static final String PROTOCOL_REDIR = "protocol";


  public static class InputFormat extends SequenceFileInputFormat<WritableComparable, Writable> {
    /** Don't split inputs, to keep things polite. */
    public InputSplit[] getSplits(JobConf job, int nSplits)
      throws IOException {
      FileStatus[] files = listStatus(job);
      FileSystem fs = FileSystem.get(job);
      InputSplit[] splits = new InputSplit[files.length];
      for (int i = 0; i < files.length; i++) {
        FileStatus cur = files[i];
        splits[i] = new FileSplit(cur.getPath(), 0,
            cur.getLen(), (String[])null);
      }
      return splits;
    }
  }


  private RecordReader<WritableComparable, Writable> input;
  private OutputCollector<Text, NutchWritable> output;
  private Reporter reporter;


  private String segmentName;
  private int activeThreads;
  private int maxRedirect;


  private long start = System.currentTimeMillis(); // start time of fetcher run
  private long lastRequestStart = start;


  private long bytes;                             // total bytes fetched
  private int pages;                              // total pages fetched
  private int errors;                             // total pages errored


  private boolean storingContent;
  private boolean parsing;


  private class FetcherThread extends Thread {
    private Configuration conf;
    private URLFilters urlFilters;
    private ScoringFilters scfilters;
    private ParseUtil parseUtil;
    private URLNormalizers normalizers;
    private ProtocolFactory protocolFactory;
    private boolean redirecting;
    private int redirectCount;
    private String reprUrl;


    public FetcherThread(Configuration conf) {
      this.setDaemon(true);                       // don't hang JVM on exit
      this.setName("FetcherThread");              // use an informative name
      this.conf = conf;
      this.urlFilters = new URLFilters(conf);
      this.scfilters = new ScoringFilters(conf);
      this.parseUtil = new ParseUtil(conf);
      this.protocolFactory = new ProtocolFactory(conf);
      this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
    }


    public void run() {
      synchronized (OldFetcher.this) {activeThreads++;} // count threads
      
      try {
        Text key = new Text();
        CrawlDatum datum = new CrawlDatum();
        
        while (true) {
          // TODO : NUTCH-258 ...
          // If something bad happened, then exit
          // if (conf.getBoolean("fetcher.exit", false)) {
          //   break;
          // ]
          
          try {                                   // get next entry from input
            if (!input.next(key, datum)) {
              break;                              // at eof, exit
            }
          } catch (IOException e) {
            if (LOG.isErrorEnabled()) {
              LOG.error("fetcher caught:"+e.toString());
            }
            break;
          }


          synchronized (OldFetcher.this) {
            lastRequestStart = System.currentTimeMillis();
          }


          // url may be changed through redirects.
          Text url = new Text(key);


          Text reprUrlWritable =
            (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
          if (reprUrlWritable == null) {
            reprUrl = key.toString();
          } else {
            reprUrl = reprUrlWritable.toString();
          }


          try {
            if (LOG.isInfoEnabled()) { LOG.info("fetching " + url); }


            // fetch the page
            redirectCount = 0;
            do {
              if (LOG.isDebugEnabled()) {
                LOG.debug("redirectCount=" + redirectCount);
              }
              redirecting = false;
              Protocol protocol = this.protocolFactory.getProtocol(url.toString());
              ProtocolOutput output = protocol.getProtocolOutput(url, datum);
              ProtocolStatus status = output.getStatus();
              Content content = output.getContent();
              ParseStatus pstatus = null;


              String urlString = url.toString();
              if (reprUrl != null && !reprUrl.equals(urlString)) {
                datum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
                    new Text(reprUrl));
              }


              switch(status.getCode()) {


              case ProtocolStatus.SUCCESS:        // got a page
                pstatus = output(url, datum, content, status, CrawlDatum.STATUS_FETCH_SUCCESS);
                updateStatus(content.getContent().length);
                if (pstatus != null && pstatus.isSuccess() &&
                        pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                  String newUrl = pstatus.getMessage();
                  int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                  url = handleRedirect(url, datum, urlString, newUrl,
                                       refreshTime < PERM_REFRESH_TIME,
                                       CONTENT_REDIR);
                }
                break;


              case ProtocolStatus.MOVED:         // redirect
              case ProtocolStatus.TEMP_MOVED:
                int code;
                boolean temp;
                if (status.getCode() == ProtocolStatus.MOVED) {
                  code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                  temp = false;
                } else {
                  code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                  temp = true;
                }
                output(url, datum, content, status, code);
                String newUrl = status.getMessage();
                url = handleRedirect(url, datum, urlString, newUrl,
                                     temp, PROTOCOL_REDIR);
                break;


              // failures - increase the retry counter
              case ProtocolStatus.EXCEPTION:
                logError(url, status.getMessage());
              /* FALLTHROUGH */
              case ProtocolStatus.RETRY:          // retry
              case ProtocolStatus.WOULDBLOCK:
              case ProtocolStatus.BLOCKED:
                output(url, datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                break;
                
              // permanent failures
              case ProtocolStatus.GONE:           // gone
              case ProtocolStatus.NOTFOUND:
              case ProtocolStatus.ACCESS_DENIED:
              case ProtocolStatus.ROBOTS_DENIED:
                output(url, datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
                break;


              case ProtocolStatus.NOTMODIFIED:
                output(url, datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
                break;
                
              default:
                if (LOG.isWarnEnabled()) {
                  LOG.warn("Unknown ProtocolStatus: " + status.getCode());
                }
                output(url, datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
              }


              if (redirecting && redirectCount >= maxRedirect) {
                if (LOG.isInfoEnabled()) {
                  LOG.info(" - redirect count exceeded " + url);
                }
                output(url, datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
              }


            } while (redirecting && (redirectCount < maxRedirect));


            
          } catch (Throwable t) {                 // unexpected exception
            logError(url, t.toString());
            output(url, datum, null, null, CrawlDatum.STATUS_FETCH_RETRY);
            
          }
        }


      } catch (Throwable e) {
        if (LOG.isErrorEnabled()) {
          LOG.error("fetcher caught:"+e.toString());
        }
      } finally {
        synchronized (OldFetcher.this) {activeThreads--;} // count threads
      }
    }


    private Text handleRedirect(Text url, CrawlDatum datum,
                                String urlString, String newUrl,
                                boolean temp, String redirType)
    throws MalformedURLException, URLFilterException {
      newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
      newUrl = urlFilters.filter(newUrl);
      if (newUrl != null && !newUrl.equals(urlString)) {
        reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
        url = new Text(newUrl);
        if (maxRedirect > 0) {
          redirecting = true;
          redirectCount++;
          if (LOG.isDebugEnabled()) {
            LOG.debug(" - " + redirType + " redirect to " +
                      url + " (fetching now)");
          }
          return url;
        } else {
          CrawlDatum newDatum = new CrawlDatum();
          if (reprUrl != null) {
            newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
                new Text(reprUrl));
          }
          output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
          if (LOG.isDebugEnabled()) {
            LOG.debug(" - " + redirType + " redirect to " +
                      url + " (fetching later)");
          }
          return null;
        }
      } else {
        if (LOG.isDebugEnabled()) {
          LOG.debug(" - " + redirType + " redirect skipped: " +
              (newUrl != null ? "to same url" : "filtered"));
        }
        return null;
      }
    }


    private void logError(Text url, String message) {
      if (LOG.isInfoEnabled()) {
        LOG.info("fetch of " + url + " failed with: " + message);
      }
      synchronized (OldFetcher.this) {               // record failure
        errors++;
      }
    }


    private ParseStatus output(Text key, CrawlDatum datum,
                        Content content, ProtocolStatus pstatus, int status) {


      datum.setStatus(status);
      datum.setFetchTime(System.currentTimeMillis());
      if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);


      ParseResult parseResult = null;
      if (content != null) {
        Metadata metadata = content.getMetadata();
        // add segment to metadata
        metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
        // add score to content metadata so that ParseSegment can pick it up.
        try {
          scfilters.passScoreBeforeParsing(key, datum, content);
        } catch (Exception e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
          }
        }
        /* Note: Fetcher will only follow meta-redirects coming from the
         * original URL. */ 
        if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
          try {
            parseResult = this.parseUtil.parse(content);
          } catch (Exception e) {
            LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
          }


          if (parseResult == null) {
            byte[] signature = 
              SignatureFactory.getSignature(getConf()).calculate(content, 
                  new ParseStatus().getEmptyParse(conf));
            datum.setSignature(signature);
          }
        }
        
        /* Store status code in content So we can read this value during 
         * parsing (as a separate job) and decide to parse or not.
         */
        content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
      }


      try {
        output.collect(key, new NutchWritable(datum));
        if (content != null && storingContent)
          output.collect(key, new NutchWritable(content));
        if (parseResult != null) {
          for (Entry<Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();
            
            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }


            // Calculate page signature. For non-parsing fetchers this will
            // be done in ParseSegment
            byte[] signature = 
              SignatureFactory.getSignature(getConf()).calculate(content, parse);
            // Ensure segment name and score are in parseData metadata
            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, 
                segmentName);
            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, 
                StringUtil.toHexString(signature));
            // Pass fetch time to content meta
            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
                Long.toString(datum.getFetchTime()));
            if (url.equals(key))
              datum.setSignature(signature);
            try {
              scfilters.passScoreAfterParsing(url, content, parse);
            } catch (Exception e) {
              if (LOG.isWarnEnabled()) {
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }
            output.collect(url, new NutchWritable(
                    new ParseImpl(new ParseText(parse.getText()), 
                                  parse.getData(), parse.isCanonical())));
          }
        }
      } catch (IOException e) {
        if (LOG.isErrorEnabled()) {
          LOG.error("fetcher caught:"+e.toString());
        }
      }


      // return parse status if it exits
      if (parseResult != null && !parseResult.isEmpty()) {
        Parse p = parseResult.get(content.getUrl());
        if (p != null) {
          return p.getData().getStatus();
        }
      } 
      return null;
    }
    
  }


  private synchronized void updateStatus(int bytesInPage) throws IOException {
    pages++;
    bytes += bytesInPage;
  }


  private void reportStatus() throws IOException {
    String status;
    synchronized (this) {
      long elapsed = (System.currentTimeMillis() - start)/1000;
      status = 
        pages+" pages, "+errors+" errors, "
        + Math.round(((float)pages*10)/elapsed)/10.0+" pages/s, "
        + Math.round(((((float)bytes)*8)/1024)/elapsed)+" kb/s, ";
    }
    reporter.setStatus(status);
  }


  public OldFetcher() {
    
  }
  
  public OldFetcher(Configuration conf) {
    setConf(conf);
  }
  
  public void configure(JobConf job) {
    setConf(job);


    this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY);
    this.storingContent = isStoringContent(job);
    this.parsing = isParsing(job);


//    if (job.getBoolean("fetcher.verbose", false)) {
//      LOG.setLevel(Level.FINE);
//    }
  }


  public void close() {}


  public static boolean isParsing(Configuration conf) {
    return conf.getBoolean("fetcher.parse", true);
  }


  public static boolean isStoringContent(Configuration conf) {
    return conf.getBoolean("fetcher.store.content", true);
  }


  public void run(RecordReader<WritableComparable, Writable> input, OutputCollector<Text, NutchWritable> output,
                  Reporter reporter) throws IOException {


    this.input = input;
    this.output = output;
    this.reporter = reporter;


    this.maxRedirect = getConf().getInt("http.redirect.max", 3);
    
    int threadCount = getConf().getInt("fetcher.threads.fetch", 10);
    if (LOG.isInfoEnabled()) { LOG.info("OldFetcher: threads: " + threadCount); }


    for (int i = 0; i < threadCount; i++) {       // spawn threads
      new FetcherThread(getConf()).start();
    }


    // select a timeout that avoids a task timeout
    long timeout = getConf().getInt("mapred.task.timeout", 10*60*1000)/2;


    do {                                          // wait for threads to exit
      try {
        Thread.sleep(1000);
      } catch (InterruptedException e) {}


      reportStatus();


      // some requests seem to hang, despite all intentions
      synchronized (this) {
        if ((System.currentTimeMillis() - lastRequestStart) > timeout) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Aborting with "+activeThreads+" hung threads.");
          }
          return;
        }
      }


    } while (activeThreads > 0);
    
  }


  public void fetch(Path segment, int threads)
    throws IOException {


    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("OldFetcher: starting at " + sdf.format(start));
      LOG.info("OldFetcher: segment: " + segment);
    }


    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);


    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());


    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);


    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);


    job.setMapRunnerClass(OldFetcher.class);


    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);


    JobClient.runJob(job);
    long end = System.currentTimeMillis();
    LOG.info("OldFetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
  }




  /** Run the fetcher. */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new OldFetcher(), args);
    System.exit(res);
  }
  
  public int run(String[] args) throws Exception {


    String usage = "Usage: OldFetcher <segment> [-threads n] [-noParsing]";


    if (args.length < 1) {
      System.err.println(usage);
      return -1;
    }
      
    Path segment = new Path(args[0]);
    int threads = getConf().getInt("fetcher.threads.fetch", 10);
    boolean parsing = true;


    for (int i = 1; i < args.length; i++) {       // parse command line
      if (args[i].equals("-threads")) {           // found -threads option
        threads =  Integer.parseInt(args[++i]);
      } else if (args[i].equals("-noParsing")) parsing = false;
    }


    getConf().setInt("fetcher.threads.fetch", threads);
    if (!parsing) {
      getConf().setBoolean("fetcher.parse", parsing);
    }
    try {
      fetch(segment, threads);              // run the Fetcher
      return 0;
    } catch (Exception e) {
      LOG.error("OldFetcher: " + StringUtils.stringifyException(e));
      return -1;
    }


  }
}
Source Code of org.apache.nutch.fetcher.OldFetcher$FetcherThread

Related Classes of org.apache.nutch.fetcher.OldFetcher$FetcherThread