Package org.apache.nutch.pagedb

Examples of org.apache.nutch.pagedb.FetchListEntry


  }

  public ProtocolOutput getProtocolOutput(String urlString) {
    ProtocolOutput output = null;
    try {
      return getProtocolOutput(new FetchListEntry(true,
            new Page(urlString, 1.0f), new String[0]));
    } catch (MalformedURLException mue) {
      return new ProtocolOutput(null, new ProtocolStatus(mue));
    }
  }
View Full Code Here


  public void setMaxContentLength(int length) {maxContentLength = length;}

  public ProtocolOutput getProtocolOutput(String urlString) {
    ProtocolOutput output = null;
    try {
      return getProtocolOutput(new FetchListEntry(true,
            new Page(urlString, 1.0f), new String[0]));
    } catch (MalformedURLException mue) {
      return new ProtocolOutput(null, new ProtocolStatus(mue));
    }
  }
View Full Code Here

      File sortedFile = new File(segmentDir, ".sorted");
      nfs.delete(unsortedFile);
      nfs.delete(sortedFile);
      SequenceFile.Writer seqWriter = new SequenceFile.Writer(nfs,
              unsortedFile.toString(), UTF8.class, LongWritable.class);
      FetchListEntry fle;
      LongWritable rec = new LongWritable();
      UTF8 url = new UTF8();
      String urlString;
      while (fetcherReader.next(fo) != null) {
        fle = fo.getFetchListEntry();
        urlString = fle.getPage().getURL().toString();
        rec.set(recNo);
        url.set(urlString);
        seqWriter.append(url, rec);
        recNo++;
      }
View Full Code Here

     * of URLs to be fetched (in a thread-safe way).  It checks
     * whether the URL is OK to download.  If so, we do it.
     */
    public void run() {

      FetchListEntry fle = new FetchListEntry();

      while (true) {
        if (LogFormatter.hasLoggedSevere())       // something bad happened
          break;                                  // exit
       
        String url = null;
        try {

          if (fetchList.next(fle) == null)
            break;

          url = fle.getPage().getURL().toString();

          if (!fle.getFetch()) {                  // should we fetch this page?
            if (LOG.isLoggable(Level.FINE))
              LOG.fine("not fetching " + url);
            handleFetch(fle, new ProtocolOutput(null, ProtocolStatus.STATUS_NOTFETCHING));
            continue;
          }

          // support multiple redirects, if requested by protocol
          // or content meta-tags (the latter requires running Fetcher
          // in parsing mode). Protocol-level redirects take precedence over
          // content-level redirects. Some plugins can handle redirects
          // automatically, so that only the final success or failure will be
          // reported here.
          boolean refetch = false;
          int redirCnt = 0;
          do {
            LOG.fine("redirCnt=" + redirCnt);
            refetch = false;
            LOG.info("fetching " + url);            // fetch the page
            Protocol protocol = ProtocolFactory.getProtocol(url);
            ProtocolOutput output = protocol.getProtocolOutput(fle);
            ProtocolStatus pstat = output.getStatus();
            Content content = output.getContent();
            switch(pstat.getCode()) {
              case ProtocolStatus.SUCCESS:
                if (content != null) {
                  synchronized (Fetcher.this) {           // update status
                    pages++;
                    bytes += content.getContent().length;
                    if ((pages % 100) == 0) {             // show status every 100pp
                      status();
                    }
                  }
                  ParseStatus ps = handleFetch(fle, output);
                  if (ps != null && ps.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                    String newurl = ps.getMessage();
                    newurl = URLFilters.filter(newurl);
                    if (newurl != null && !newurl.equals(url)) {
                      refetch = true;
                      url = newurl;
                      redirCnt++;
                      fle = new FetchListEntry(true, new Page(url, NEW_INJECTED_PAGE_SCORE), new String[0]);
                      LOG.fine(" - content redirect to " + url);
                    } else {
                      LOG.fine(" - content redirect skipped, " +
                              (url.equals(newurl)? "newurl == url" : "prohibited by urlfilter"));
                    }
                  }
                }
                break;
              case ProtocolStatus.MOVED: // try to redirect immediately
              case ProtocolStatus.TEMP_MOVED: // try to redirect immediately
                // record the redirect. perhaps the DB will want to know this.
                handleFetch(fle, output);
                String newurl = pstat.getMessage();
                newurl = URLFilters.filter(newurl);
                if (newurl != null && !newurl.equals(url)) {
                  refetch = true;
                  url = newurl;
                  redirCnt++;
                  // create new entry.
                  fle = new FetchListEntry(true, new Page(url, NEW_INJECTED_PAGE_SCORE), new String[0]);
                  LOG.info(" - protocol redirect to " + url);
                } else {
                  LOG.fine(" - protocol redirect skipped, " +
                          (url.equals(newurl)? "newurl == url" : "prohibited by urlfilter"));
                }
View Full Code Here

    public void run() {

      FetcherOutput fetcherOutput = new FetcherOutput();
      Content content = new Content();

      FetchListEntry fle = null;
      String url = null;

      while (true) {
        if (LogFormatter.hasLoggedSevere())       // something bad happened
          break;                                  // exit

        t0 = System.currentTimeMillis();

        try {

          // must be read in order! thus synchronize threads.
          synchronized (ParseSegment.this) {
            t1 = System.currentTimeMillis();

            try {
              if (fetcherNPReader.next(fetcherOutput) == null ||
                contentReader.next(content) == null)
              return;
            } catch (EOFException eof) {
              // only partial data available, stop this thread,
              // other threads will be stopped also.
              return;
            }

            entry++;
            myEntry = entry;
            if (LOG.isLoggable(Level.FINE))
              LOG.fine("Read in entry "+entry);

            // safe guard against mismatched files
            //if (entry != fetcherNPReader.key() ||
            //    entry != contentReader.key()) {
            //  LOG.severe("Mismatched entries under "
            //    + FetcherOutput.DIR_NAME_NP + " and " + Content.DIR_NAME);
            //  continue;
            //}
          }

          t2 = System.currentTimeMillis();

          fle = fetcherOutput.getFetchListEntry();
          url = fle.getPage().getURL().toString();

          LOG.fine("parsing " + url);            // parse the page

          // safe guard against mismatched files
          if (!url.equals(content.getUrl())) {
View Full Code Here

  }

  public ProtocolOutput getProtocolOutput(String urlString) {
    ProtocolOutput output = null;
    try {
      return getProtocolOutput(new FetchListEntry(true,
            new Page(urlString, 1.0f), new String[0]));
    } catch (MalformedURLException mue) {
      return new ProtocolOutput(null, new ProtocolStatus(mue));
    }
  }
View Full Code Here

    }
  }

  public ProtocolOutput getProtocolOutput(String urlString) {
    try {
      return getProtocolOutput(new FetchListEntry(true, new Page(urlString, 1.0f), new String[0]));
    } catch (MalformedURLException mue) {
      return new ProtocolOutput(null, new ProtocolStatus(mue));
    }
  }
View Full Code Here

      if (unique) {
        rnd = "/" + System.currentTimeMillis();
        url += rnd;
      }
      url += "/example.html";
      FetchListEntry fle = new FetchListEntry(true, new Page(url, 1.0f), new String[] { "test" + rnd });
      FetcherOutput fo = new FetcherOutput(fle, MD5Hash.digest(url), ProtocolStatus.STATUS_SUCCESS);
      StringBuffer content = new StringBuffer("<html><body><h1>Hello from Page " + i + "</h1>");
      if (unique) {
        content.append("<p>Created at epoch time: " + System.currentTimeMillis() + ", " + r.nextLong() + "</p>");
      }
View Full Code Here

      File sortedFile = new File(segmentDir, ".sorted");
      nfs.delete(unsortedFile);
      nfs.delete(sortedFile);
      SequenceFile.Writer seqWriter = new SequenceFile.Writer(nfs,
              unsortedFile.toString(), UTF8.class, LongWritable.class);
      FetchListEntry fle;
      LongWritable rec = new LongWritable();
      UTF8 url = new UTF8();
      String urlString;
      while (fetcherReader.next(fo) != null) {
        fle = fo.getFetchListEntry();
        urlString = fle.getPage().getURL().toString();
        rec.set(recNo);
        url.set(urlString);
        seqWriter.append(url, rec);
        recNo++;
      }
View Full Code Here

TOP

Related Classes of org.apache.nutch.pagedb.FetchListEntry

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.