Package net.nutch.pagedb

Examples of net.nutch.pagedb.FetchListEntry


    public void run() {

      FetcherOutput fetcherOutput = new FetcherOutput();
      Content content = new Content();

      FetchListEntry fle = null;
      String url = null;

      while (true) {
        if (LogFormatter.hasLoggedSevere())       // something bad happened
          break;                                  // exit

        t0 = System.currentTimeMillis();

        try {

          // must be read in order! thus synchronize threads.
          synchronized (ParseSegment.this) {
            t1 = System.currentTimeMillis();

            try {
              if (fetcherNPReader.next(fetcherOutput) == null ||
                contentReader.next(content) == null)
              return;
            } catch (EOFException eof) {
              // only partial data available, stop this thread,
              // other threads will be stopped also.
              return;
            }

            entry++;
            myEntry = entry;
            if (LOG.isLoggable(Level.FINE))
              LOG.fine("Read in entry "+entry);

            // safe guard against mismatched files
            //if (entry != fetcherNPReader.key() ||
            //    entry != contentReader.key()) {
            //  LOG.severe("Mismatched entries under "
            //    + FetcherOutput.DIR_NAME_NP + " and " + Content.DIR_NAME);
            //  continue;
            //}
          }

          t2 = System.currentTimeMillis();

          fle = fetcherOutput.getFetchListEntry();
          url = fle.getPage().getURL().toString();

          LOG.fine("parsing " + url);            // parse the page

          // safe guard against mismatched files
          if (!url.equals(content.getUrl())) {
View Full Code Here


     * of URLs to be fetched (in a thread-safe way).  It checks
     * whether the URL is OK to download.  If so, we do it.
     */
    public void run() {

      FetchListEntry fle = new FetchListEntry();

      while (true) {
        if (LogFormatter.hasLoggedSevere())       // something bad happened
          break;                                  // exit
       
        String url = null;
        try {

          if (fetchList.next(fle) == null)
            break;

          url = fle.getPage().getURL().toString();

          if (!fle.getFetch()) {                  // should we fetch this page?
            if (LOG.isLoggable(Level.FINE))
              LOG.fine("not fetching " + url);
            handleNoFetch(fle, FetcherOutput.SUCCESS);
            continue;
          }
View Full Code Here

      File sortedFile = new File(segmentDir, ".sorted");
      nfs.delete(unsortedFile);
      nfs.delete(sortedFile);
      SequenceFile.Writer seqWriter = new SequenceFile.Writer(nfs,
              unsortedFile.toString(), UTF8.class, LongWritable.class);
      FetchListEntry fle;
      LongWritable rec = new LongWritable();
      UTF8 url = new UTF8();
      String urlString;
      while (fetcherReader.next(fo) != null) {
        fle = fo.getFetchListEntry();
        urlString = fle.getPage().getURL().toString();
        rec.set(recNo);
        url.set(urlString);
        seqWriter.append(url, rec);
        recNo++;
      }
View Full Code Here

      if (unique) {
        rnd = "/" + System.currentTimeMillis();
        url += rnd;
      }
      url += "/example.html";
      FetchListEntry fle = new FetchListEntry(true, new Page(url, 1.0f), new String[] { "test" + rnd });
      FetcherOutput fo = new FetcherOutput(fle, MD5Hash.digest(url), FetcherOutput.SUCCESS);
      StringBuffer content = new StringBuffer("<html><body><h1>Hello from Page " + i + "</h1>");
      if (unique) {
        content.append("<p>Created at epoch time: " + System.currentTimeMillis() + ", " + r.nextLong() + "</p>");
      }
View Full Code Here

TOP

Related Classes of net.nutch.pagedb.FetchListEntry

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.