Examples of NutchWritable

org.apache.nutch.crawl.NutchWritable

Examples of org.apache.nutch.crawl.NutchWritable

         */
        content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
      }


      try {
        output.collect(key, new NutchWritable(datum));
        if (content != null && storingContent)
          output.collect(key, new NutchWritable(content));
        if (parseResult != null) {
          for (Entry<Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();
            
            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }


            // Calculate page signature. For non-parsing fetchers this will
            // be done in ParseSegment
            byte[] signature = 
              SignatureFactory.getSignature(getConf()).calculate(content, parse);
            // Ensure segment name and score are in parseData metadata
            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, 
                segmentName);
            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, 
                StringUtil.toHexString(signature));
            // Pass fetch time to content meta
            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
                Long.toString(datum.getFetchTime()));
            if (url.equals(key))
              datum.setSignature(signature);
            try {
              scfilters.passScoreAfterParsing(url, content, parse);
            } catch (Exception e) {
              if (LOG.isWarnEnabled()) {
                e.printStackTrace(LogUtil.getWarnStream(LOG));
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }
            output.collect(url, new NutchWritable(
                    new ParseImpl(new ParseText(parse.getText()), 
                                  parse.getData(), parse.isCanonical())));
          }
        }
      } catch (IOException e) {

View Full Code Here

Examples of org.apache.nutch.crawl.NutchWritable

      return;
    } else {
      key.set(urlString);
    }


    output.collect(key, new NutchWritable(value));
  }

View Full Code Here

Examples of org.apache.nutch.crawl.NutchWritable

         */
        content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
      }


      try {
        output.collect(key, new NutchWritable(datum));
        if (content != null && storingContent)
          output.collect(key, new NutchWritable(content));
        if (parseResult != null) {
          for (Entry<Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();
            ParseData parseData = parse.getData();


            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }


            // Calculate page signature. For non-parsing fetchers this will
            // be done in ParseSegment
            byte[] signature =
              SignatureFactory.getSignature(getConf()).calculate(content, parse);
            // Ensure segment name and score are in parseData metadata
            parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
                segmentName);
            parseData.getContentMeta().set(Nutch.SIGNATURE_KEY,
                StringUtil.toHexString(signature));
            // Pass fetch time to content meta
            parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY,
                Long.toString(datum.getFetchTime()));
            if (url.equals(key))
              datum.setSignature(signature);
            try {
              scfilters.passScoreAfterParsing(url, content, parse);
            } catch (Exception e) {
              if (LOG.isWarnEnabled()) {
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }


            String fromHost;


            // collect outlinks for subsequent db update
            Outlink[] links = parseData.getOutlinks();
            int outlinksToStore = Math.min(maxOutlinks, links.length);
            if (ignoreExternalLinks) {
              try {
                fromHost = new URL(url.toString()).getHost().toLowerCase();
              } catch (MalformedURLException e) {
                fromHost = null;
              }
            } else {
              fromHost = null;
            }


            int validCount = 0;


            // Process all outlinks, normalize, filter and deduplicate
            List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
            HashSet<String> outlinks = new HashSet<String>(outlinksToStore);
            for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
              String toUrl = links[i].getToUrl();


              toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, fromHost, ignoreExternalLinks, urlFilters, normalizers);
              if (toUrl == null) {
                continue;
              }


              validCount++;
              links[i].setUrl(toUrl);
              outlinkList.add(links[i]);
              outlinks.add(toUrl);
            }


            // Only process depth N outlinks
            if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
              reporter.incrCounter("FetcherOutlinks", "outlinks_detected", outlinks.size());


              // Counter to limit num outlinks to follow per page
              int outlinkCounter = 0;


              // Calculate variable number of outlinks by depth using the divisor (outlinks = Math.floor(divisor / depth * num.links))
              int maxOutlinksByDepth = (int)Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);


              String followUrl;


              // Walk over the outlinks and add as new FetchItem to the queues
              Iterator<String> iter = outlinks.iterator();
              while(iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
                followUrl = iter.next();


                // Check whether we'll follow external outlinks
                if (outlinksIgnoreExternal) {
                  if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
                    continue;
                  }
                }


                reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1);


                // Create new FetchItem with depth incremented
                FetchItem fit = FetchItem.create(new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);
                fetchQueues.addFetchItem(fit);


                outlinkCounter++;
              }
            }


            // Overwrite the outlinks in ParseData with the normalized and filtered set
            parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));


            output.collect(url, new NutchWritable(
                    new ParseImpl(new ParseText(parse.getText()),
                                  parseData, parse.isCanonical())));
          }
        }
      } catch (IOException e) {

View Full Code Here

Examples of org.apache.nutch.crawl.NutchWritable

      // UTF8 deprecated and replaced by Text.
      if (key instanceof Text) {
        newKey.set(key.toString());
        key = newKey;
      }
      collector.collect((Text)key, new NutchWritable(value));
    }

View Full Code Here

Examples of org.apache.nutch.crawl.NutchWritable

         */
        content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
      }


      try {
        output.collect(key, new NutchWritable(datum));
        if (content != null && storingContent)
          output.collect(key, new NutchWritable(content));
        if (parseResult != null) {
          for (Entry<Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();
            
            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }


            // Calculate page signature. For non-parsing fetchers this will
            // be done in ParseSegment
            byte[] signature = 
              SignatureFactory.getSignature(getConf()).calculate(content, parse);
            // Ensure segment name and score are in parseData metadata
            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, 
                segmentName);
            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, 
                StringUtil.toHexString(signature));
            // Pass fetch time to content meta
            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
                Long.toString(datum.getFetchTime()));
            if (url.equals(key))
              datum.setSignature(signature);
            try {
              scfilters.passScoreAfterParsing(url, content, parse);
            } catch (Exception e) {
              if (LOG.isWarnEnabled()) {
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }
            output.collect(url, new NutchWritable(
                    new ParseImpl(new ParseText(parse.getText()), 
                                  parse.getData(), parse.isCanonical())));
          }
        }
      } catch (IOException e) {

View Full Code Here

Examples of org.apache.nutch.crawl.NutchWritable

        if (datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP ||
            datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM ||
            datum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) {


            // Tell the reducer to get rid of all instances of this key
            output.collect(key, new NutchWritable(new BooleanWritable(true)));
        }
      }
      else if (value instanceof ParseData) {
        // get the parse data and the outlinks from the parse data, along with
        // the fetch time for those links
        ParseData data = (ParseData)value;
        long fetchTime = getFetchTime(data);
        Outlink[] outlinkAr = data.getOutlinks();
        Map<String, String> outlinkMap = new LinkedHashMap<String, String>();


        // normalize urls and put into map
        if (outlinkAr != null && outlinkAr.length > 0) {
          for (int i = 0; i < outlinkAr.length; i++) {
            Outlink outlink = outlinkAr[i];
            String toUrl = normalizeUrl(outlink.getToUrl());


            if (filterUrl(toUrl) == null) {
              continue;
            }


            // only put into map if the url doesn't already exist in the map or
            // if it does and the anchor for that link is null, will replace if
            // url is existing
            boolean existingUrl = outlinkMap.containsKey(toUrl);
            if (toUrl != null
              && (!existingUrl || (existingUrl && outlinkMap.get(toUrl) == null))) {
              outlinkMap.put(toUrl, outlink.getAnchor());
            }
          }
        }


        // collect the outlinks under the fetch time
        for (String outlinkUrl : outlinkMap.keySet()) {
          String anchor = outlinkMap.get(outlinkUrl);
          LinkDatum datum = new LinkDatum(outlinkUrl, anchor, fetchTime);
          output.collect(key, new NutchWritable(datum));
        }
      }
      else if (value instanceof LinkDatum) {
        LinkDatum datum = (LinkDatum)value;
        String linkDatumUrl = normalizeUrl(datum.getUrl());


        if (filterUrl(linkDatumUrl) != null) {
          datum.setUrl(linkDatumUrl);


          // collect existing outlinks from existing OutlinkDb
          output.collect(key, new NutchWritable(datum));
        }
      }
    }

View Full Code Here

Examples of org.apache.nutch.crawl.NutchWritable

    this.scfilters = new ScoringFilters(getConf());
  }


  public void map(Text key, Writable value,
      OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException {
    output.collect(key, new NutchWritable(value));
  }

View Full Code Here

Examples of org.apache.nutch.crawl.NutchWritable

         */
        content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
      }


      try {
        output.collect(key, new NutchWritable(datum));
        if (content != null && storingContent)
          output.collect(key, new NutchWritable(content));
        if (parseResult != null) {
          for (Entry<Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();
            
            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }


            // Calculate page signature. For non-parsing fetchers this will
            // be done in ParseSegment
            byte[] signature = 
              SignatureFactory.getSignature(getConf()).calculate(content, parse);
            // Ensure segment name and score are in parseData metadata
            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, 
                segmentName);
            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, 
                StringUtil.toHexString(signature));
            // Pass fetch time to content meta
            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
                Long.toString(datum.getFetchTime()));
            if (url.equals(key))
              datum.setSignature(signature);
            try {
              scfilters.passScoreAfterParsing(url, content, parse);
            } catch (Exception e) {
              if (LOG.isWarnEnabled()) {
                e.printStackTrace(LogUtil.getWarnStream(LOG));
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }
            output.collect(url, new NutchWritable(
                    new ParseImpl(new ParseText(parse.getText()), 
                                  parse.getData(), parse.isCanonical())));
          }
        }
      } catch (IOException e) {

View Full Code Here

Examples of org.apache.nutch.crawl.NutchWritable

      return;
    } else {
      key.set(urlString);
    }


    output.collect(key, new NutchWritable(value));
  }

View Full Code Here

Examples of org.apache.nutch.crawl.NutchWritable

    this.scfilters = new ScoringFilters(getConf());
  }


  public void map(Text key, Writable value,
      OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException {
    output.collect(key, new NutchWritable(value));
  }

View Full Code Here

0 1 2 3

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.