Package org.apache.nutch.crawl

Examples of org.apache.nutch.crawl.NutchWritable


      // UTF8 deprecated and replaced by Text.
      if (key instanceof Text) {
        newKey.set(key.toString());
        key = newKey;
      }
      collector.collect((Text)key, new NutchWritable(value));
    }
View Full Code Here


         */
        content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
      }

      try {
        output.collect(key, new NutchWritable(datum));
        if (content != null && storingContent)
          output.collect(key, new NutchWritable(content));
        if (parseResult != null) {
          for (Entry<Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();
           
            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }

            // Calculate page signature. For non-parsing fetchers this will
            // be done in ParseSegment
            byte[] signature =
              SignatureFactory.getSignature(getConf()).calculate(content, parse);
            // Ensure segment name and score are in parseData metadata
            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
                segmentName);
            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
                StringUtil.toHexString(signature));
            // Pass fetch time to content meta
            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
                Long.toString(datum.getFetchTime()));
            if (url.equals(key))
              datum.setSignature(signature);
            try {
              scfilters.passScoreAfterParsing(url, content, parse);
            } catch (Exception e) {
              if (LOG.isWarnEnabled()) {
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }
            output.collect(url, new NutchWritable(
                    new ParseImpl(new ParseText(parse.getText()),
                                  parse.getData(), parse.isCanonical())));
          }
        }
      } catch (IOException e) {
View Full Code Here

        if (datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP ||
            datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM ||
            datum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) {

            // Tell the reducer to get rid of all instances of this key
            output.collect(key, new NutchWritable(new BooleanWritable(true)));
        }
      }
      else if (value instanceof ParseData) {
        // get the parse data and the outlinks from the parse data, along with
        // the fetch time for those links
        ParseData data = (ParseData)value;
        long fetchTime = getFetchTime(data);
        Outlink[] outlinkAr = data.getOutlinks();
        Map<String, String> outlinkMap = new LinkedHashMap<String, String>();

        // normalize urls and put into map
        if (outlinkAr != null && outlinkAr.length > 0) {
          for (int i = 0; i < outlinkAr.length; i++) {
            Outlink outlink = outlinkAr[i];
            String toUrl = normalizeUrl(outlink.getToUrl());

            if (filterUrl(toUrl) == null) {
              continue;
            }

            // only put into map if the url doesn't already exist in the map or
            // if it does and the anchor for that link is null, will replace if
            // url is existing
            boolean existingUrl = outlinkMap.containsKey(toUrl);
            if (toUrl != null
              && (!existingUrl || (existingUrl && outlinkMap.get(toUrl) == null))) {
              outlinkMap.put(toUrl, outlink.getAnchor());
            }
          }
        }

        // collect the outlinks under the fetch time
        for (String outlinkUrl : outlinkMap.keySet()) {
          String anchor = outlinkMap.get(outlinkUrl);
          LinkDatum datum = new LinkDatum(outlinkUrl, anchor, fetchTime);
          output.collect(key, new NutchWritable(datum));
        }
      }
      else if (value instanceof LinkDatum) {
        LinkDatum datum = (LinkDatum)value;
        String linkDatumUrl = normalizeUrl(datum.getUrl());

        if (filterUrl(linkDatumUrl) != null) {
          datum.setUrl(linkDatumUrl);

          // collect existing outlinks from existing OutlinkDb
          output.collect(key, new NutchWritable(datum));
        }
      }
    }
View Full Code Here

          content, new ParseStatus().getEmptyParse(getConf()));
        datum.setSignature(signature);
      }

      try {
        output.collect(key, new NutchWritable(datum));
        output.collect(key, new NutchWritable(content));

        if (parseResult != null) {
          for (Entry <Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();

            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }

            // Calculate page signature.
            byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
              content, parse);
            // Ensure segment name and score are in parseData metadata
            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
              segmentName);
            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
              StringUtil.toHexString(signature));
            // Pass fetch time to content meta
            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
              Long.toString(datum.getFetchTime()));
            if (url.equals(key))
              datum.setSignature(signature);
            try {
              scfilters.passScoreAfterParsing(url, content, parse);
            }
            catch (Exception e) {
              if (LOG.isWarnEnabled()) {
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }
            output.collect(url, new NutchWritable(new ParseImpl(new ParseText(
              parse.getText()), parse.getData(), parse.isCanonical())));
          }
        }
      }
      catch (IOException e) {
View Full Code Here

      return;
    } else {
      key.set(urlString);
    }

    output.collect(key, new NutchWritable(value));
  }
View Full Code Here

        if (datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP ||
            datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM ||
            datum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) {

          // Tell the reducer to get rid of all instances of this key
          output.collect(key, new NutchWritable(new BooleanWritable(true)));
        }
      }
      else if (value instanceof ParseData) {
        // get the parse data and the outlinks from the parse data, along with
        // the fetch time for those links
        ParseData data = (ParseData)value;
        long fetchTime = getFetchTime(data);
        Outlink[] outlinkAr = data.getOutlinks();
        Map<String, String> outlinkMap = new LinkedHashMap<String, String>();

        // normalize urls and put into map
        if (outlinkAr != null && outlinkAr.length > 0) {
          for (int i = 0; i < outlinkAr.length; i++) {
            Outlink outlink = outlinkAr[i];
            String toUrl = normalizeUrl(outlink.getToUrl());

            if (filterUrl(toUrl) == null) {
              continue;
            }

            // only put into map if the url doesn't already exist in the map or
            // if it does and the anchor for that link is null, will replace if
            // url is existing
            boolean existingUrl = outlinkMap.containsKey(toUrl);
            if (toUrl != null
                && (!existingUrl || (existingUrl && outlinkMap.get(toUrl) == null))) {
              outlinkMap.put(toUrl, outlink.getAnchor());
            }
          }
        }

        // collect the outlinks under the fetch time
        for (String outlinkUrl : outlinkMap.keySet()) {
          String anchor = outlinkMap.get(outlinkUrl);
          LinkDatum datum = new LinkDatum(outlinkUrl, anchor, fetchTime);
          output.collect(key, new NutchWritable(datum));
        }
      }
      else if (value instanceof LinkDatum) {
        LinkDatum datum = (LinkDatum)value;
        String linkDatumUrl = normalizeUrl(datum.getUrl());

        if (filterUrl(linkDatumUrl) != null) {
          datum.setUrl(linkDatumUrl);

          // collect existing outlinks from existing OutlinkDb
          output.collect(key, new NutchWritable(datum));
        }
      }
    }
View Full Code Here

TOP

Related Classes of org.apache.nutch.crawl.NutchWritable

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.