Examples of org.apache.nutch.parse.ParseResult

org.apache.nutch.parse.ParseResult
A utility class that stores the result of a parse. Internally, a ParseResult stores <{@link Text}, {@link Parse}> pairs.
Parsers may return multiple results, which correspond to parts or other associated documents related to the original URL.

There will usually be one parse result that corresponds directly to the original URL, and possibly many (or no) results that correspond to derived URLs (or sub-URLs).

    datum.setStatus(status);
    datum.setFetchTime(System.currentTimeMillis());
    if (pstatus != null)
      datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);


    ParseResult parseResult = null;
    if (content != null) {
      Metadata metadata = content.getMetadata();
      // add segment to metadata
      metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
      // add score to content metadata so that ParseSegment can pick it up.
      try {
        scfilters.passScoreBeforeParsing(key, datum, content);
      }
      catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          e.printStackTrace(LogUtil.getWarnStream(LOG));
          LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
        }
      }


      try {


        // parse the content
        parseResult = this.parseUtil.parse(content);
      }
      catch (Exception e) {
        LOG.warn("Error parsing: " + key + ": "
          + StringUtils.stringifyException(e));
      }


      // set the content signature
      if (parseResult == null) {
        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
          content, new ParseStatus().getEmptyParse(getConf()));
        datum.setSignature(signature);
      }


      try {
        output.collect(key, new NutchWritable(datum));
        output.collect(key, new NutchWritable(content));


        if (parseResult != null) {
          for (Entry <Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();


            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }


            // Calculate page signature. 
            byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
              content, parse);
            // Ensure segment name and score are in parseData metadata
            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
              segmentName);
            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
              StringUtil.toHexString(signature));
            // Pass fetch time to content meta
            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
              Long.toString(datum.getFetchTime()));
            if (url.equals(key))
              datum.setSignature(signature);
            try {
              scfilters.passScoreAfterParsing(url, content, parse);
            }
            catch (Exception e) {
              if (LOG.isWarnEnabled()) {
                e.printStackTrace(LogUtil.getWarnStream(LOG));
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }
            output.collect(url, new NutchWritable(new ParseImpl(new ParseText(
              parse.getText()), parse.getData(), parse.isCanonical())));
          }
        }
      }
      catch (IOException e) {
        if (LOG.isErrorEnabled()) {
          LOG.error("ArcSegmentCreator caught:" + StringUtils.stringifyException(e));
        }
      }


      // return the parse status if it exists
      if (parseResult != null && !parseResult.isEmpty()) {
        Parse p = parseResult.get(content.getUrl());
        if (p != null) {
          return p.getData().getStatus();
        }
      }
    }

View Full Code Here

    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
    }
    
    ParseResult parseResult = new ParseUtil(conf).parse(content);
    
    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);


    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }

View Full Code Here

   *         were present in the feed file that this {@link Parser} dealt with.
   * 
   */
  public ParseResult getParse(Content content) {
    SyndFeed feed = null;
    ParseResult parseResult = new ParseResult(content.getUrl());


    EncodingDetector detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    String encoding = detector.guessEncoding(content, defaultEncoding);
    try {
      InputSource input = new InputSource(new ByteArrayInputStream(content
          .getContent()));
      input.setEncoding(encoding);
      SyndFeedInput feedInput = new SyndFeedInput();
      feed = feedInput.build(input);
    } catch (Exception e) {
      // return empty parse
      LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: "
          + StringUtils.stringifyException(e));
      return new ParseStatus(e)
          .getEmptyParseResult(content.getUrl(), getConf());
    }


    List entries = feed.getEntries();
    String feedLink = feed.getLink();
    try {
      feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
      if (feedLink != null)
        feedLink = filters.filter(feedLink);
    } catch (Exception e) {
      feedLink = null;
    }


    for (Iterator i = entries.iterator(); i.hasNext();) {
      SyndEntry entry = (SyndEntry) i.next();
      addToMap(parseResult, feed, feedLink, entry, content);
    }


    String feedDesc = stripTags(feed.getDescriptionEx());
    String feedTitle = stripTags(feed.getTitleEx());


    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
        new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
        content.getMetadata()));


    return parseResult;
  }

View Full Code Here

    parser.setConf(conf);
    File file = new File(name);
    byte[] bytes = new byte[(int) file.length()];
    DataInputStream in = new DataInputStream(new FileInputStream(file));
    in.readFully(bytes);
    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
        "application/rss+xml", new Metadata(), conf));
    for (Entry<Text, Parse> entry : parseResult) {
      System.out.println("key: " + entry.getKey());
      Parse parse = entry.getValue();
      System.out.println("data: " + parse.getData());

View Full Code Here

    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
    }


    ParseResult parseResult = new ParseUtil(conf).parse(content);


    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);


    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }

View Full Code Here

      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
          Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content
        .getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content
        .getUrl(), new ParseImpl(text, parseData));


    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content,
        parseResult, metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache
      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
        entry.getValue().getData().getParseMeta().set(
            Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);

View Full Code Here

    datum.setStatus(status);
    datum.setFetchTime(System.currentTimeMillis());
    if (pstatus != null)
      datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);


    ParseResult parseResult = null;
    if (content != null) {
      Metadata metadata = content.getMetadata();
      // add segment to metadata
      metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
      // add score to content metadata so that ParseSegment can pick it up.
      try {
        scfilters.passScoreBeforeParsing(key, datum, content);
      }
      catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
        }
      }


      try {


        // parse the content
        parseResult = this.parseUtil.parse(content);
      }
      catch (Exception e) {
        LOG.warn("Error parsing: " + key + ": "
          + StringUtils.stringifyException(e));
      }


      // set the content signature
      if (parseResult == null) {
        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
          content, new ParseStatus().getEmptyParse(getConf()));
        datum.setSignature(signature);
      }


      try {
        output.collect(key, new NutchWritable(datum));
        output.collect(key, new NutchWritable(content));


        if (parseResult != null) {
          for (Entry <Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();


            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }


            // Calculate page signature. 
            byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
              content, parse);
            // Ensure segment name and score are in parseData metadata
            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
              segmentName);
            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
              StringUtil.toHexString(signature));
            // Pass fetch time to content meta
            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
              Long.toString(datum.getFetchTime()));
            if (url.equals(key))
              datum.setSignature(signature);
            try {
              scfilters.passScoreAfterParsing(url, content, parse);
            }
            catch (Exception e) {
              if (LOG.isWarnEnabled()) {
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }
            output.collect(url, new NutchWritable(new ParseImpl(new ParseText(
              parse.getText()), parse.getData(), parse.isCanonical())));
          }
        }
      }
      catch (IOException e) {
        if (LOG.isErrorEnabled()) {
          LOG.error("ArcSegmentCreator caught:" + StringUtils.stringifyException(e));
        }
      }


      // return the parse status if it exists
      if (parseResult != null && !parseResult.isEmpty()) {
        Parse p = parseResult.get(content.getUrl());
        if (p != null) {
          return p.getData().getStatus();
        }
      }
    }

View Full Code Here

   */
  public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    ParseResult parseResult;


    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
      urlString = urlString.replace('\\', '/');


      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();


      parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);


      assertEquals(3, parseResult.size());


      boolean hasLink1 = false, hasLink2 = false, hasLink3=false;


      for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j
          .hasNext();) {
        Map.Entry<Text, Parse> entry = j.next();
        if (entry.getKey().toString().equals(
            "http://www-scf.usc.edu/~mattmann/")) {
          hasLink1 = true;

View Full Code Here

   *         were present in the feed file that this {@link Parser} dealt with.
   * 
   */
  public ParseResult getParse(Content content) {
    SyndFeed feed = null;
    ParseResult parseResult = new ParseResult(content.getUrl());


    EncodingDetector detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    String encoding = detector.guessEncoding(content, defaultEncoding);
    try {
      InputSource input = new InputSource(new ByteArrayInputStream(content
          .getContent()));
      input.setEncoding(encoding);
      SyndFeedInput feedInput = new SyndFeedInput();
      feed = feedInput.build(input);
    } catch (Exception e) {
      // return empty parse
      LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: "
          + StringUtils.stringifyException(e));
      return new ParseStatus(e)
          .getEmptyParseResult(content.getUrl(), getConf());
    }


    List entries = feed.getEntries();
    String feedLink = feed.getLink();
    try {
      feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
      if (feedLink != null)
        feedLink = filters.filter(feedLink);
    } catch (Exception e) {
      feedLink = null;
    }


    for (Iterator i = entries.iterator(); i.hasNext();) {
      SyndEntry entry = (SyndEntry) i.next();
      addToMap(parseResult, feed, feedLink, entry, content);
    }


    String feedDesc = stripTags(feed.getDescriptionEx());
    String feedTitle = stripTags(feed.getTitleEx());


    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
        new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
        content.getMetadata()));


    return parseResult;
  }

View Full Code Here

    parser.setConf(conf);
    File file = new File(name);
    byte[] bytes = new byte[(int) file.length()];
    DataInputStream in = new DataInputStream(new FileInputStream(file));
    in.readFully(bytes);
    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
        "application/rss+xml", new Metadata(), conf));
    for (Entry<Text, Parse> entry : parseResult) {
      System.out.println("key: " + entry.getKey());
      Parse parse = entry.getValue();
      System.out.println("data: " + parse.getData());

View Full Code Here

0 1 2 3

TOP

Related Classes of org.apache.nutch.parse.ParseResult

com.atlantbh.nutch.filter.xpath.XPathHtmlParserFilterTest

org.apache.hadoop.io.Text

org.apache.nutch.indexer.IndexingFiltersChecker

org.apache.nutch.parse.feed.FeedParser

org.apache.nutch.parse.feed.TestFeedParser

org.apache.nutch.parse.mp3.MP3Parser

org.apache.nutch.parse.tika.TikaParser

org.apache.nutch.tools.arc.ArcSegmentCreator

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.