Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.ParseResult


    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
    }

    ParseResult parseResult = new ParseUtil(conf).parse(content);

    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);

    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }
View Full Code Here


      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
          Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content
        .getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content
        .getUrl(), new ParseImpl(text, parseData));

    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content,
        parseResult, metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache
      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
        entry.getValue().getData().getParseMeta().set(
            Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
View Full Code Here

   *         were present in the feed file that this {@link Parser} dealt with.
   *
   */
  public ParseResult getParse(Content content) {
    SyndFeed feed = null;
    ParseResult parseResult = new ParseResult(content.getUrl());

    EncodingDetector detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    String encoding = detector.guessEncoding(content, defaultEncoding);
    try {
      InputSource input = new InputSource(new ByteArrayInputStream(content
          .getContent()));
      input.setEncoding(encoding);
      SyndFeedInput feedInput = new SyndFeedInput();
      feed = feedInput.build(input);
    } catch (Exception e) {
      // return empty parse
      LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: "
          + StringUtils.stringifyException(e));
      return new ParseStatus(e)
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    List entries = feed.getEntries();
    String feedLink = feed.getLink();
    try {
      feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
      if (feedLink != null)
        feedLink = filters.filter(feedLink);
    } catch (Exception e) {
      feedLink = null;
    }

    for (Iterator i = entries.iterator(); i.hasNext();) {
      SyndEntry entry = (SyndEntry) i.next();
      addToMap(parseResult, feed, feedLink, entry, content);
    }

    String feedDesc = stripTags(feed.getDescriptionEx());
    String feedTitle = stripTags(feed.getTitleEx());

    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
        new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
        content.getMetadata()));

    return parseResult;
  }
View Full Code Here

    parser.setConf(conf);
    File file = new File(name);
    byte[] bytes = new byte[(int) file.length()];
    DataInputStream in = new DataInputStream(new FileInputStream(file));
    in.readFully(bytes);
    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
        "application/rss+xml", new Metadata(), conf));
    for (Entry<Text, Parse> entry : parseResult) {
      System.out.println("key: " + entry.getKey());
      Parse parse = entry.getValue();
      System.out.println("data: " + parse.getData());
View Full Code Here

      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
          Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content
        .getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content
        .getUrl(), new ParseImpl(text, parseData));

    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content,
        parseResult, metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache
      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
        entry.getValue().getData().getParseMeta().set(
            Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
View Full Code Here

   */
  public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    ParseResult parseResult;

    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
      urlString = urlString.replace('\\', '/');

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();

      parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);

      assertEquals(3, parseResult.size());

      boolean hasLink1 = false, hasLink2 = false, hasLink3=false;

      for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j
          .hasNext();) {
        Map.Entry<Text, Parse> entry = j.next();
        if (entry.getKey().toString().equals(
            "http://www-scf.usc.edu/~mattmann/")) {
          hasLink1 = true;
View Full Code Here

    datum.setStatus(status);
    datum.setFetchTime(System.currentTimeMillis());
    if (pstatus != null)
      datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

    ParseResult parseResult = null;
    if (content != null) {
      Metadata metadata = content.getMetadata();
      // add segment to metadata
      metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
      // add score to content metadata so that ParseSegment can pick it up.
      try {
        scfilters.passScoreBeforeParsing(key, datum, content);
      }
      catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          e.printStackTrace(LogUtil.getWarnStream(LOG));
          LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
        }
      }

      try {

        // parse the content
        parseResult = this.parseUtil.parse(content);
      }
      catch (Exception e) {
        LOG.warn("Error parsing: " + key + ": "
          + StringUtils.stringifyException(e));
      }

      // set the content signature
      if (parseResult == null) {
        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
          content, new ParseStatus().getEmptyParse(getConf()));
        datum.setSignature(signature);
      }

      try {
        output.collect(key, new NutchWritable(datum));
        output.collect(key, new NutchWritable(content));

        if (parseResult != null) {
          for (Entry <Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();

            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }

            // Calculate page signature.
            byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
              content, parse);
            // Ensure segment name and score are in parseData metadata
            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
              segmentName);
            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
              StringUtil.toHexString(signature));
            // Pass fetch time to content meta
            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
              Long.toString(datum.getFetchTime()));
            if (url.equals(key))
              datum.setSignature(signature);
            try {
              scfilters.passScoreAfterParsing(url, content, parse);
            }
            catch (Exception e) {
              if (LOG.isWarnEnabled()) {
                e.printStackTrace(LogUtil.getWarnStream(LOG));
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }
            output.collect(url, new NutchWritable(new ParseImpl(new ParseText(
              parse.getText()), parse.getData(), parse.isCanonical())));
          }
        }
      }
      catch (IOException e) {
        if (LOG.isFatalEnabled()) {
          LOG.fatal("ArcSegmentCreator caught:" + StringUtils.stringifyException(e));
        }
      }

      // return parse status if it exits
      if (parseResult != null && !parseResult.isEmpty()) {
        Parse p = parseResult.get(content.getUrl());
        if (p != null) {
          return p.getData().getStatus();
        }
      }
    }
View Full Code Here

    xmlInputStream.read(rawXmlContent);
   
    // Mock objects
    Content content = PowerMockito.mock(Content.class)
    Parse parse = mock(Parse.class);
    ParseResult parseResult = mock(ParseResult.class);
    ParseData parseData = PowerMockito.mock(ParseData.class)
    Configuration configuration = mock(Configuration.class);
   
    // Mock data
    when(content.getContent()).thenReturn(rawXmlContent);
    when(content.getContentType()).thenReturn("application/xml");
    when(content.getUrl()).thenReturn("http://www.test.com/");
    when(parseResult.get(anyString())).thenReturn(parse);
    when(parse.getData()).thenReturn(parseData);
    when(parseData.getParseMeta()).thenReturn(new Metadata());
    when(configuration.get(anyString())).thenReturn("");
   
    when(configuration.getConfResourceAsReader(anyString())).thenReturn(new InputStreamReader(XPathIndexingFilterTest.class.getResourceAsStream("example-xpathfilter-conf.xml")));
   
    xmlHtmlParser.setConf(configuration);
    ParseResult parseResultReturn = xmlHtmlParser.filter(content, parseResult, null, null);
   
    int stringValueIndexCount = 0;
    int floatValueIndexCount = 0;
   
    Metadata metadata = parseResultReturn.get("http://www.test.com/").getData().getParseMeta();
    for(String stringValue : metadata.getValues("testString")) {
      int index = Arrays.binarySearch(testStringArray, stringValue);
      stringValueIndexCount += index;
      assertTrue("String value not found!", stringValueIndexCount >= 0);
    }
View Full Code Here

    htmlInputStream.read(rawHtmlContent);
   
    // Mock objects
    Content content = PowerMockito.mock(Content.class)
    Parse parse = mock(Parse.class);
    ParseResult parseResult = mock(ParseResult.class);
    ParseData parseData = PowerMockito.mock(ParseData.class)
    Configuration configuration = mock(Configuration.class);
   
    // Mock data
    when(content.getContent()).thenReturn(rawHtmlContent);
    when(content.getContentType()).thenReturn("text/html");
    when(content.getUrl()).thenReturn("http://www.test.com/");
    when(parseResult.get(anyString())).thenReturn(parse);
    when(parse.getData()).thenReturn(parseData);
    when(parseData.getParseMeta()).thenReturn(new Metadata());
    when(configuration.get(anyString())).thenReturn("");
    when(configuration.get("parser.character.encoding.default", "UTF-8")).thenReturn("UTF-8");
    when(configuration.getConfResourceAsReader(anyString())).thenReturn(new InputStreamReader(XPathIndexingFilterTest.class.getResourceAsStream("example-xpathfilter-conf2.xml")));
   
    xmlHtmlParser.setConf(configuration);
    ParseResult parseResultReturn = xmlHtmlParser.filter(content, parseResult, null, null);
    Metadata metadata = parseResultReturn.get("http://www.test.com/").getData().getParseMeta();
   
    assertEquals("Error parsing html", "Samir ELJAZOVIĆ", metadata.getValues("articleAuthor")[0]);
    assertEquals("Error parsing html", "Amazon Elastic MapReduce – Part 2 (Amazon S3 Input Format)", metadata.getValues("articleTitle")[0]);
  }
View Full Code Here

      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
          Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content
        .getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content
        .getUrl(), new ParseImpl(text, parseData));

    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content,
        parseResult, metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache
      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
        entry.getValue().getData().getParseMeta().set(
            Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.ParseResult

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.