Examples of org.apache.nutch.parse.ParseData

org.apache.nutch.parse.ParseData
Data extracted from a page's content. @see Parse#getData()

    add(urlStr, doc, "encoding", parse.getData().getMeta(ENCODING_KEY),
      false, true, true, false, false);


    // Get metadata.
    MapWritable mw = datum.getMetaData();
    ParseData pd = parse.getData();


    // Add as stored, indexed, and untokenized but not lowercased.
    add(urlStr, doc, ARCCOLLECTION_KEY,
      getMetadataValue(ARCCOLLECTION_KEY, pd, mw),
      false, true, true, false);

View Full Code Here

    }


    // collect outlink
    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());


    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
      outlinks, content.getMetadata());
    parseData.setConf(this.conf);
    
    return new ParseImpl(text, parseData);
  }

View Full Code Here

    }


    if (text == null) { text = ""; }
    if (title == null) { title = ""; }


    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata(),
                                        metadata);
    parseData.setConf(this.conf);
    return new ParseImpl(text, parseData);
  }

View Full Code Here

    String class1 = "NonExistingFilter";
    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);


    IndexingFilters filters = new IndexingFilters(conf);
    filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
      new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
      "http://www.example.com/"), new CrawlDatum(), new Inlinks());
  }

View Full Code Here

    Configuration conf = NutchConfiguration.create();
    conf.addResource("nutch-default.xml");
    conf.addResource("crawl-tests.xml");


    IndexingFilters filters = new IndexingFilters(conf);
    NutchDocument doc = filters.filter(null, new ParseImpl("text", new ParseData(
      new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
      "http://www.example.com/"), new CrawlDatum(), new Inlinks());
     
    Assert.assertNull(doc);
  }

View Full Code Here


    String class1 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1);


    IndexingFilters filters1 = new IndexingFilters(conf);
    NutchDocument fdoc1 = filters1.filter(new NutchDocument(),new ParseImpl("text",new ParseData(
      new ParseStatus(),"title",new Outlink[0],new Metadata())),new Text("http://www.example.com/"),
      new CrawlDatum(),new Inlinks());


    // add another index filter
    String class2 = "org.apache.nutch.indexer.metadata.MetadataIndexer";
    // set content metadata
    Metadata md = new Metadata();
    md.add("example","data");
    // set content metadata property defined in MetadataIndexer
    conf.set("index.content.md","example");
    // add MetadataIndexer filter
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
    IndexingFilters filters2 = new IndexingFilters(conf);
    NutchDocument fdoc2 = filters2.filter(new NutchDocument(),new ParseImpl("text",new ParseData(
      new ParseStatus(),"title",new Outlink[0],md)),new Text("http://www.example.com/"),
      new CrawlDatum(),new Inlinks());
    Assert.assertEquals(fdoc1.getFieldNames().size(),fdoc2.getFieldNames().size());
  }

View Full Code Here

    reader = new SequenceFile.Reader(fs, parseData, conf);
    
    READ_PARSE_DATA:
      do {
      Text key = new Text();
      ParseData value = new ParseData();
      if(!reader.next(key, value)) break READ_PARSE_DATA;
      // make sure they all contain "nutch.segment.name" and "nutch.content.digest"
      // keys in the content metadata of the parse data
      Metadata contentMeta = value.getContentMeta();
      if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null 
            && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
        handledurls.add(key.toString());
      }
    } while(true);

View Full Code Here

    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
          Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content
        .getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content
        .getUrl(), new ParseImpl(text, parseData));


    // run filters on parse

View Full Code Here

    conf.setBoolean("anchorIndexingFilter.deduplicate", true);
    AnchorIndexingFilter filter = new AnchorIndexingFilter();
    filter.setConf(conf);
    Assert.assertNotNull(filter);
    NutchDocument doc = new NutchDocument();
    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
    Inlinks inlinks = new Inlinks();
    inlinks.add(new Inlink("http://test1.com/", "text1"));
    inlinks.add(new Inlink("http://test2.com/", "text2"));
    inlinks.add(new Inlink("http://test3.com/", "text2"));
    try {

View Full Code Here

    }


    String feedDesc = stripTags(feed.getDescriptionEx());
    String feedTitle = stripTags(feed.getTitleEx());


    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
        new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
        content.getMetadata()));


    return parseResult;
  }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.nutch.parse.ParseData

com.atlantbh.nutch.filter.index.omit.OmitIndexingFilterTest

com.atlantbh.nutch.filter.xpath.XPathHtmlParserFilterTest

com.atlantbh.nutch.filter.xpath.XPathIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.AlternativeDataFlowIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.flow.CsvDataFlowTest

org.apache.commons.cli.Options

org.apache.hadoop.conf.Configuration

org.apache.hadoop.fs.FileSystem

org.apache.hadoop.util.GenericOptionsParser

org.apache.nutch.fetcher.TestFetcher

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.