Examples of org.apache.nutch.parse.ParseImpl

org.apache.nutch.parse.ParseImpl
The result of parsing a page's raw content. @see Parser#getParse(Content)

          Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content
        .getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content
        .getUrl(), new ParseImpl(text, parseData));


    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content,
        parseResult, metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache

View Full Code Here

    conf.setBoolean("anchorIndexingFilter.deduplicate", true);
    AnchorIndexingFilter filter = new AnchorIndexingFilter();
    filter.setConf(conf);
    Assert.assertNotNull(filter);
    NutchDocument doc = new NutchDocument();
    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
    Inlinks inlinks = new Inlinks();
    inlinks.add(new Inlink("http://test1.com/", "text1"));
    inlinks.add(new Inlink("http://test2.com/", "text2"));
    inlinks.add(new Inlink("http://test3.com/", "text2"));
    try {

View Full Code Here

    final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
                                              resultTitle, outlinks,
                                              content.getMetadata());


    if (LOG.isTraceEnabled()) { LOG.trace("Zip file parsed sucessfully !!"); }
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(resultText, parseData));
  }

View Full Code Here

  StaticFieldIndexer filter;


  @Before
  public void setUp() throws Exception {
    conf = NutchConfiguration.create();
    parse = new ParseImpl();
    url = new Text("http://nutch.apache.org/index.html");
    crawlDatum = new CrawlDatum();
    inlinks = new Inlinks();
    filter = new StaticFieldIndexer();
  }

View Full Code Here

    String title = "The Foo Page";
    Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
    Metadata metaData = new Metadata();
    metaData.add("Language", "en/us");
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
    ParseImpl parse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);


    CrawlDatum crawlDatum = new CrawlDatum();
    crawlDatum.setFetchTime(100L);


    Inlinks inlinks = new Inlinks();

View Full Code Here

    doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));


    // add digest, used by dedup
    doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));


    final Parse parse = new ParseImpl(parseText, parseData);
    try {
      // extract information from dbDatum and pass it to
      // fetchDatum so that indexing filters can use it
      final Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
      if (url != null) {

View Full Code Here

    Configuration conf = NutchConfiguration.create();
    conf.addResource("nutch-default.xml");
    conf.addResource("crawl-tests.xml");


    IndexingFilters filters = new IndexingFilters(conf);
    NutchDocument doc = filters.filter(null, new ParseImpl("text", new ParseData(
      new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
      "http://www.example.com/"), new CrawlDatum(), new Inlinks());
     
    Assert.assertNull(doc);
  }

View Full Code Here


    String class1 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1);


    IndexingFilters filters1 = new IndexingFilters(conf);
    NutchDocument fdoc1 = filters1.filter(new NutchDocument(),new ParseImpl("text",new ParseData(
      new ParseStatus(),"title",new Outlink[0],new Metadata())),new Text("http://www.example.com/"),
      new CrawlDatum(),new Inlinks());


    // add another index filter
    String class2 = "org.apache.nutch.indexer.metadata.MetadataIndexer";
    // set content metadata
    Metadata md = new Metadata();
    md.add("example","data");
    // set content metadata property defined in MetadataIndexer
    conf.set("index.content.md","example");
    // add MetadataIndxer filter
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
    IndexingFilters filters2 = new IndexingFilters(conf);
    NutchDocument fdoc2 = filters2.filter(new NutchDocument(),new ParseImpl("text",new ParseData(
      new ParseStatus(),"title",new Outlink[0],md)),new Text("http://www.example.com/"),
      new CrawlDatum(),new Inlinks());
    Assert.assertEquals(fdoc1.getFieldNames().size(),fdoc2.getFieldNames().size());
  }

View Full Code Here

    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());


    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata());
    return ParseResult.createParseResult(content.getUrl(), 
                                         new ParseImpl(text, parseData));
  }

View Full Code Here

    conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    Assert.assertNotNull(filter);
    NutchDocument doc = new NutchDocument();
    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
    
    try{
        filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), new Inlinks());
    }
    catch(Exception e){

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.nutch.parse.ParseImpl

org.apache.nutch.crawl.TextProfileSignature

org.apache.nutch.indexer.anchor.TestAnchorIndexingFilter

org.apache.nutch.indexer.basic.TestBasicIndexingFilter

org.apache.nutch.indexer.field.BasicFields$Extractor

org.apache.nutch.indexer.IndexerMapReduce

org.apache.nutch.indexer.more.TestMoreIndexingFilter

org.apache.nutch.indexer.staticfield.TestStaticFieldIndexerTest

org.apache.nutch.indexer.TestIndexingFilters

org.apache.nutch.parse.ext.ExtParser

org.apache.nutch.parse.ext.WaxExtParser

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.