Examples of org.apache.nutch.indexer.NutchDocument

org.apache.nutch.indexer.NutchDocument
A {@link NutchDocument} is the unit of indexing.

    Configuration conf = NutchConfiguration.create();
    conf.setBoolean("anchorIndexingFilter.deduplicate", true);
    AnchorIndexingFilter filter = new AnchorIndexingFilter();
    filter.setConf(conf);
    Assert.assertNotNull(filter);
    NutchDocument doc = new NutchDocument();
    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
    Inlinks inlinks = new Inlinks();
    inlinks.add(new Inlink("http://test1.com/", "text1"));
    inlinks.add(new Inlink("http://test2.com/", "text2"));
    inlinks.add(new Inlink("http://test3.com/", "text2"));
    try {
      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), inlinks);
    } catch(Exception e){
      e.printStackTrace();
      Assert.fail(e.getMessage());
    }
    Assert.assertNotNull(doc);
    Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor"));
    Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor").getValues().size());
  }

View Full Code Here

  public void testEmptyIndexStatic() throws Exception {


    Assert.assertNotNull(filter);
    filter.setConf(conf);


    NutchDocument doc = new NutchDocument();


    try {
      filter.filter(doc, parse, url, crawlDatum, inlinks);
    } catch (Exception e) {
      e.printStackTrace();
      Assert.fail(e.getMessage());
    }


    Assert.assertNotNull(doc);
    Assert.assertTrue("tests if no field is set for empty index.static", doc.getFieldNames().isEmpty());
  }

View Full Code Here

    conf.set("index.static",
        "field1:val1, field2    :      val2 val3     , field3, field4 :val4 , ");
    Assert.assertNotNull(filter);
    filter.setConf(conf);


    NutchDocument doc = new NutchDocument();


    try {
      filter.filter(doc, parse, url, crawlDatum, inlinks);
    } catch (Exception e) {
      e.printStackTrace();
      Assert.fail(e.getMessage());
    }


    Assert.assertNotNull(doc);
    Assert.assertFalse("test if doc is not empty", doc.getFieldNames().isEmpty());
    Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames().size());
    Assert.assertTrue("test if doc has field1", doc.getField("field1").getValues()
        .contains("val1"));
    Assert.assertTrue("test if doc has field2", doc.getField("field2").getValues()
        .contains("val2"));
    Assert.assertTrue("test if doc has field4", doc.getField("field4").getValues()
        .contains("val4"));
  }

View Full Code Here


    BasicIndexingFilter filter = new BasicIndexingFilter();
    filter.setConf(conf);
    Assert.assertNotNull(filter);


    NutchDocument doc = new NutchDocument();


    String title = "The Foo Page";
    Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
    Metadata metaData = new Metadata();
    metaData.add("Language", "en/us");
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
    ParseImpl parse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);


    CrawlDatum crawlDatum = new CrawlDatum();
    crawlDatum.setFetchTime(100L);


    Inlinks inlinks = new Inlinks();


    try {
      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);
    } catch(Exception e){
      e.printStackTrace();
      Assert.fail(e.getMessage());
    }
    Assert.assertNotNull(doc);
    Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc.getField("title").getValues().get(0));
    Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc.getField("domain").getValues().get(0));
    Assert.assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org", doc.getField("host").getValues().get(0));
    Assert.assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html", 
      doc.getField("url").getValues().get(0));
    Assert.assertEquals("test content", "this is a sample foo", doc.getField("content").getValues().get(0));
    Assert.assertEquals("test fetch time", new Date(100L), (Date)doc.getField("tstamp").getValues().get(0));
  }

View Full Code Here

    this.action = action;
  }


  public void readFields(DataInput in) throws IOException {
    action = in.readByte();
    doc = new NutchDocument();
    doc.readFields(in);
  }

View Full Code Here

    Configuration conf = NutchConfiguration.create();
    conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    Assert.assertNotNull(filter);
    NutchDocument doc = new NutchDocument();
    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
    
    try{
        filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), new Inlinks());
    }
    catch(Exception e){
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
    Assert.assertNotNull(doc);
    Assert.assertTrue(doc.getFieldNames().contains("type"));
    Assert.assertEquals(1, doc.getField("type").getValues().size());
    Assert.assertEquals("text/html", doc.getFieldValue("type"));    
  }

View Full Code Here

    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);


    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
      new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());


    Assert.assertEquals("content-disposition not detected", "filename.ext", doc.getFieldValue("title"));
  }

View Full Code Here

  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
    Assert.assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }

View Full Code Here

  Configuration conf = NutchConfiguration.create();
  conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
  MoreIndexingFilter filter = new MoreIndexingFilter();
  filter.setConf(conf);
  assertNotNull(filter);
  NutchDocument doc = new NutchDocument();
        ParseImpl parse = new ParseImpl("foo bar", new ParseData());
        
  try{
      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), new Inlinks());
  }
        catch(Exception e){
      e.printStackTrace();
      fail(e.getMessage());
  }
  assertNotNull(doc);
  assertTrue(doc.getFieldNames().contains("type"));
  assertEquals(1, doc.getField("type").getValues().size());
  assertEquals("text/html", doc.getFieldValue("type"));     
    }

View Full Code Here

  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }

View Full Code Here

0 1 2 3 4

TOP

Related Classes of org.apache.nutch.indexer.NutchDocument

com.atlantbh.nutch.filter.index.omit.OmitIndexingFilterTest

com.atlantbh.nutch.filter.xpath.XPathIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.AlternativeDataFlowIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.flow.CsvDataFlowTest

org.apache.nutch.indexer.anchor.TestAnchorIndexingFilter

org.apache.nutch.indexer.basic.TestBasicIndexingFilter

org.apache.nutch.indexer.IndexingFiltersChecker

org.apache.nutch.indexer.more.TestMoreIndexingFilter

org.apache.nutch.indexer.NutchIndexAction

org.apache.nutch.indexer.staticfield.TestStaticFieldIndexerTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.