Package org.apache.nutch.indexer

Examples of org.apache.nutch.indexer.NutchDocument


  public void testEmptyIndexStatic() throws Exception {

    assertNotNull(filter);
    filter.setConf(conf);

    NutchDocument doc = new NutchDocument();

    try {
      filter.filter(doc, parse, url, crawlDatum, inlinks);
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }

    assertNotNull(doc);
    assertTrue("tests if no field is set for empty index.static", doc.getFieldNames().isEmpty());
  }
View Full Code Here


    conf.set("index.static",
        "field1:val1, field2    :      val2 val3     , field3, field4 :val4 , ");
    assertNotNull(filter);
    filter.setConf(conf);

    NutchDocument doc = new NutchDocument();

    try {
      filter.filter(doc, parse, url, crawlDatum, inlinks);
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }

    assertNotNull(doc);
    assertFalse("test if doc is not empty", doc.getFieldNames().isEmpty());
    assertEquals("test if doc has 3 fields", 3, doc.getFieldNames().size());
    assertTrue("test if doc has field1", doc.getField("field1").getValues()
        .contains("val1"));
    assertTrue("test if doc has field2", doc.getField("field2").getValues()
        .contains("val2"));
    assertTrue("test if doc has field4", doc.getField("field4").getValues()
        .contains("val4"));
  }
View Full Code Here

  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }
View Full Code Here

      LOG.info("contentType: " + contentType);
    }

    ParseResult parseResult = new ParseUtil(conf).parse(content);
   
    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);

    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }

    for (String fname : doc.getFieldNames()) {
      List<Object> values = doc.getField(fname).getValues();
      if (values != null) {
        for (Object value : values) {
          String str = value.toString();
          int minText = Math.min(100, str.length());
          System.out.println(fname + " :\t" + str.substring(0, minText));
View Full Code Here

  public void testRelTagFields() throws Exception {
    Configuration conf = NutchConfiguration.create();
    RelTagIndexingFilter filter = new RelTagIndexingFilter();
    filter.setConf(conf);
    assertNotNull(filter);
    NutchDocument doc = new NutchDocument();
    WebPage page = new WebPage();
    byte[] bytes = new byte[10];
    ByteBuffer bbuf = ByteBuffer.wrap(bytes);
    page.putToMetadata(new Utf8(RelTagParser.REL_TAG), bbuf);
    try {
      filter.filter(doc, "http://nutch.apache.org/", page);
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }
    assertNotNull(doc);
    assertTrue("check for 'tag' field", doc.getFieldNames().contains("tag"));
  }
View Full Code Here

  public void testBasicFields() throws Exception {
  Configuration conf = NutchConfiguration.create();
  BasicIndexingFilter filter = new BasicIndexingFilter();
  filter.setConf(conf);
  assertNotNull(filter);
  NutchDocument doc = new NutchDocument();
  WebPage page = new WebPage();
  page.putToInlinks(new Utf8("http://nutch.apache.org/"), new Utf8("Welcome to Nutch"));
  page.setTitle(new Utf8("Welcome to Nutch"));
    page.setReprUrl(new Utf8("http://www.urldoesnotmatter.org"));
    byte[] bytes = new byte[10];
    ByteBuffer bbuf = ByteBuffer.wrap(bytes);
    page.putToMetadata(Nutch.CACHING_FORBIDDEN_KEY_UTF8, bbuf);
    page.setFetchTime(System.currentTimeMillis());
  try {
    filter.filter(doc, "http://www.apache.org/", page);
  } catch(Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }
  assertNotNull(doc);
  assertTrue("check for host field ", doc.getFieldNames().contains("host"));
  assertTrue("check for site field", doc.getFieldNames().contains("site"));
  assertTrue("check for url field", doc.getFieldNames().contains("url"));
  assertTrue("check for orig field", doc.getFieldNames().contains("orig"));
  assertTrue("check for content field", doc.getFieldNames().contains("content"));
  assertTrue("check for title field", doc.getFieldNames().contains("title"));
  assertTrue("check for cache field", doc.getFieldNames().contains("cache"));
  assertTrue("check for tstamp field", doc.getFieldNames().contains("tstamp"));
  }
View Full Code Here

  Configuration conf = NutchConfiguration.create();
  conf.setInt("indexer.max.title.length", 10);
  BasicIndexingFilter filter = new BasicIndexingFilter();
  filter.setConf(conf);
  assertNotNull(filter);
  NutchDocument doc = new NutchDocument();
  WebPage page = new WebPage();
  page.putToInlinks(new Utf8("http://exceedmaximumtitleurl.org/"), new Utf8("exceeding title site"));
  page.setTitle(new Utf8("This title exceeds maximum characters"));
  try {
    filter.filter(doc, "http://www.apache.org/", page);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }
  assertNotNull(doc);
  assertEquals("assert title field only has 10 characters", 10, doc.getFieldValue("title").length());
  }
View Full Code Here

  public void testDeduplicateAnchor() throws Exception {
    Configuration conf = NutchConfiguration.create();
    conf.setBoolean("anchorIndexingFilter.deduplicate", true);
    AnchorIndexingFilter filter = new AnchorIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = new NutchDocument();
    WebPage page = new WebPage();
    page.putToInlinks(new Utf8("http://example1.com/"), new Utf8("cool site"));
    page.putToInlinks(new Utf8("http://example2.com/"), new Utf8("cool site"));
    page.putToInlinks(new Utf8("http://example3.com/"), new Utf8("fun site"));
    filter.filter(doc, "http://myurldoesnotmatter.com/", page);
   
    assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor"));
   
    assertEquals("test dedup, we expect 2", 2, doc.getFieldValues("anchor").size());
  }
View Full Code Here

      LOG.info("contentType: " + contentType);
    }

    ParseResult parseResult = new ParseUtil(conf).parse(content);

    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);

    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }

    if (doc == null) {
      System.out.println("Document discarded by indexing filter");
      return 0;
    }
   
    for (String fname : doc.getFieldNames()) {
      List<Object> values = doc.getField(fname).getValues();
      if (values != null) {
        for (Object value : values) {
          String str = value.toString();
          int minText = Math.min(100, str.length());
          System.out.println(fname + " :\t" + str.substring(0, minText));
View Full Code Here

    BasicIndexingFilter filter = new BasicIndexingFilter();
    filter.setConf(conf);
    assertNotNull(filter);

    NutchDocument doc = new NutchDocument();

    String title = "The Foo Page";
    Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
    Metadata metaData = new Metadata();
    metaData.add("Language", "en/us");
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
    ParseImpl parse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);

    CrawlDatum crawlDatum = new CrawlDatum();
    crawlDatum.setFetchTime(100L);

    Inlinks inlinks = new Inlinks();

    try {
      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);
    } catch(Exception e){
      e.printStackTrace();
      fail(e.getMessage());
    }
    assertNotNull(doc);
    assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc.getField("title").getValues().get(0));
    assertEquals("test domain, expect \"apache.org\"", "apache.org", doc.getField("domain").getValues().get(0));
    assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org", doc.getField("host").getValues().get(0));
    assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html",
      doc.getField("url").getValues().get(0));
    assertEquals("test content", "this is a sample foo", doc.getField("content").getValues().get(0));
    assertEquals("test fetch time", new Date(100L), (Date)doc.getField("tstamp").getValues().get(0));
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.indexer.NutchDocument

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.