Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.Outlink


          + "'");

    }
    answerOutlinks = new Outlink[][] {
        // 0
        { new Outlink("http://www.nutch.org", "anchor"), },
        // 1
        {
          new Outlink("http://www.nutch.org/", "home"),
          new Outlink("http://www.nutch.org/docs/bot.html",
                "bots"), },
        // 2
        {
          new Outlink("http://www.nutch.org/", "separate this"),
          new Outlink("http://www.nutch.org/docs/ok", "from this"), },
       
        // 3 
        {   new Outlink("http://www.nutch.org/", "home"),
          new Outlink("http://www.nutch.org/docs/1", "1"),
          new Outlink("http://www.nutch.org/docs/2", "2"), },
        // 4 
        {
          new Outlink("http://www.nutch.org/frames/top.html", ""),
          new Outlink("http://www.nutch.org/frames/left.html", ""),
          new Outlink("http://www.nutch.org/frames/invalid.html",""),
          new Outlink("http://www.nutch.org/frames/right.html",""),
        },
        // 5
        {
          new Outlink("http://www.nutch.org/maps/logo.gif", ""),
          new Outlink("http://www.nutch.org/index.html", ""),
          new Outlink("http://www.nutch.org/maps/#bottom", ""),
          new Outlink("http://www.nutch.org/bot.html", ""),
          new Outlink("http://www.nutch.org/docs/index.html", "")
        },
        // 6
        { new Outlink("http://www.nutch.org/index.html",
            "whitespace test"),
        },
        // 7
        {},
        // 8
        { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
        // 9
        {},
        // 10
        {
         new Outlink("http://www.nutch.org/;x", "anchor1"),
         new Outlink("http://www.nutch.org/g;x", "anchor2"),
         new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
        },
        // 11
        {
         new Outlink("http://www.nutch.org/g;something","anchor1"),
         new Outlink("http://www.nutch.org/g;something?y#s", "anchor2"),
         new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
         new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
         new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5") }
        };

  }
View Full Code Here


      testDOMs[i]= node;
    }
    try {
     answerOutlinks = new Outlink[][]{
         {
           new Outlink("http://www.nutch.org", "anchor"),
         },
         {
           new Outlink("http://www.nutch.org/", "home"),
           new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
         },
         {
           new Outlink("http://www.nutch.org/", "separate this"),
           new Outlink("http://www.nutch.org/docs/ok", "from this"),
         },
         {
           new Outlink("http://www.nutch.org/", "home"),
           new Outlink("http://www.nutch.org/docs/1", "1"),
           new Outlink("http://www.nutch.org/docs/2", "2"),
         },
         {
           new Outlink("http://www.nutch.org/frames/top.html", ""),
           new Outlink("http://www.nutch.org/frames/left.html", ""),
           new Outlink("http://www.nutch.org/frames/invalid.html", ""),
           new Outlink("http://www.nutch.org/frames/right.html", ""),
         },
         {
           new Outlink("http://www.nutch.org/maps/logo.gif", ""),
           new Outlink("http://www.nutch.org/index.html", ""),
           new Outlink("http://www.nutch.org/maps/#bottom", ""),
           new Outlink("http://www.nutch.org/bot.html", ""),
           new Outlink("http://www.nutch.org/docs/index.html", ""),
         },
         {
             new Outlink("http://www.nutch.org/index.html", "whitespace test"),
         },
         {
         },
         {
           new Outlink("http://www.nutch.org/dummy.jsp", "test2"),
         },
         {
         },
         {
           new Outlink("http://www.nutch.org/;x", "anchor1"),
           new Outlink("http://www.nutch.org/g;x", "anchor2"),
           new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
         },
         {
           // this is tricky - see RFC3986 section 5.4.1 example 7
           new Outlink("http://www.nutch.org/g", "anchor1"),
           new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
           new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
           new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
           new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
         },
         {
           new Outlink("http://www.nutch.org/g", ""),
           new Outlink("http://www.nutch.org/g1", ""),
           new Outlink("http://www.nutch.org/g2", "bla bla"),
           new Outlink("http://www.nutch.org/test.gif", "bla bla"),
         }
      };
  
    } catch (MalformedURLException e) {
       
View Full Code Here

        assertTrue("caught exception: " + e, false);
      }
      testDOMs[i] = node;
    }
    answerOutlinks = new Outlink[][] {
        { new Outlink("http://www.nutch.org", "anchor"), },
        { new Outlink("http://www.nutch.org/", "home"),
            new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
        { new Outlink("http://www.nutch.org/", "separate this"),
            new Outlink("http://www.nutch.org/docs/ok", "from this"), },
        { new Outlink("http://www.nutch.org/", "home"),
            new Outlink("http://www.nutch.org/docs/1", "1"),
            new Outlink("http://www.nutch.org/docs/2", "2"), },
        { new Outlink("http://www.nutch.org/frames/top.html", ""),
            new Outlink("http://www.nutch.org/frames/left.html", ""),
            new Outlink("http://www.nutch.org/frames/invalid.html", ""),
            new Outlink("http://www.nutch.org/frames/right.html", ""), },
        { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
            new Outlink("http://www.nutch.org/index.html", ""),
            new Outlink("http://www.nutch.org/maps/#bottom", ""),
            new Outlink("http://www.nutch.org/bot.html", ""),
            new Outlink("http://www.nutch.org/docs/index.html", ""), },
        { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
        {},
        { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
        {},
        { new Outlink("http://www.nutch.org/;x", "anchor1"),
            new Outlink("http://www.nutch.org/g;x", "anchor2"),
            new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
        {
            // this is tricky - see RFC3986 section 5.4.1 example 7
            new Outlink("http://www.nutch.org/g", "anchor1"),
            new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
            new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
            new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
            new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
                "anchor5") } };

  }
View Full Code Here

        // Maps each normalized target URL to the anchor recorded for it.
        // LinkedHashMap keeps the entries in first-insertion order.
        Map<String, String> outlinkMap = new LinkedHashMap<String, String>();

        // Normalize every outlink URL, drop the ones rejected by the filter,
        // and collect the survivors (de-duplicated by URL) into the map.
        if (outlinkAr != null && outlinkAr.length > 0) {
          for (int i = 0; i < outlinkAr.length; i++) {
            Outlink outlink = outlinkAr[i];
            String toUrl = normalizeUrl(outlink.getToUrl());

            // A null result from filterUrl means this link is skipped.
            // NOTE(review): filterUrl is called before the toUrl != null check
            // below, so it presumably tolerates a null argument - confirm
            // against the filterUrl/normalizeUrl implementations.
            if (filterUrl(toUrl) == null) {
              continue;
            }

            // Store the anchor when the URL is not in the map yet, or when it
            // is already present but its stored anchor is null. An existing
            // non-null anchor is never overwritten, so the first non-null
            // anchor seen for a URL is the one that is kept.
            boolean existingUrl = outlinkMap.containsKey(toUrl);
            if (toUrl != null
                && (!existingUrl || (existingUrl && outlinkMap.get(toUrl) == null))) {
              outlinkMap.put(toUrl, outlink.getAnchor());
            }
          }
        }

        // collect the outlinks under the fetch time
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.Outlink

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.