// Package: org.apache.nutch.protocol.http.api
// Source code of org.apache.nutch.protocol.http.api.TestRobotRulesParser

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.protocol.http.api;

import org.apache.nutch.protocol.http.api.RobotRulesParser.RobotRuleSet;

import junit.framework.TestCase;

public class TestRobotRulesParser extends TestCase {
  private static final String LF= "\n";
  private static final String CR= "\r";
  private static final String CRLF= "\r\n";
 
  private static final boolean[] ACCEPT_ALL = {
    true,   // "/a",       
    true,   // "/a/",       
    true,   // "/a/bloh/foo.html"
    true,   // "/b",       
    true,   // "/b/a",       
    true,   // "/b/a/index.html",
    true,   // "/b/b/foo.html", 
    true,   // "/c",       
    true,   // "/c/a",       
    true,   // "/c/a/index.html",
    true,   // "/c/b/foo.html", 
    true,   // "/d",       
    true,   // "/d/a",       
    true,   // "/e/a/index.html",
    true,   // "/e/d",       
    true,   // "/e/d/foo.html", 
    true,   // "/e/doh.html",   
    true,   // "/f/index.html", 
    true,   // "/foo/bar.html", 
    true,   // "/f/",
  };
 
  private static final String[] ROBOTS_STRINGS= new String[] {
    "User-Agent: Agent1 #foo" + CR
    + "Disallow: /a" + CR
    + "Disallow: /b/a" + CR
    + "#Disallow: /c" + CR
    + "" + CR
    + "" + CR
    + "User-Agent: Agent2 Agent3#foo" + CR
    + "User-Agent: Agent4" + CR
    + "Disallow: /d" + CR
    + "Disallow: /e/d/" + CR
    + "" + CR
    + "User-Agent: *" + CR
    + "Disallow: /foo/bar/" + CR,
    null  // Used to test EMPTY_RULES
  };

  private static final String[] AGENT_STRINGS= new String[] {
    "Agent1",
    "Agent2",
    "Agent3",
    "Agent4",
    "Agent5",
  };

  private static final boolean[][] NOT_IN_ROBOTS_STRING= new boolean[][] {
    {
      false,
      false,
      false,
      false,
      true,
    },
    {
      false,
      false,
      false,
      false,
      true,
    }   
  };

  private static final String[] TEST_PATHS= new String[] {
    "/a",
    "/a/",
    "/a/bloh/foo.html",
    "/b",
    "/b/a",
    "/b/a/index.html",
    "/b/b/foo.html",
    "/c",
    "/c/a",
    "/c/a/index.html",
    "/c/b/foo.html",
    "/d",
    "/d/a",
    "/e/a/index.html",
    "/e/d",
    "/e/d/foo.html",
    "/e/doh.html",
    "/f/index.html",
    "/foo/bar/baz.html"
    "/f/",
  };

  private static final boolean[][][] ALLOWED= new boolean[][][] {
    { // ROBOTS_STRINGS[0]
      { // Agent1
  false,  // "/a",       
  false,  // "/a/",       
  false,  // "/a/bloh/foo.html"
  true,   // "/b",       
  false,  // "/b/a",       
  false,  // "/b/a/index.html",
  true,   // "/b/b/foo.html", 
  true,   // "/c",       
  true,   // "/c/a",       
  true,   // "/c/a/index.html",
  true,   // "/c/b/foo.html", 
  true,   // "/d",       
  true,   // "/d/a",       
  true,   // "/e/a/index.html",
  true,   // "/e/d",       
  true,   // "/e/d/foo.html", 
  true,   // "/e/doh.html",   
  true,   // "/f/index.html", 
  true,   // "/foo/bar.html", 
  true,   // "/f/", 
      },
      { // Agent2
  true,   // "/a",       
  true,   // "/a/",       
  true,   // "/a/bloh/foo.html"
  true,   // "/b",       
  true,   // "/b/a",       
  true,   // "/b/a/index.html",
  true,   // "/b/b/foo.html", 
  true,   // "/c",       
  true,   // "/c/a",       
  true,   // "/c/a/index.html",
  true,   // "/c/b/foo.html", 
  false,  // "/d",       
  false,  // "/d/a",       
  true,   // "/e/a/index.html",
  true,   // "/e/d",       
  false,  // "/e/d/foo.html", 
  true,   // "/e/doh.html",   
  true,   // "/f/index.html", 
  true,   // "/foo/bar.html", 
  true,   // "/f/", 
      },
      { // Agent3
  true,   // "/a",       
  true,   // "/a/",       
  true,   // "/a/bloh/foo.html"
  true,   // "/b",       
  true,   // "/b/a",       
  true,   // "/b/a/index.html",
  true,   // "/b/b/foo.html", 
  true,   // "/c",       
  true,   // "/c/a",       
  true,   // "/c/a/index.html",
  true,   // "/c/b/foo.html", 
  false,  // "/d",       
  false,  // "/d/a",       
  true,   // "/e/a/index.html",
  true,   // "/e/d",       
  false,  // "/e/d/foo.html", 
  true,   // "/e/doh.html",   
  true,   // "/f/index.html", 
  true,   // "/foo/bar.html", 
  true,   // "/f/", 
      },
      { // Agent4
  true,   // "/a",       
  true,   // "/a/",       
  true,   // "/a/bloh/foo.html"
  true,   // "/b",       
  true,   // "/b/a",       
  true,   // "/b/a/index.html",
  true,   // "/b/b/foo.html", 
  true,   // "/c",       
  true,   // "/c/a",       
  true,   // "/c/a/index.html",
  true,   // "/c/b/foo.html", 
  false,  // "/d",       
  false,  // "/d/a",       
  true,   // "/e/a/index.html",
  true,   // "/e/d",       
  false,  // "/e/d/foo.html", 
  true,   // "/e/doh.html",   
  true,   // "/f/index.html", 
  true,   // "/foo/bar.html", 
  true,   // "/f/", 
      },
      { // Agent5/"*"
  true,   // "/a",       
  true,   // "/a/",       
  true,   // "/a/bloh/foo.html"
  true,   // "/b",       
  true,   // "/b/a",       
  true,   // "/b/a/index.html",
  true,   // "/b/b/foo.html", 
  true,   // "/c",       
  true,   // "/c/a",       
  true,   // "/c/a/index.html",
  true,   // "/c/b/foo.html", 
  true,   // "/d",       
  true,   // "/d/a",       
  true,   // "/e/a/index.html",
  true,   // "/e/d",       
  true,   // "/e/d/foo.html", 
  true,   // "/e/doh.html",   
  true,   // "/f/index.html", 
  false,  // "/foo/bar.html", 
  true,   // "/f/", 
      }
    },
    { // ROBOTS_STRINGS[1]
      ACCEPT_ALL, // Agent 1
      ACCEPT_ALL, // Agent 2
      ACCEPT_ALL, // Agent 3
      ACCEPT_ALL, // Agent 4
      ACCEPT_ALL, // Agent 5
    }
  };
  public TestRobotRulesParser(String name) {
    super(name);
  }

  public void testRobotsOneAgent() {
    for (int i= 0; i < ROBOTS_STRINGS.length; i++) {
      for (int j= 0; j < AGENT_STRINGS.length; j++) {
  testRobots(i, new String[] { AGENT_STRINGS[j] },
       TEST_PATHS, ALLOWED[i][j]);
      }
    }
  }

  public void testRobotsTwoAgents() {
    for (int i= 0; i < ROBOTS_STRINGS.length; i++) {
      for (int j= 0; j < AGENT_STRINGS.length; j++) {
  for (int k= 0; k < AGENT_STRINGS.length; k++) {
    int key= j;
    if (NOT_IN_ROBOTS_STRING[i][j])
      key= k;
    testRobots(i, new String[] { AGENT_STRINGS[j], AGENT_STRINGS[k] },
         TEST_PATHS, ALLOWED[i][key]);
  }
      }
    }
  }
 
  public void testCrawlDelay() {
    RobotRulesParser p = new RobotRulesParser(new String[] { "nutchbot" });
    String delayRule1 = "User-agent: nutchbot" + CR +
                        "Crawl-delay: 10" + CR +
                        "User-agent: foobot" + CR +
                        "Crawl-delay: 20" + CR +
                        "User-agent: *" + CR +
                        "Disallow:/baz" + CR;
    String delayRule2 = "User-agent: foobot" + CR +
                        "Crawl-delay: 20" + CR +
                        "User-agent: *" + CR +
                        "Disallow:/baz" + CR;
    RobotRuleSet rules = p.parseRules(delayRule1.getBytes());
    long crawlDelay = rules.getCrawlDelay();
    assertTrue("testing crawl delay for agent nutchbot - rule 1", (crawlDelay == 10000));
    rules = p.parseRules(delayRule2.getBytes());
    crawlDelay = rules.getCrawlDelay();
    assertTrue("testing crawl delay for agent nutchbot - rule 2", (crawlDelay == -1));
  }

  // helper

  public void testRobots(int robotsString, String[] agents, String[] paths,
       boolean[] allowed) {
    String agentsString= agents[0];
    for (int i= 1; i < agents.length; i++)
      agentsString= agentsString + "," + agents[i];
    RobotRulesParser p= new RobotRulesParser(agents);
    RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString] != null
                                     ? ROBOTS_STRINGS[robotsString].getBytes()
                                     : null);
    for (int i= 0; i < paths.length; i++) {
      assertTrue("testing robots file "+robotsString+", on agents ("
     + agentsString + "), and path " + TEST_PATHS[i] + "; got "
     + rules.isAllowed(TEST_PATHS[i]) + ", rules are: " + LF
           + rules,
     rules.isAllowed(TEST_PATHS[i]) == allowed[i]);
    }
  }


 
}