Package org.archive.modules.net

Examples of org.archive.modules.net.CrawlServer


import junit.framework.TestCase;
import org.archive.util.TestUtils;

/**
 * Tests for {@link CrawlServer}.
 */
public class CrawlServerTest extends TestCase {

   
    public void testSerialization() throws Exception {
        TestUtils.testSerialization(new CrawlServer("hi"));
    }
}
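TestUtils.testSerialization presumably round-trips the object through Java serialization and checks the result. A minimal self-contained sketch of such a round-trip, under that assumption (roundTrip is an illustrative name, not part of the Heritrix API):

import java.io.*;

public class RoundTripSketch {
    // Serialize an object to a byte array, then read it back.
    static Object roundTrip(Serializable obj) throws IOException, ClassNotFoundException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        try (ObjectOutputStream oos = new ObjectOutputStream(bos)) {
            oos.writeObject(obj);
        }
        try (ObjectInputStream ois = new ObjectInputStream(
                new ByteArrayInputStream(bos.toByteArray()))) {
            return ois.readObject();
        }
    }
}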


    @Override
    protected void innerProcess(CrawlURI curi) {
        // Tally per-server, per-host, per-frontier-class running totals
        CrawlServer server = serverCache.getServerFor(curi.getUURI());

        String scheme = curi.getUURI().getScheme().toLowerCase();
        if ((scheme.equals("http") || scheme.equals("https")) && server != null) {
            // Update connection problems counter
            if (curi.getFetchStatus() == S_CONNECT_FAILED
                    || curi.getFetchStatus() == S_CONNECT_LOST) {
                server.incrementConsecutiveConnectionErrors();
            } else if (curi.getFetchStatus() > 0) {
                server.resetConsecutiveConnectionErrors();
            }

            // Update robots info
            try {
                if ("/robots.txt".equals(curi.getUURI().getPath()) && curi.getFetchStatus() != S_DEFERRED) {
                    // short-circuit further retries with a DEEMED status when
                    // robots are ignored entirely
                    if (metadata.getRobotsPolicy() instanceof IgnoreRobotsPolicy) {
                        if (curi.getFetchStatus() < 0 && curi.getFetchStatus() != S_DEFERRED) {
                            // prevent the rest of the usual retries
                            curi.setFetchStatus(S_DEEMED_NOT_FOUND);
                        }
                    }
                   
                    // Update server with robots info
                    // NOTE: in some cases the curi's status can be changed here
                    server.updateRobots(curi);
                }
            }
            catch (URIException e) {
                logger.severe("Failed get path on " + curi.getUURI());
            }
            // ... (snippet truncated here)
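Note the parenthesization in the scheme test above: in Java, && binds more tightly than ||, so writing the condition without parentheses would skip the server null-check whenever the scheme is plain "http". A standalone demonstration:

public class PrecedenceDemo {
    public static void main(String[] args) {
        boolean http = true, https = false, haveServer = false;
        // Without parentheses this parses as http || (https && haveServer):
        System.out.println(http || https && haveServer);   // true
        // The intended grouping has to be written explicitly:
        System.out.println((http || https) && haveServer); // false
    }
}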

            long respectThreshold = getRespectCrawlDelayUpToSeconds() * 1000;
            if (durationToWait < respectThreshold) {
                // may need to extend wait
                CrawlServer s = getServerCache().getServerFor(curi.getUURI());
                String ua = curi.getUserAgent();
                if (ua == null) {
                    ua = metadata.getUserAgent();
                }
                Robotstxt rep = s.getRobotstxt();
                if (rep != null) {
                    long crawlDelay = (long)(1000 * rep.getDirectivesFor(ua).getCrawlDelay());
                    // clamp the declared crawl-delay to the configured ceiling
                    crawlDelay = (crawlDelay > respectThreshold)
                            ? respectThreshold
                            : crawlDelay;
                    // ... (snippet truncated here)
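The clamp above honors a site's declared Crawl-delay only up to the operator-configured ceiling, and never shortens a politeness wait already computed elsewhere. The same policy as a standalone helper (the method and parameter names here are illustrative):

// Honor a robots.txt Crawl-delay (ms) up to a ceiling, never shortening
// the wait the politeness policy has already imposed.
static long effectiveWaitMs(long politenessWaitMs, long crawlDelayMs, long ceilingMs) {
    long capped = Math.min(crawlDelayMs, ceilingMs);
    return Math.max(politenessWaitMs, capped);
}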

    /**
     * @return true if the server name for the given CrawlURI matches
     * this credential's domain, ignoring case.
     */
    public boolean rootUriMatch(ServerCache cache,
            CrawlURI curi) {
        String cd = getDomain();

        CrawlServer serv = cache.getServerFor(curi.getUURI());
        String serverName = serv.getName();
//        String serverName = controller.getServerCache().getServerFor(curi).
//            getName();
        logger.fine("RootURI: Comparing " + serverName + " " + cd);
        return cd != null && serverName != null &&
            serverName.equalsIgnoreCase(cd);
    }
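A quick check of the comparison rootUriMatch performs, with hypothetical values:

String cd = "www.example.com";           // credential domain (hypothetical)
String serverName = "WWW.EXAMPLE.COM";   // name from the server cache (hypothetical)
boolean match = cd != null && serverName != null
        && serverName.equalsIgnoreCase(cd); // true: the comparison ignores case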

            // The avatar needs to be added to the server that is dependent
            // on this precondition. Find it by name. Get the name from
            // the credential this avatar represents.
            String cd = c.getDomain();
            if (cd != null) {
                CrawlServer cs = serverCache.getServerFor(cd);
                if (cs != null) {
                    cs.addCredential(c);
                    cs.setHttpAuthChallenges(curi.getHttpAuthChallenges());
                }
            }
        }
    }

            // domain. If not, let this curi die. Else, add it to the
            // curi and let it come around again. Add in the AuthScheme
            // we got, too. It's needed when we run the Auth the second
            // time around.
            String serverKey = getServerKey(curi);
            CrawlServer server = serverCache.getServerFor(serverKey);
            Set<Credential> storeRfc2617Credentials = getCredentialStore().subset(curi,
                    HttpAuthenticationCredential.class, server.getName());
            if (storeRfc2617Credentials == null
                    || storeRfc2617Credentials.isEmpty()) {
                logger.fine("No rfc2617 credentials for " + curi);
            } else {
                HttpAuthenticationCredential found = HttpAuthenticationCredential.getByRealm(
                        // ... (snippet truncated here)

        throw new AssertionError();
    }
   
    protected ProcessResult innerProcessResult(CrawlURI puri) {
        CrawlURI curi = (CrawlURI)puri;
        final CrawlServer server = serverCache.getServerFor(curi.getUURI());
        final CrawlHost host = serverCache.getHostFor(curi.getUURI());
        FetchStats.HasFetchStats[] haveStats =
            new FetchStats.HasFetchStats[] {
                server,
                host,
                // ... (snippet truncated here)
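The array above gathers every object that can accumulate fetch statistics so a single pass can tally them all. A generic sketch of that pattern (the interface and method names are illustrative, not the actual FetchStats API):

interface StatsTarget {
    void tally(long bytes);
}

// Tally each target, skipping any that are absent (e.g. a failed lookup).
static void tallyAll(long bytes, StatsTarget... targets) {
    for (StatsTarget t : targets) {
        if (t != null) {
            t.tally(bytes);
        }
    }
}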

            }
        } catch (URIException e) {
            logger.severe("Failed get of path for " + curi);
        }
       
        CrawlServer cs = serverCache.getServerFor(curi.getUURI());
        // require /robots.txt if not present
        if (cs.isRobotsExpired(getRobotsValidityDurationSeconds())) {
            // need to fetch (or refetch) robots.txt
            if (logger.isLoggable(Level.FINE)) {
                logger.fine( "No valid robots for " + cs  +
                    "; deferring " + curi);
            }

            // Robots expired - should be refetched even though it's already
            // been crawled.
            try {
                String prereq = curi.getUURI().resolve("/robots.txt").toString();
                curi.markPrerequisite(prereq);
            }
            catch (URIException e1) {
                logger.severe("Failed resolve using " + curi);
                throw new RuntimeException(e1); // shouldn't ever happen
            }
            return true;
        }
        // test against robots.txt if available
        if (cs.isValidRobots()) {
            String ua = metadata.getUserAgent();
            RobotsPolicy robots = metadata.getRobotsPolicy();
            if (!robots.allows(ua, curi, cs.getRobotstxt())) {
                if (getCalculateRobotsOnly()) {
                    // annotate URI as excluded, but continue to process normally
                    curi.getAnnotations().add("robotExcluded");
                    return false;
                }
                // ... (snippet truncated here)
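The /robots.txt prerequisite above is built by resolving the fixed absolute path against the URI being crawled; plain java.net.URI applies the same resolution rule:

import java.net.URI;

public class RobotsResolveDemo {
    public static void main(String[] args) {
        URI page = URI.create("http://example.com/deep/path/page.html");
        // An absolute path replaces the entire path of the base URI:
        URI robots = page.resolve("/robots.txt");
        System.out.println(robots); // http://example.com/robots.txt
    }
}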

            return false;
        } else if (curi.getUURI().getScheme().equals("whois")) {
            return false;
        }
       
        CrawlServer cs = serverCache.getServerFor(curi.getUURI());
        if(cs == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
//            curi.skipToPostProcessing();
            return true;
        }
        // ... (snippet truncated here)

                // Hasn't been authenticated yet. Queue it and move on (the
                // assumption is that we can do one authentication at a time
                // -- usually one html form).
                String prereq = c.getPrerequisite(curi);
                if (prereq == null || prereq.isEmpty()) {
                    CrawlServer server = serverCache.getServerFor(curi.getUURI());
                    logger.severe(server.getName() + " has credential(s) of type "
                        + c + " but the prerequisite is null or empty.");
                } else {
                    try {
                        curi.markPrerequisite(prereq);
                        // ... (snippet truncated here)
