Package edu.uci.ics.crawler4j.robotstxt

Source Code of edu.uci.ics.crawler4j.robotstxt.RobotstxtServer

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package edu.uci.ics.crawler4j.robotstxt;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.http.HttpStatus;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Util;

/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class RobotstxtServer {

  protected RobotstxtConfig config;

  protected final Map<String, HostDirectives> host2directivesCache = new HashMap<String, HostDirectives>();

  protected PageFetcher pageFetcher;

  public RobotstxtServer(RobotstxtConfig config, PageFetcher pageFetcher) {
    this.config = config;
    this.pageFetcher = pageFetcher;
  }

  public boolean allows(WebURL webURL) {
    if (!config.isEnabled()) {
      return true;
    }
    try {
      URL url = new URL(webURL.getURL());
      String host = url.getHost().toLowerCase();
      String path = url.getPath();

      HostDirectives directives = host2directivesCache.get(host);

            if (directives != null && directives.needsRefetch()) {
                synchronized (host2directivesCache) {
                    host2directivesCache.remove(host);
                    directives = null;
                }
            }

      if (directives == null) {
        directives = fetchDirectives(host);
      }
      return directives.allows(path);
    } catch (MalformedURLException e) {
      e.printStackTrace();
    }
    return true;
  }

  private HostDirectives fetchDirectives(String host) {
    WebURL robotsTxtUrl = new WebURL();
    robotsTxtUrl.setURL("http://" + host + "/robots.txt");
    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
      if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
        Page page = new Page(robotsTxtUrl);
        fetchResult.fetchContent(page);
        if (Util.hasPlainTextContent(page.getContentType())) {
          try {
            String content;
            if (page.getContentCharset() == null) {
              content = new String(page.getContentData());
            } else {
              content = new String(page.getContentData(), page.getContentCharset());
            }
            directives = RobotstxtParser.parse(content, config.getUserAgentName());
          } catch (Exception e) {
            e.printStackTrace();
          }
        }
      }
    } finally {
      fetchResult.discardContentIfNotConsumed();
    }
    if (directives == null) {
      // We still need to have this object to keep track of the time we
      // fetched it
      directives = new HostDirectives();
    }
    synchronized (host2directivesCache) {
      if (host2directivesCache.size() == config.getCacheSize()) {
        String minHost = null;
        long minAccessTime = Long.MAX_VALUE;
        for (Entry<String, HostDirectives> entry : host2directivesCache.entrySet()) {
          if (entry.getValue().getLastAccessTime() < minAccessTime) {
            minAccessTime = entry.getValue().getLastAccessTime();
            minHost = entry.getKey();
          }
        }
        host2directivesCache.remove(minHost);
      }
      host2directivesCache.put(host, directives);
    }
    return directives;
  }

}
TOP

Related Classes of edu.uci.ics.crawler4j.robotstxt.RobotstxtServer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.