Package net.yacy.cora.protocol

Examples of net.yacy.cora.protocol.ResponseHeader


        // loading data from database
        Map<String, String> hdb;
        hdb = responseHeaderDB.get(hash);
        if (hdb == null) return null;
       
        return new ResponseHeader(null, hdb);
    }
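
The map loaded from the response-header database can be turned straight back into header fields. A minimal sketch, assuming the ResponseHeader(null, Map) constructor above and the mime() and lastModified() accessors that appear in the snippets below:

        // rebuild a ResponseHeader from a persisted key/value map and read
        // the two fields most examples on this page consume (sketch only)
        Map<String, String> stored = responseHeaderDB.get(hash);
        if (stored != null) {
            ResponseHeader header = new ResponseHeader(null, stored);
            String mime = header.mime();               // e.g. "text/html"
            Date lastModified = header.lastModified();
        }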


            // statistics:
            if (robotsTxt != null) {
                ByteCount.addAccountCount(ByteCount.CRAWLER, robotsTxt.length);
            }
            final int code = client.getHttpResponse().getStatusLine().getStatusCode();
            final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());

            // check the response status
            if (code > 199 && code < 300) {
                if (!header.mime().startsWith("text/plain")) {
                    robotsTxt = null;
                    log.info("Robots.txt from URL '" + robotsURL + "' has wrong mimetype '" + header.mime() + "'.");
                } else {

                    // getting some metadata
                    eTag = header.containsKey(HeaderFramework.ETAG) ? header.get(HeaderFramework.ETAG).trim() : null;
                    lastMod = header.lastModified();

                    // if the robots.txt file was not changed we break here
                    if ((eTag != null) && (oldEtag != null) && eTag.equals(oldEtag)) {
                        if (log.isDebugEnabled()) log.debug("Robots.txt from URL '" + robotsURL + "' was not modified. Abort downloading of new version.");
                        return null;
                    }

                    downloadEnd = System.currentTimeMillis();
                    if (log.isDebugEnabled()) log.debug("Robots.txt successfully loaded from URL '" + robotsURL + "' in " + (downloadEnd - downloadStart) + " ms.");
                }
            } else if (code == 304) {
                // not modified: the cached robots.txt is still valid
                return null;
            } else if (code > 299 && code < 400) {
                // getting redirection URL
                String redirectionUrlString = header.get(HeaderFramework.LOCATION);
                if (redirectionUrlString == null) {
                    if (log.isDebugEnabled())
                        log.debug("robots.txt could not be downloaded from URL '" + robotsURL + "' because of a missing redirection header. [" + client.getHttpResponse().getStatusLine() + "].");
                    robotsTxt = null;
                } else {
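
The ETag comparison above can be isolated into a small predicate. A sketch using the same API as the snippet (the helper name isUnmodified is illustrative, not part of YaCy):

        // true when the server-side robots.txt still carries the ETag we
        // stored on the previous fetch, i.e. no re-download is necessary
        static boolean isUnmodified(final ResponseHeader header, final String oldEtag) {
            final String eTag = header.containsKey(HeaderFramework.ETAG)
                    ? header.get(HeaderFramework.ETAG).trim() : null;
            return eTag != null && eTag.equals(oldEtag);
        }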

        final CrawlProfile crawlProfile = this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
        if (crawlProfile != null && cacheStrategy != CacheStrategy.NOCACHE) {
            // we have passed a first test if caching is allowed
            // now see if there is a cache entry

            final ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url.hash());
            final byte[] content = (cachedResponse == null) ? null : Cache.getContent(url.hash());
            if (cachedResponse != null && content != null) {
                // yes we have the content

                // create request header values and a response object because we need them below
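
The cache test above only counts as a hit when both parts of a cached entry survive. A condensed sketch of that guard, assuming the Cache accessors used in the snippet:

        // a cached copy is usable only if header AND body are still present;
        // either one alone is treated as a cache miss
        final ResponseHeader cachedResponse = Cache.getResponseHeader(url.hash());
        final byte[] content = (cachedResponse == null) ? null : Cache.getContent(url.hash());
        final boolean cacheHit = cachedResponse != null && content != null;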

     * @throws IOException
     */
    public final Map<MultiProtocolURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy) throws IOException {
        final Response response = load(request(url, true, false), cacheStrategy, Long.MAX_VALUE, false);
        if (response == null) throw new IOException("response == null");
        final ResponseHeader responseHeader = response.getResponseHeader();
        byte[] resource = response.getContent();
        if (resource == null) throw new IOException("resource == null");
        if (responseHeader == null) throw new IOException("responseHeader == null");

        Document[] documents = null;
        final String supportError = TextParser.supports(url, responseHeader.mime());
        if (supportError != null) throw new IOException("no parser support: " + supportError);
        try {
            documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), resource.length, new ByteArrayInputStream(resource));
            if (documents == null) throw new IOException("documents == null");
        } catch (final Exception e) {
            throw new IOException("parser error: " + e.getMessage());
        } finally {
            resource = null;
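
Note the shape of the parser gate in loadLinks: TextParser.supports returns an error string rather than a boolean, so null means "supported". A minimal sketch of the same check in isolation:

        // null result = a parser exists for this mime type; any non-null
        // string is the human-readable reason why parsing is impossible
        final String supportError = TextParser.supports(url, responseHeader.mime());
        if (supportError == null) {
            // safe to hand the response bytes to TextParser.parseSource(...)
        }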

   
    public Response(final Request request, final CrawlProfile profile) {
        this.request = request;
        // request and response headers may be empty in case we process surrogates
        this.requestHeader = new RequestHeader();
        this.responseHeader = new ResponseHeader();
        if (request.size() > 0) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size()));
        this.responseStatus = "200";
        this.profile = profile;
        this.status = QUEUE_STATE_FRESH;
        this.content = request.url().toTokens().getBytes();
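
For surrogates there is no network response, so the constructor fabricates a header. A sketch of that pattern on its own, using only the calls shown above:

        // synthetic ResponseHeader: empty except for a Content-Length,
        // which is only set when the request knows its own size
        final ResponseHeader synthetic = new ResponseHeader();
        final long knownSize = request.size();   // same accessor as in the constructor above
        if (knownSize > 0) synthetic.put(HeaderFramework.CONTENT_LENGTH, Long.toString(knownSize));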

        client.setRedirecting(false); // we want to handle redirection ourselves, so we don't index pages twice
        client.setTimout(socketTimeout);
        client.setHeader(requestHeader.entrySet());

        // send request
        final byte[] responseBody = client.GETbytes(url, maxFileSize);
        final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
        final int code = client.getHttpResponse().getStatusLine().getStatusCode();

        if (code > 299 && code < 310) {
            // redirection (content may be empty)
            if (header.containsKey(HeaderFramework.LOCATION)) {
                // getting redirection URL
                String redirectionUrlString = header.get(HeaderFramework.LOCATION);
                redirectionUrlString = redirectionUrlString.trim();

                if (redirectionUrlString.length() == 0) {
                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection header empty", code);
                    throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
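
The Location handling above recurs in several loaders and can be factored out. A sketch with an illustrative helper name (redirectTarget is not a YaCy method):

        // returns the redirect target, or null when the Location header is
        // missing or blank -- callers treat null as a failed redirection
        static String redirectTarget(final ResponseHeader header) {
            if (!header.containsKey(HeaderFramework.LOCATION)) return null;
            final String location = header.get(HeaderFramework.LOCATION).trim();
            return location.isEmpty() ? null : location;
        }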

        final HTTPClient client = new HTTPClient();
        client.setTimout(20000);
        client.setHeader(requestHeader.entrySet());

        final byte[] responseBody = client.GETbytes(request.url(), Long.MAX_VALUE);
        final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
        final int code = client.getHttpResponse().getStatusLine().getStatusCode();
        // FIXME: 30*-handling (bottom) is never reached
        // we always get the final content because httpClient.followRedirects = true

        if (responseBody != null && (code == 200 || code == 203)) {
            // the transfer is ok

            // statistics:
            ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length);

            // we write the new cache entry to the file system directly

            // create a new cache entry
            response = new Response(
                    request,
                    requestHeader,
                    header,
                    Integer.toString(code),
                    null,
                    responseBody
            );

            return response;
        } else if (code > 299 && code < 310) {
            if (header.containsKey(HeaderFramework.LOCATION)) {
                // getting redirection URL
                String redirectionUrlString = header.get(HeaderFramework.LOCATION);
                redirectionUrlString = redirectionUrlString.trim();

                if (redirectionUrlString.length() == 0) {
                    throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
                }
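
Regarding the FIXME above: the earlier crawler snippet disables automatic redirect following before sending the request, and the same call would presumably make this 30x branch reachable. A one-line sketch under that assumption:

        // assumption: as in the crawler snippet above, handling redirects
        // ourselves requires switching the client's auto-follow off first
        client.setRedirecting(false);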

                list.add(u + ((u.endsWith("/") || u.endsWith("\\")) ? "" : "/") + s);
            }
        
            StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
           
            ResponseHeader responseHeader = new ResponseHeader();
            responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    "200",
                    profile,
                    content.toString().getBytes());
           
            return response;
        }
       
        // create response header
        String mime = MimeTable.ext2mime(url.getFileExtension());
        ResponseHeader responseHeader = new ResponseHeader();
        responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
        responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
       
        // check mime type and availability of parsers
        // and also check resource size and limitation of the size
        long size;
        try {
            size = url.length();
        } catch (Exception e) {
            size = -1;
        }
        String parserError = null;
        if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
            (size > maxFileSize && maxFileSize >= 0)) {
            // we know that we cannot process that file before loading
            // only the metadata is returned
           
            if (parserError != null) {
                log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
            } else {
                log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
            }
           
            // create response with metadata only
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
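
Local files and directory listings never see an HTTP server, so the snippets on this page fabricate the two header fields the indexer needs. A compact sketch of that pattern ("html" stands in for url.getFileExtension()):

        // hand-built header for a local resource: Last-Modified and
        // Content-Type must be supplied manually
        final ResponseHeader h = new ResponseHeader();
        h.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
        h.put(HeaderFramework.CONTENT_TYPE, MimeTable.ext2mime("html"));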

                list.add(u + s);
            }
        
            StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
           
            ResponseHeader responseHeader = new ResponseHeader();
            responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    "200",
                    profile,
                    content.toString().getBytes());
           
            return response;
        }
       
        // create response header
        String mime = MimeTable.ext2mime(url.getFileExtension());
        ResponseHeader responseHeader = new ResponseHeader();
        responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
        responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
       
        // check mime type and availability of parsers
        // and also check resource size and limitation of the size
        long size;
        try {
            size = url.length();
        } catch (Exception e) {
            size = -1;
        }
        String parserError = null;
        if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
            (size > maxFileSize && maxFileSize >= 0)) {
            // we know that we cannot process that file before loading
            // only the metadata is returned
           
            if (parserError != null) {
                log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
            } else {
                log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
            }
           
            // create response with metadata only
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
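
The File and SMB crawlers share the same pre-load gate: skip the full download when no parser exists or the resource exceeds the size cap. A sketch of that condition on its own, using the variables from the snippet:

        // metadata-only mode: either no parser is available for the mime
        // type, or the resource is larger than the configured maximum
        final boolean metadataOnly =
                (acceptOnlyParseable && TextParser.supports(url, mime) != null)
                || (maxFileSize >= 0 && size > maxFileSize);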

                StringBuilder dirList = ftpClient.dirhtml(path);

                if (dirList == null) {
                    response = null;
                } else {
                    ResponseHeader responseHeader = new ResponseHeader();
                    responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
                    responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
                    final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
                    response = new Response(
                            request,
                            requestHeader,
                            responseHeader,
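
The construction above is cut off; judging from the FTP listing example earlier on this page, it presumably completes with the same six arguments. A sketch under that assumption:

        // assumption: same argument order as the earlier dirhtml examples
        response = new Response(
                request,
                requestHeader,
                responseHeader,
                "200",
                profile,
                dirList.toString().getBytes());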
