Package net.yacy.cora.protocol.http

Examples of net.yacy.cora.protocol.http.HTTPClient
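The code fragments below are taken from the YaCy source. As a quick orientation, here is a minimal hedged sketch of the pattern most of them follow (configure the client, issue a GET, check the status, always call finish()); the URL and timeout value are illustrative and not part of the original snippets:

    // Minimal sketch of the common HTTPClient usage pattern shown in the fragments below.
    // The URL and timeout are illustrative; imports (e.g. java.io.IOException) are omitted.
    final HTTPClient client = new HTTPClient();
    client.setTimout(10000);   // 'setTimout' is the actual method name used throughout the snippets
    try {
        final byte[] body = client.GETbytes("http://example.net/index.html");
        if (client.getStatusCode() == 200 && body != null) {
            // process the response body
        } else {
            // non-200 status or empty body: client.getHttpResponse().getStatusLine() has the details
        }
    } catch (final IOException e) {
        // network failure
    } finally {
        client.finish();       // always release the underlying connection
    }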


            // generate request-url
            final String connectHost = hostPart(host, port, yAddress);
            final String getUrl = "http://"+ connectHost + remotePath;
            if (log.isFinest()) log.logFinest(reqID +"    using url: "+ getUrl);

            final HTTPClient client = setupHttpClient(requestHeader, connectHost);

            // send the HEAD request
            client.HEADResponse(getUrl);
            if (log.isFinest()) log.logFinest(reqID +"    response status: "+ client.getHttpResponse().getStatusLine());

            // an empty header set indicates an internal error of the http client
            final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders());
            if (responseHeader.isEmpty()) {
                throw new Exception(client.getHttpResponse().getStatusLine().toString());
            }

            prepareResponseHeader(responseHeader, client.getHttpResponse().getStatusLine().getProtocolVersion().toString());

            // send the server response back to the client
            if (log.isFinest()) log.logFinest(reqID +"    sending response header: "+ responseHeader);
            HTTPDemon.sendRespondHeader(
                conProp,
                respond,
                httpVer,
                client.getHttpResponse().getStatusLine().getStatusCode(),
                client.getHttpResponse().getStatusLine().toString(),
                responseHeader);
            respond.flush();


            // CONTENT_LENGTH is set by the request entity itself; setting it here as well would cause a ClientProtocolException
            final int contentLength = requestHeader.getContentLength();
            requestHeader.remove(HeaderFramework.CONTENT_LENGTH);

            final HTTPClient client = setupHttpClient(requestHeader, connectHost);

            // check input
            if(body == null) {
                log.logSevere("no body to POST!");
            }
            try {
              // sending the request
              client.POST(getUrl, body, contentLength);
              if (log.isFinest()) log.logFinest(reqID +"    response status: "+ client.getHttpResponse().getStatusLine());

              final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders());
              // determine if it's an internal error of the httpc
              if (responseHeader.isEmpty()) {
                throw new Exception(client.getHttpResponse().getStatusLine().toString());
              }

              final ChunkedOutputStream chunked = setTransferEncoding(conProp, responseHeader, client.getHttpResponse().getStatusLine().getStatusCode(), countedRespond);

              prepareResponseHeader(responseHeader, client.getHttpResponse().getProtocolVersion().toString());

              // sending the respond header back to the client
              if (chunked != null) {
                  responseHeader.put(HeaderFramework.TRANSFER_ENCODING, "chunked");
              }

              // sending response headers
              if (log.isFinest()) log.logFinest(reqID +"    sending response header: "+ responseHeader);
              HTTPDemon.sendRespondHeader(conProp,
                        countedRespond,
                        httpVer,
                        client.getHttpResponse().getStatusLine().getStatusCode(),
                        client.getHttpResponse().getStatusLine().toString(), // status text
                        responseHeader);

              final OutputStream outStream = (chunked != null) ? chunked : countedRespond;
              client.writeTo(outStream);

              if (chunked != null) {
                  chunked.finish();
              }
              outStream.flush();
            } catch (final SocketException se) {
              // connection closed by the client, abort the download
              client.finish();
            } finally {
              // release the connection in any case
              client.finish();
            }
        } catch (final Exception e) {
            handleProxyException(e,conProp,countedRespond,url);
        } finally {
            if(countedRespond != null) {

     * @param connectHost may be 'host:port' or 'host:port/path'
     * @return a pre-configured HTTPClient
     */
    private static HTTPClient setupHttpClient(final RequestHeader requestHeader, final String connectHost) {
        // setup the HTTP client: apply the configured timeout, copy the request headers and disable automatic redirects
        final HTTPClient client = new HTTPClient();
        client.setTimout(timeout); // 'setTimout' is the method's actual name
        client.setHeader(requestHeader.entrySet());
        client.setRedirecting(false);
        return client;
    }
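As an aside, the last snippet on this page shows that HTTPClient also accepts a user agent and a timeout directly in its constructor. A hedged sketch combining that constructor with the header and redirect setup used above (the timeout value is illustrative):

    // Sketch only: same configuration as setupHttpClient, but using the
    // (userAgent, timeout) constructor shown in the last snippet of this page.
    final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), 10000);
    client.setHeader(requestHeader.entrySet());  // copy the incoming request headers
    client.setRedirecting(false);                // handle redirects in our own code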

            return;
        }

        // possibly branch into PROXY-PROXY connection
        if (ProxySettings.use && ProxySettings.use4ssl) {
          final HTTPClient remoteProxy = setupHttpClient(requestHeader, host);

            try {
              remoteProxy.HEADResponse("http://" + host + ":" + port);
              final ResponseHeader header = new ResponseHeader(remoteProxy.getHttpResponse().getAllHeaders());

              // log the status of the CONNECT response
              log.logInfo("CONNECT-RESPONSE: status=" + remoteProxy.getHttpResponse().getStatusLine() + ", header=" + header.toString());
              final int statusCode = remoteProxy.getHttpResponse().getStatusLine().getStatusCode();
              final boolean success = statusCode >= 200 && statusCode <= 399;
                if (success) {
                    // replace connection details
                    host = ProxySettings.host;
                    port = ProxySettings.port;
                    // go on (see below)
                } else {
                    // pass error response back to client
                  HTTPDemon.sendRespondHeader(
                      conProp,
                      clientOut,
                      httpVersion,
                      remoteProxy.getHttpResponse().getStatusLine().getStatusCode(),
                      remoteProxy.getHttpResponse().getStatusLine().toString(),
                      header);
                    forceConnectionClose(conProp);
                    return;
                }

   
    public static SitemapReader parse(final DigestURI sitemapURL) throws IOException {
        // download document
        final RequestHeader requestHeader = new RequestHeader();
        requestHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
        final HTTPClient client = new HTTPClient();
        client.setTimout(5000);
        client.setHeader(requestHeader.entrySet());
        try {
            client.GET(sitemapURL.toString());
            if (client.getStatusCode() != 200) {
                throw new IOException("Unable to download the sitemap file " + sitemapURL +
                        "\nServer returned status: " + client.getHttpResponse().getStatusLine());
            }
   
            // get some metadata
            final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
            final String contentMimeType = header.mime();
   
            InputStream contentStream = client.getContentstream();
            if (contentMimeType != null && (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) {
                contentStream = new GZIPInputStream(contentStream);
            }
            final ByteCountInputStream counterStream = new ByteCountInputStream(contentStream, null);
            return sitemapParser.parse(counterStream);
        } catch (IOException e) {
            throw e;
        } finally {
            client.finish();
        }
    }
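A hypothetical call site for the method above. The sitemap URL is purely illustrative, and the DigestURI(String) constructor is assumed to be available here:

    // Sketch only: fetch and parse a sitemap via the method above.
    // The URL is illustrative; DigestURI(String) is an assumption.
    try {
        final SitemapReader sitemap = parse(new DigestURI("http://example.net/sitemap.xml.gz"));
        // ... iterate over the entries provided by the SitemapReader
    } catch (final IOException e) {
        // download failed, e.g. a non-200 status or a network error
    }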

        // send 'wget' to the web interface
        final RequestHeader requestHeader = new RequestHeader();
        requestHeader.put(RequestHeader.AUTHORIZATION, "realm=" + encodedPassword); // for HTTP authentication
        final HTTPClient con = new HTTPClient();
        con.setHeader(requestHeader.entrySet());
        try {
            con.GETbytes("http://localhost:"+ port +"/" + path);

            // evaluate the response status
            if (con.getStatusCode() > 199 && con.getStatusCode() < 300) {
                Log.logConfig("COMMAND-STEERING", "YACY accepted steering command: " + processdescription);
            } else {
                Log.logSevere("COMMAND-STEERING", "error response from YACY socket: " + con.getHttpResponse().getStatusLine());
                System.exit(-1);
            }
        } catch (final IOException e) {
            Log.logSevere("COMMAND-STEERING", "could not establish connection to YACY socket: " + e.getMessage());
            System.exit(-1);

            for (String netdef: uris) {
                netdef = netdef.trim();
                try {
                    final RequestHeader reqHeader = new RequestHeader();
                    reqHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
                    final HTTPClient client = new HTTPClient();
                    client.setHeader(reqHeader.entrySet());
                    byte[] data = client.GETbytes(uri);
                    if (data == null || data.length == 0) continue;
                    // save locally in case next fetch fails
                    if (file != null) {
                      FileOutputStream f = new FileOutputStream(file);
                      f.write(data);

        }

        // setup the HTTP client
        // TODO: add traffic accounting for the robots.txt download?
        final HTTPClient client = new HTTPClient();
        client.setHeader(reqHeaders.entrySet());
        try {
            // check for interruption
            if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress.");

            // sending the get request
            robotsTxt = client.GETbytes(robotsURL);
            // statistics:
            if (robotsTxt != null) {
              ByteCount.addAccountCount(ByteCount.CRAWLER, robotsTxt.length);
            }
            final int code = client.getHttpResponse().getStatusLine().getStatusCode();
            final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());

            // check the response status
            if (code > 199 && code < 300) {
              if (!header.mime().startsWith("text/plain")) {
                    robotsTxt = null;
                    log.info("Robots.txt from URL '" + robotsURL + "' has wrong mimetype '" + header.mime() + "'.");
                } else {

                    // getting some metadata
                  eTag = header.containsKey(HeaderFramework.ETAG)?(header.get(HeaderFramework.ETAG)).trim():null;
                    lastMod = header.lastModified();

                    // if the robots.txt file has not changed, we can stop here and keep the old version
                    if ((eTag != null) && (oldEtag != null) && (eTag.equals(oldEtag))) {
                        if (log.isDebugEnabled()) log.debug("Robots.txt from URL '" + robotsURL + "' was not modified. Abort downloading of new version.");
                        return null;
                    }


                    downloadEnd = System.currentTimeMillis();
                    if (log.isDebugEnabled()) log.debug("Robots.txt successfully loaded from URL '" + robotsURL + "' in " + (downloadEnd-downloadStart) + " ms.");
                }
            } else if (code == 304) {
                return null;
            } else if (code > 299 && code < 400) {
                // getting redirection URL
              String redirectionUrlString = header.get(HeaderFramework.LOCATION);
                if (redirectionUrlString == null) {
                    if (log.isDebugEnabled())
                        log.debug("robots.txt could not be downloaded from URL '" + robotsURL + "' because of a missing redirection header. [" + client.getHttpResponse().getStatusLine() + "].");
                    robotsTxt = null;
                } else {

                    redirectionUrlString = redirectionUrlString.trim();

                    // generating the new URL object
                    final MultiProtocolURI redirectionUrl = MultiProtocolURI.newURL(robotsURL, redirectionUrlString);

                    // following the redirection
                    if (log.isDebugEnabled()) log.debug("Redirection detected for robots.txt with URL '" + robotsURL + "'." +
                            "\nRedirecting request to: " + redirectionUrl);
                    return downloadRobotsTxt(redirectionUrl,redirectionCount,entry);
                }
            } else if (code == 401 || code == 403) {
                accessCompletelyRestricted = true;
                log.info("Access to Robots.txt not allowed on URL '" + robotsURL + "', redirectionCount = " + redirectionCount); // since this is a strange case we log it all the time
            } else {
              if (log.isDebugEnabled())
                log.debug("robots.txt could not be downloaded from URL '" + robotsURL + "'. [" + client.getHttpResponse().getStatusLine() + "].");
                robotsTxt = null;
            }
        } catch (final Exception e) {
            throw e;
        }
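The code above only compares ETags after the file has already been downloaded (and separately handles a 304 status). A hedged sketch of a conditional request that lets the server skip the body entirely; the header name is written literally here rather than via a HeaderFramework constant, and oldEtag is assumed to hold the previously stored ETag:

    // Sketch only: ask the server for 304 Not Modified if the cached ETag is still current.
    // "If-None-Match" is written literally; oldEtag comes from the previously stored entry.
    if (oldEtag != null) {
        reqHeaders.put("If-None-Match", oldEtag);
    }
    final HTTPClient client = new HTTPClient();
    client.setHeader(reqHeaders.entrySet());
    final byte[] robotsTxt = client.GETbytes(robotsURL);
    if (client.getHttpResponse().getStatusLine().getStatusCode() == 304) {
        // not modified on the server: keep the cached robots.txt
    }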

        requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
        requestHeader.put(HeaderFramework.ACCEPT_CHARSET, this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
        requestHeader.put(HeaderFramework.ACCEPT_ENCODING, this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));

        // HTTP-Client
        final HTTPClient client = new HTTPClient();
        client.setRedirecting(false); // we want to handle redirection ourselves, so we don't index pages twice
        client.setTimout(this.socketTimeout);
        client.setHeader(requestHeader.entrySet());
            // send request
          final byte[] responseBody = client.GETbytes(url, maxFileSize);
          final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
          final int code = client.getHttpResponse().getStatusLine().getStatusCode();

          if (code > 299 && code < 310) {
            // redirection (content may be empty)
                if (header.containsKey(HeaderFramework.LOCATION)) {
                    // getting redirection URL
                  String redirectionUrlString = header.get(HeaderFramework.LOCATION);
                    redirectionUrlString = redirectionUrlString.trim();

                    if (redirectionUrlString.length() == 0) {
                        this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection header empty", code);
                        throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
                    }

                    // normalizing URL
                    final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));

                    // restart crawling with new url
                    this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + request.url().toString());
                    this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);

                    // if we are already doing a shutdown we don't need to retry crawling
                    if (Thread.currentThread().isInterrupted()) {
                        this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", code);
                        throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown.");
                    }

                    // check if the url was already indexed
                    final String dbname = this.sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash());
                    if (dbname != null) {
                        this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", code);
                        throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);
                    }

                    // retry crawling with new url
                    request.redirectURL(redirectionUrl);
                    return load(request, retryCount - 1, maxFileSize, checkBlacklist);
                } else {
                  // no redirection url provided
                    this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided", code);
                    throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
                }
            } else if (responseBody == null) {
              // no response, reject file
                this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", code);
                throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
          } else if (code == 200 || code == 203) {
                // the transfer is ok

                // we write the new cache entry to file system directly
                final long contentLength = responseBody.length;
                ByteCount.addAccountCount(ByteCount.CRAWLER, contentLength);

                // check length again in case it was not possible to get the length before loading
                if (maxFileSize > 0 && contentLength > maxFileSize) {
                  this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", code);
                  throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
                }

                // create a new cache entry
                final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
                response = new Response(
                        request,
                        requestHeader,
                        header,
                        Integer.toString(code),
                        profile,
                        responseBody
                );

                return response;
          } else {
                // the response has no acceptable status code, so reject the file
              this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", code);
                throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
            }
    }

            parts.put("maximumRecords", UTF8.StringBody(Long.toString(maximumRecords)));
            parts.put("verify", cacheStrategy == null ? UTF8.StringBody("false") : UTF8.StringBody(cacheStrategy.toName()));
            parts.put("resource", UTF8.StringBody(global ? "global" : "local"));
            parts.put("nav", UTF8.StringBody("none"));
            final HTTPClient httpClient = new HTTPClient(userAgent == null ? ClientIdentification.getUserAgent() : userAgent, (int) timeout);
            result = httpClient.POSTbytes(new MultiProtocolURI(rssSearchServiceURL), uri.getHost(), parts, false);

            final RSSReader reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result);
            if (reader == null) {
                throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (1), reader == null");
            }
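The snippet above only shows the tail end of the parts map being filled before the POST is sent. A hedged sketch of the full pattern; the part names, host and service URL are illustrative, and Map<String, ContentBody> (from Apache HttpComponents mime) is assumed to be the map type, as elsewhere in YaCy:

    // Sketch only: multipart POST with HTTPClient, mirroring the calls above.
    // Part names, host and URL are illustrative; exception handling is omitted.
    final Map<String, ContentBody> parts = new LinkedHashMap<String, ContentBody>();
    parts.put("query", UTF8.StringBody("example query"));
    parts.put("maximumRecords", UTF8.StringBody(Long.toString(10)));

    final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), 8000);
    final byte[] result = httpClient.POSTbytes(
            new MultiProtocolURI("http://peer.example.net:8090/yacysearch.rss"),
            "peer.example.net",   // host value, passed as in the snippet above
            parts,
            false);               // same boolean flag as in the snippet above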


