if (retryCount < 0) {
    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection counter exceeded", -1);
    throw new IOException("Redirection counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
}
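// parse the target URL: validate the host, take the path and determine the port
// (defaults to 443 for https and 80 for http when no port is given)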
DigestURI url = request.url();
final String host = url.getHost();
if (host == null || host.length() < 2) throw new IOException("host is not well-formed: '" + host + "'");
final String path = url.getFile();
int port = url.getPort();
final boolean ssl = url.getProtocol().equals("https");
if (port < 0) port = (ssl) ? 443 : 80;
// check if url is in blacklist
final String hostlow = host.toLowerCase();
if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
    throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}
// resolve yacy and yacyh domains
AlternativeDomainNames yacyResolver = HTTPDemon.getAlternativeResolver();
if (yacyResolver != null) {
    String yAddress = yacyResolver.resolve(host);
    if (yAddress != null) {
        url = new DigestURI(url.getProtocol() + "://" + yAddress + path);
    }
}
// take a file from the net
Response response = null;
// create a request header
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
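// set the referer header, but only if the referrer hash resolves to a URL in the local crawling segment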
DigestURI refererURL = null;
if (request.referrerhash() != null) refererURL = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
requestHeader.put(HeaderFramework.ACCEPT_CHARSET, sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
requestHeader.put(HeaderFramework.ACCEPT_ENCODING, sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
// HTTP-Client
final HTTPClient client = new HTTPClient();
client.setRedirecting(false); // we want to handle redirection ourselves, so we don't index pages twice
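// note: "setTimout" (sic) matches the spelling in this HTTPClient's API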
client.setTimout(socketTimeout);
client.setHeader(requestHeader.entrySet());
// send request
final byte[] responseBody = client.GETbytes(url, maxFileSize);
final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
final int code = client.getHttpResponse().getStatusLine().getStatusCode();
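// 3xx status codes must be handled here because automatic redirecting was disabled above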
if (code > 299 && code < 310) {
    // redirection (content may be empty)
    if (header.containsKey(HeaderFramework.LOCATION)) {
        // getting redirection URL
        String redirectionUrlString = header.get(HeaderFramework.LOCATION);
        redirectionUrlString = redirectionUrlString.trim();
        if (redirectionUrlString.length() == 0) {
            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection header empty", code);
            throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
        }
        // normalizing URL
        final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
        // restart crawling with new url
        this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + request.url().toString());
        this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);
        // if we are already doing a shutdown we don't need to retry crawling
        if (Thread.currentThread().isInterrupted()) {
            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", code);
            throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown.");
        }
        // check if the url was already indexed
        final String dbname = sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash());
        if (dbname != null) {
            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", code);
throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);
}