Examples of net.yacy.kelondro.data.meta.DigestURI

net.yacy.kelondro.data.meta.DigestURI
URI object providing YaCy-hash computation. Hashes for URIs are split into several parts. For URIs pointing to resources that are not globally available, the domain-hash part gets one reserved value.

        return new EntryIterator();
    }
    
    public ArrayList<ZURL.Entry> list(int max) {
        ArrayList<ZURL.Entry> l = new ArrayList<ZURL.Entry>();
        DigestURI url;
        for (ZURL.Entry entry: this) {
            if (entry == null) continue;
            url = entry.url();
            if (url == null) continue;
            l.add(entry);

View Full Code Here

     * test and benchmark
     * @param args
     */
    public static void main(final String[] args) {
        try {
            final DigestURI url = new DigestURI("http", "www.yacy.net", 80, "/");
            final URIMetadataRow urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", 0.0f, 0.0f, new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), UTF8.getBytes("de"), 0, 0, 0, 0, 0, 0);
            EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING;
            System.out.println("valid test:\n=======");
            // add
            stack(urlRef, urlRef.hash(), url.hash(), stackNo);
            // size
            System.out.println("size of stack:\t"+ getStackSize(stackNo));
        } catch (final MalformedURLException e) {
            Log.logException(e);
        }

View Full Code Here


    public void process(sitemapParser.URLEntry entry) {


        // get the url hash
        byte[] nexturlhash = null;
        DigestURI url = null;
        try {
            url = new DigestURI(entry.url());
            nexturlhash = url.hash();
        } catch (final MalformedURLException e1) {
        }


        // check if the url is known and needs to be recrawled
        Date lastMod = entry.lastmod(null);

View Full Code Here

    private void load(final Request urlEntry, final String stats, final String profileHandle) {
        final CrawlProfile profile = this.sb.crawler.getActive(UTF8.getBytes(profileHandle));
        if (profile != null) {


            // check if the protocol is supported
            final DigestURI url = urlEntry.url();
            final String urlProtocol = url.getProtocol();
            if (this.sb.loader.isSupportedProtocol(urlProtocol)) {
                if (this.log.isFine())
                    this.log.logFine(stats + ": URL=" + urlEntry.url()
                            + ", initiator=" + ((urlEntry.initiator() == null) ? "" : ASCII.String(urlEntry.initiator()))
                            + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false")
                            + ", depth=" + urlEntry.depth()
                            + ", crawlDepth=" + profile.depth()
                            + ", must-match=" + profile.mustMatchPattern().toString()
                            + ", must-not-match=" + profile.mustNotMatchPattern().toString()
                            + ", permission=" + ((this.sb.peers == null) ? "undefined" : (((this.sb.peers.mySeed().isSenior()) || (this.sb.peers.mySeed().isPrincipal())) ? "true" : "false")));


                // work off one Crawl stack entry
                if (urlEntry == null || urlEntry.url() == null) {
                    this.log.logInfo(stats + ": urlEntry = null");
                } else {
                  new Loader(urlEntry);
                }


            } else {
                this.log.logSevere("Unsupported protocol in URL '" + url.toString());
            }
        } else {
            this.log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
        }
    }

View Full Code Here

            // try again and ask another peer
            return remoteCrawlLoaderJob();
        }


        // parse the rss
        DigestURI url, referrer;
        Date loaddate;
        for (final Hit item: feed) {
            //System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate());


            // put url on remote crawl stack
            try {
                url = new DigestURI(item.getLink());
            } catch (final MalformedURLException e) {
                continue;
            }
            try {
                referrer = new DigestURI(item.getReferrer());
            } catch (final MalformedURLException e) {
                referrer = null;
            }
            loaddate = item.getPubDate();
            final String urlRejectReason = this.sb.crawlStacker.urlInAcceptedDomain(url);

View Full Code Here

    private void enqueueEntries(byte[] initiator, String profileHandle, Map<MultiProtocolURI, Properties> hyperlinks, boolean replace) {
        for (Map.Entry<MultiProtocolURI, Properties> e: hyperlinks.entrySet()) {
            if (e.getKey() == null) continue;
            
            // delete old entry, if exists to force a re-load of the url (thats wanted here)
            final DigestURI url = new DigestURI(e.getKey());
            final byte[] urlhash = url.hash();
            if (replace) {
                indexSegment.urlMetadata().remove(urlhash);
                this.nextQueue.urlRemove(urlhash);
                String u = url.toNormalform(true, true);
                if (u.endsWith("/")) {
                    u = u + "index.html";
                } else if (!u.contains(".")) {
                    u = u + "/index.html";
                }
                try {
                    byte[] uh = new DigestURI(u, null).hash();
                    indexSegment.urlMetadata().remove(uh);
                    this.nextQueue.noticeURL.removeByURLHash(uh);
                    this.nextQueue.errorURL.remove(uh);
                } catch (MalformedURLException e1) {}
            }
            
            if (url.getProtocol().equals("ftp")) {
                // put the whole ftp site on the crawl stack
                enqueueEntriesFTP(initiator, profileHandle, url.getHost(), url.getPort(), replace);
            } else {
                // put entry on crawl stack
                enqueueEntry(new Request(
                        initiator, 
                        url,

View Full Code Here

                    queue = FTPClient.sitelist(host, port);
                    FTPClient.entryInfo entry;
                    while ((entry = queue.take()) != FTPClient.POISON_entryInfo) {
                        
                        // delete old entry, if exists to force a re-load of the url (thats wanted here)
                        DigestURI url = null;
                        try {
                            url = new DigestURI("ftp://" + host + (port == 21 ? "" : ":" + port) + MultiProtocolURI.escape(entry.name));
                        } catch (MalformedURLException e) {
                            continue;
                        }
                        final byte[] urlhash = url.hash();
                        if (replace) {
                            indexSegment.urlMetadata().remove(urlhash);
                            cq.noticeURL.removeByURLHash(urlhash);
                            cq.errorURL.remove(urlhash);
                        }

View Full Code Here

            warning = nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry);
            if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
            return null;
        }
        
        final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash());


        // add domain to profile domain list
        if (profile.domMaxPages() != Integer.MAX_VALUE) {
            domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
        }


        if (global) {
            // it may be possible that global == true and local == true, so do not check an error case against it
            if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());

View Full Code Here


    public static void indexAllRssFeed(final Switchboard sb, final DigestURI url, final RSSFeed feed) {
        int loadCount = 0;
        loop: for (final RSSMessage message: feed) {
            try {
                final DigestURI messageurl = new DigestURI(message.getLink());
                if (indexTriggered.containsKey(messageurl.hash())) continue loop;
                if (sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null) continue loop;
                sb.addToIndex(messageurl, null, null);
                indexTriggered.insertIfAbsent(messageurl.hash(), new Date());
                loadCount++;
            } catch (final IOException e) {
                Log.logException(e);
            } catch (final Failure e) {
                Log.logException(e);

View Full Code Here

     * @return the loaded entity in a Response object
     * @throws IOException
     */
    private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final long maxFileSize, final boolean checkBlacklist) throws IOException {
        // get the protocol of the next URL
        final DigestURI url = request.url();
        if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
        final String protocol = url.getProtocol();
        final String host = url.getHost();


        // check if we have the page in the cache
        final CrawlProfile crawlProfile = this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
        if (crawlProfile != null && cacheStrategy != CacheStrategy.NOCACHE) {
            // we have passed a first test if caching is allowed
            // now see if there is a cache entry


            final ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url.hash());
            final byte[] content = (cachedResponse == null) ? null : Cache.getContent(url.hash());
            if (cachedResponse != null && content != null) {
                // yes we have the content


                // create request header values and a response object because we need that
                // in case that we want to return the cached content in the next step
                final RequestHeader requestHeader = new RequestHeader();
                requestHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
                DigestURI refererURL = null;
                if (request.referrerhash() != null) refererURL = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
                if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
                final Response response = new Response(
                        request,
                        requestHeader,
                        cachedResponse,
                        "200",

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of net.yacy.kelondro.data.meta.DigestURI

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.