Package: net.yacy.kelondro.data.meta

Examples of net.yacy.kelondro.data.meta.DigestURI


            while (true) {
                final String location = getConfig("network.unit.update.location" + i, "");
                if (location.isEmpty()) {
                    break;
                }
                DigestURI locationURL;
                try {
                    // try to parse url
                    locationURL = new DigestURI(location);
                } catch (final MalformedURLException e) {
                    break;
                }
                PublicKey publicKey = null;
                // get public key if it's in config
View Full Code Here


        }

        // in the noIndexReason is set, indexing is not allowed
        if (noIndexReason != null) {
            // log cause and close queue
            final DigestURI referrerURL = response.referrerURL();
            //if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + response.url() + "; cause: " + noIndexReason);
            addURLtoErrorDB(response.url(), (referrerURL == null) ? null : referrerURL.hash(), response.initiator(), response.name(), FailCategory.FINAL_PROCESS_CONTEXT, noIndexReason);
            // finish this entry
            return "not allowed: " + noIndexReason;
        }

        // put document into the concurrent processing queue
View Full Code Here

                if (!(u.startsWith("http://") || u.startsWith("https://") || u.startsWith("ftp://") || u.startsWith("smb://") || u.startsWith("file://"))) continue;
                // enqueue the hyperlink into the pre-notice-url db
                try {
                    this.crawlStacker.enqueueEntry(new Request(
                            response.initiator(),
                            new DigestURI(u),
                            response.url().hash(),
                            nextEntry.getValue(),
                            new Date(),
                            response.profile().handle(),
                            response.depth() + 1,
View Full Code Here

        in.queueEntry.updateStatus(Response.QUEUE_STATE_CONDENSING);
        if (this.solrConnector != null && getConfigBool("federated.service.solr.indexing.enabled", false)/*in.queueEntry.profile().pushSolr()*/) {
            // send the documents to solr
            for (final Document doc: in.documents) {
                try {
                    final String id = UTF8.String(new DigestURI(doc.dc_identifier(), null).hash());
                    final String iquh = UTF8.String(in.queueEntry.url().hash());
                    if (!id.equals(iquh)) {
                        this.log.logWarning("doc=" + id + ":" + doc.dc_identifier() + ", query=" + iquh  + ":" + in.queueEntry.url());
                        // in case that this happens it appears that the doc id is the right one
                    }
View Full Code Here

    private void storeDocumentIndex(final Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser, final SearchEvent searchEvent, final String sourceName) {

        // CREATE INDEX
        final String dc_title = document.dc_title();
        final DigestURI referrerURL = queueEntry.referrerURL();
        EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash);
        if (process == Segments.Process.SURROGATES) {
            processCase = EventOrigin.SURROGATES;
        }

        if (condenser == null || document.indexingDenied()) {
            //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
            addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_PROCESS_CONTEXT, "denied by rule in document, process case=" + processCase);
            return;
        }

        if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) {
            //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
            addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_LOAD_CONTEXT, "denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
            return;
        }

        // remove stopwords
        this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + queueEntry.url());

        // STORE WORD INDEX
        URIMetadataRow newEntry = null;
        try {
            newEntry = this.indexSegments.segment(process).storeDocument(
                    queueEntry.url(),
                    referrerURL,
                    queueEntry.lastModified(),
                    new Date(),
                    queueEntry.size(),
                    document,
                    condenser,
                    searchEvent,
                    sourceName);
            final RSSFeed feed = yacyChannel.channels(queueEntry.initiator() == null ? yacyChannel.PROXY : Base64Order.enhancedCoder.equal(queueEntry.initiator(), ASCII.getBytes(this.peers.mySeed().hash)) ? yacyChannel.LOCALINDEXING : yacyChannel.REMOTEINDEXING);
            feed.addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url().toNormalform(true, false)));
        } catch (final IOException e) {
            //if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
            addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_LOAD_CONTEXT, "error storing url: " + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase + ", error = " + e.getMessage());
            return;
        }

        // store rss feeds in document into rss table
        for (final Map.Entry<MultiProtocolURI, String> rssEntry : document.getRSS().entrySet()) {
            final Tables.Data rssRow = new Tables.Data();
            rssRow.put("referrer", queueEntry.url().hash());
            rssRow.put("url", UTF8.getBytes(rssEntry.getKey().toNormalform(true, false)));
            rssRow.put("title", UTF8.getBytes(rssEntry.getValue()));
            rssRow.put("recording_date", new Date());
            try {
                this.tables.update("rss", new DigestURI(rssEntry.getKey()).hash(), rssRow);
            } catch (final IOException e) {
                Log.logException(e);
            }
        }
View Full Code Here

        final Map<MultiProtocolURI, String> matcher = searchEvent.getQuery().separateMatches(links);

        // take the matcher and load them all
        for (final Map.Entry<MultiProtocolURI, String> entry: matcher.entrySet()) {
            try {
                addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName);
            } catch (final IOException e) {
            } catch (final Parser.Failure e) {
            }
        }

        // take then the no-matcher and load them also
        for (final Map.Entry<MultiProtocolURI, String> entry: links.entrySet()) {
            try {
                addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName);
            } catch (final IOException e) {
            } catch (final Parser.Failure e) {
            }
        }
    }
View Full Code Here

                if (r.indexOf("//") < 0) {
                    r = "http://" + r;
                }

                // get the links for a specific site
                DigestURI url;
                try {
                    url = new DigestURI(r);
                } catch (final MalformedURLException e) {
                    Log.logException(e);
                    return;
                }
View Full Code Here

                if (meta >= 0) {
                    final int q = query.indexOf(' ', meta);
                    query = (q >= 0) ? query.substring(0, meta) + query.substring(q + 1) : query.substring(0, meta);
                }
                final String urlString = "http://www.scroogle.org/cgi-bin/nbbw.cgi?Gw=" + query.trim().replaceAll(" ", "+") + "&n=2";
                final DigestURI url;
                try {
                    url = new DigestURI(MultiProtocolURI.unescape(urlString));
                } catch (final MalformedURLException e1) {
                    Log.logWarning("heuristicScroogle", "url not well-formed: '" + urlString + "'");
                    return;
                }
View Full Code Here

                    final int q = query.indexOf(' ', meta);
                    if (q >= 0) query = query.substring(0, meta) + query.substring(q + 1); else query = query.substring(0, meta);
                }

                final String urlString = urlpattern.substring(0, p) + query.trim().replaceAll(" ", "+") + urlpattern.substring(p + 1);
                final DigestURI url;
                try {
                    url = new DigestURI(MultiProtocolURI.unescape(urlString));
                } catch (final MalformedURLException e1) {
                    Log.logWarning("heuristicRSS", "url not well-formed: '" + urlString + "'");
                    return;
                }
View Full Code Here

    public void loadSeedLists() {
        // uses the superseed to initialize the database with known seeds

        yacySeed           ys;
        String             seedListFileURL;
        DigestURI          url;
        Iterator<String>   enu;
        int                lc;
        final int          sc = this.peers.sizeConnected();
        ResponseHeader header;

        final RequestHeader reqHeader = new RequestHeader();
        reqHeader.put(HeaderFramework.PRAGMA, "no-cache");
        reqHeader.put(HeaderFramework.CACHE_CONTROL, "no-cache");
        reqHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
        final HTTPClient client = new HTTPClient();
        client.setHeader(reqHeader.entrySet());
        client.setTimout((int) getConfigLong("bootstrapLoadTimeout", 20000));

        yacyCore.log.logInfo("BOOTSTRAP: " + sc + " seeds known from previous run");

        // - use the superseed to further fill up the seedDB
        int ssc = 0, c = 0;
        while (true) {
            if (Thread.currentThread().isInterrupted()) {
                break;
            }
            seedListFileURL = sb.getConfig("network.unit.bootstrap.seedlist" + c, "");
            if (seedListFileURL.length() == 0) {
                break;
            }
            c++;
            if (
                    seedListFileURL.startsWith("http://") ||
                    seedListFileURL.startsWith("https://")
            ) {
                // load the seed list
                try {

                    url = new DigestURI(seedListFileURL);
                    //final long start = System.currentTimeMillis();
                    client.HEADResponse(url.toString());
                    header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
                    //final long loadtime = System.currentTimeMillis() - start;
                    /*if (header == null) {
                        if (loadtime > getConfigLong("bootstrapLoadTimeout", 6000)) {
                            yacyCore.log.logWarning("BOOTSTRAP: seed-list URL " + seedListFileURL + " not available, time-out after " + loadtime + " milliseconds");
View Full Code Here

TOP

Related Classes of net.yacy.kelondro.data.meta.DigestURI

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.