Package de.anomic.crawler

Examples of de.anomic.crawler.CrawlProfile$DomProfile


            color_lineend = post.get("colorlineend", color_lineend);
        }

        if (host.equals("auto")) {
          // try to find the host from the crawl profiles
          CrawlProfile e;
            for (final byte[] handle: sb.crawler.getActive()) {
                e = sb.crawler.getActive(handle);
                if (e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ||
                    e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) ||
                    e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT||
                    e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ||
                    e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ||
                    e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ||
                    e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE))
                   continue;
                host = e.name();
                break; // take the first one
            }
        }

        // find start point
View Full Code Here


                setConfig("adminAccount", "");
            }

            // refresh recrawl dates
            try{
                CrawlProfile selentry;
                for (final byte[] handle: this.crawler.getActive()) {
                    selentry = this.crawler.getActive(handle);
                    assert selentry.handle() != null : "profile.name = " + selentry.name();
                    if (selentry.handle() == null) {
                        this.crawler.removeActive(handle);
                        continue;
                    }
                    boolean insert = false;
                    if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY)) {
                        selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
                                Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE)));
                        insert = true;
                    }
                    if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) {
                        selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
                                Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE)));
                        insert = true;
                    }
                    if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) {
                        selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
                                Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE)));
                        insert = true;
                    }
                    if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) {
                        selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
                                Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE)));
                        insert = true;
                    }
                    if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) {
                        selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
                                Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE)));
                        insert = true;
                    }
                    if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)) {
                        selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
                                Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE)));
                        insert = true;
                    }
                    if (insert) this.crawler.putActive(UTF8.getBytes(selentry.handle()), selentry);
                }
            } catch (final Exception e) {
                Log.logException(e);
            }
View Full Code Here

            condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), LibraryProvider.dymLib);

            // update image result list statistics
            // its good to do this concurrently here, because it needs a DNS lookup
            // to compute a URL hash which is necessary for a double-check
            final CrawlProfile profile = in.queueEntry.profile();
            ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null) ? true : !profile.remoteIndexing());
        }
        return new indexingQueueEntry(in.process, in.queueEntry, in.documents, condenser);
    }
View Full Code Here

        // when a search event was supplied, record the heuristic name for this URL hash on it
        if (searchEvent != null) {
            searchEvent.addHeuristic(url.hash(), heuristicName, true);
        }
        if (this.indexSegments.segment(process).exists(url.hash())) return; // don't do double-work
        // build a load request for the URL and look up the active crawl profile it refers to
        final Request request = this.loader.request(url, true, true);
        // NOTE(review): getActive may yield null for an unknown handle — confirm checkAcceptance copes with a null profile
        final CrawlProfile profile = sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
        // a non-null result is treated as the reason the URL cannot be loaded
        final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
        if (acceptedError != null) {
            // log the rejection reason and give up on this URL
            this.log.logWarning("addToIndex: cannot load " + url.toNormalform(false, false) + ": " + acceptedError);
            return;
        }
View Full Code Here

TOP

Related Classes of de.anomic.crawler.CrawlProfile$DomProfile

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.