Package net.yacy.search.index

Examples of net.yacy.search.index.Segment$ReferenceCleaner


        }

        final int display = post.getInt("display", 1);

        // get segment
        Segment indexSegment = null;
        final boolean authorized = sb.verifyAuthentication(header);
        if (post != null && post.containsKey("segment") && authorized) {
            indexSegment = sb.indexSegments.segment(post.get("segment"));
        } else {
            indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }

        prop.put("display", display);
        prop.put("error_display", display);

        if (post.containsKey("words"))
            prop.putHTML("error_words", post.get("words"));
        else {
            prop.putHTML("error_words", "");
        }

        final String viewMode = post.get("viewMode","parsed");
        prop.put("error_vMode-" + viewMode, "1");

        DigestURI url = null;
        String descr = "";
        final int wordCount = 0;
        int size = 0;
        boolean pre = false;

        // get the url hash from which the content should be loaded
        String urlHash = post.get("urlHash", "");
        URIMetadataRow urlEntry = null;
        // get the urlEntry that belongs to the url hash
        if (urlHash.length() > 0 && (urlEntry = indexSegment.urlMetadata().load(ASCII.getBytes(urlHash))) != null) {
            // get the url that belongs to the entry
            final URIMetadataRow.Components metadata = urlEntry.metadata();
            if ((metadata == null) || (metadata.url() == null)) {
                prop.put("error", "3");
                prop.put("viewMode", VIEW_MODE_NO_TEXT);
View Full Code Here


        final serverObjects prop = new serverObjects();
        final Switchboard sb = (Switchboard) env;
        prop.put("title", "DbCleanup_p");
       
        // get segment
        Segment indexSegment = null;
        if (post != null && post.containsKey("segment")) {
            String segmentName = post.get("segment");
            if (sb.indexSegments.segmentExist(segmentName)) {
                indexSegment = sb.indexSegments.segment(segmentName);
            }
        } else {
            // take default segment
            indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }
       
        if (post!=null) {
            if (post.get("action").equals("ustart")) {
                if (urldbCleanerThread==null || !urldbCleanerThread.isAlive()) {
                    urldbCleanerThread = indexSegment.urlMetadata().getBlacklistCleaner(Switchboard.urlBlacklist, sb.crawlStacker);
                    urldbCleanerThread.start();
                }
                else {
                    urldbCleanerThread.endPause();
                }
            }
            else if (post.get("action").equals("ustop") && (urldbCleanerThread!=null)) {
                urldbCleanerThread.abort();
            }
            else if (post.get("action").equals("upause") && (urldbCleanerThread!=null)) {
                urldbCleanerThread.pause();
            }
            else if (post.get("action").equals("rstart")) {
                if (indexCleanerThread==null || !indexCleanerThread.isAlive()) {
                    indexCleanerThread = indexSegment.getReferenceCleaner(post.get("wordHash","AAAAAAAAAAAA").getBytes());
                    indexCleanerThread.start();
                }
                else {
                    indexCleanerThread.endPause();
                }
            }
            else if (post.get("action").equals("rstop") && (indexCleanerThread!=null)) {
                indexCleanerThread.abort();
            }
            else if (post.get("action").equals("rpause") && (indexCleanerThread!=null)) {
                indexCleanerThread.pause();
            }
            prop.put("LOCATION","");
            return prop;
        }
        if (urldbCleanerThread!=null) {
            prop.put("urldb", "1");
            prop.putNum("urldb_percentUrls", ((double)urldbCleanerThread.totalSearchedUrls/indexSegment.urlMetadata().size())*100);
            prop.putNum("urldb_blacklisted", urldbCleanerThread.blacklistedUrls);
            prop.putNum("urldb_total", urldbCleanerThread.totalSearchedUrls);
            prop.putHTML("urldb_lastBlacklistedUrl", urldbCleanerThread.lastBlacklistedUrl);
            prop.put("urldb_lastBlacklistedHash", urldbCleanerThread.lastBlacklistedHash);
            prop.putHTML("urldb_lastUrl", urldbCleanerThread.lastUrl);
View Full Code Here

    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        // return variable that accumulates replacements
        final Switchboard sb = (Switchboard) env;
        //wikiCode wikiTransformer = new wikiCode(switchboard);
        final serverObjects prop = new serverObjects();
        Segment segment = null;
        final boolean html = post != null && post.containsKey("html");
        prop.setLocalized(html);
        if (post != null && post.containsKey("segment") && sb.verifyAuthentication(header)) {
            segment = sb.indexSegments.segment(post.get("segment"));
        }
        if (segment == null) segment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        prop.put("rejected", "0");
        //int showRejectedCount = 10;

        Seed initiator;

        // index size
        prop.putNum("urlpublictextSize", segment.urlMetadata().size());
        prop.putNum("rwipublictextSize", segment.termIndex().sizesMax());

        // loader queue
        prop.putNum("loaderSize", sb.crawlQueues.workerSize());
        prop.putNum("loaderMax", sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10));
        if (sb.crawlQueues.workerSize() == 0) {
View Full Code Here

        prop.put("remoteCrawlState", "");
        prop.put("list-remote", 0);
        prop.put("forwardToCrawlStart", "0");

        // get segment
        Segment indexSegment = null;
        if (post != null && post.containsKey("segment")) {
            final String segmentName = post.get("segment");
            if (sb.indexSegments.segmentExist(segmentName)) {
                indexSegment = sb.indexSegments.segment(segmentName);
            }
        } else {
            // take default segment
            indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }

        prop.put("info", "0");

        if (post != null && post.containsKey("continue")) {
            // continue queue
            final String queue = post.get("continue", "");
            if ("localcrawler".equals(queue)) {
                sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
            } else if ("remotecrawler".equals(queue)) {
                sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
            }
        }

        if (post != null && post.containsKey("pause")) {
            // pause queue
            final String queue = post.get("pause", "");
            if ("localcrawler".equals(queue)) {
                sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
            } else if ("remotecrawler".equals(queue)) {
                sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
            }
        }

        if (post != null && post.containsKey("crawlingstart")) {
            // init crawl
            if (sb.peers == null) {
                prop.put("info", "3");
            } else {
                String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url
                // add the prefix http:// if necessary
                int pos = crawlingStart.indexOf("://",0);
                if (pos == -1) {
                    if (crawlingStart.startsWith("www")) crawlingStart = "http://" + crawlingStart;
                    if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart;
                }

                // remove crawlingFileContent before we record the call
                final String crawlingFileName = post.get("crawlingFile");
                final File crawlingFile = (crawlingFileName != null && crawlingFileName.length() > 0) ? new File(crawlingFileName) : null;
                if (crawlingFile != null && crawlingFile.exists()) {
                    post.remove("crawlingFile$file");
                }

                // normalize URL
                DigestURI crawlingStartURL = null;
                if (crawlingFile == null) try {crawlingStartURL = new DigestURI(crawlingStart);} catch (final MalformedURLException e1) {Log.logException(e1);}
                crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);

                // set new properties
                final boolean fullDomain = "domain".equals(post.get("range", "wide")); // special property in simple crawl start
                final boolean subPath    = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start

                // set the crawl filter
                String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING);
                final String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
                if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if bad value was submitted
                String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);
                final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
                if (ipMustMatch.length() < 2) ipMustMatch = CrawlProfile.MATCH_ALL_STRING;
                final String countryMustMatch = post.getBoolean("countryMustMatchSwitch", false) ? post.get("countryMustMatchList", "") : "";
                sb.setConfig("crawlingIPMustMatch", ipMustMatch);
                sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch);
                if (countryMustMatch.length() > 0) sb.setConfig("crawlingCountryMustMatch", countryMustMatch);

                // special cases:
                if (crawlingStartURL!= null && fullDomain) {
                    newcrawlingMustMatch = CrawlProfile.mustMatchFilterFullDomain(crawlingStartURL);
                    if (subPath) newcrawlingMustMatch = newcrawlingMustMatch.substring(0, newcrawlingMustMatch.length() - 2) + crawlingStartURL.getPath() + ".*";
                }
                if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
                    newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
                }

                final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
                env.setConfig("crawlOrder", crawlOrder);

                int newcrawlingdepth = post.getInt("crawlingDepth", 8);
                env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
                if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;

                final boolean directDocByURL = "on".equals(post.get("directDocByURL", "off"));
                env.setConfig("crawlingDirectDocByURL", directDocByURL);

                // recrawl
                final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
                boolean crawlingIfOlderCheck = "on".equals(post.get("crawlingIfOlderCheck", "off"));
                int crawlingIfOlderNumber = post.getInt("crawlingIfOlderNumber", -1);
                String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour
                int repeat_time = post.getInt("repeat_time", -1);
                final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays

                if ("scheduler".equals(recrawl) && repeat_time > 0) {
                    // set crawlingIfOlder attributes that are appropriate for scheduled crawling
                    crawlingIfOlderCheck = true;
                    crawlingIfOlderNumber = "selminutes".equals(repeat_unit) ? 1 : "selhours".equals(repeat_unit) ? repeat_time / 2 : repeat_time * 12;
                    crawlingIfOlderUnit = "hour";
                } else if ("reload".equals(recrawl)) {
                    repeat_time = -1;
                    crawlingIfOlderCheck = true;
                } else if ("nodoubles".equals(recrawl)) {
                    repeat_time = -1;
                    crawlingIfOlderCheck = false;
                }
                final long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
                env.setConfig("crawlingIfOlder", crawlingIfOlder);

                // store this call as api call
                if (repeat_time > 0) {
                    // store as scheduled api call
                    sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((crawlingStart == null) ? post.get("crawlingFile", "") : crawlingStart), repeat_time, repeat_unit.substring(3));
                } else {
                    // store just a protocol
                    sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((crawlingStart == null) ? post.get("crawlingFile", "") : crawlingStart));
                }

                final boolean crawlingDomMaxCheck = "on".equals(post.get("crawlingDomMaxCheck", "off"));
                final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? post.getInt("crawlingDomMaxPages", -1) : -1;
                env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));

                final boolean crawlingQ = "on".equals(post.get("crawlingQ", "off"));
                env.setConfig("crawlingQ", crawlingQ);

                final boolean indexText = "on".equals(post.get("indexText", "on"));
                env.setConfig("indexText", indexText);

                final boolean indexMedia = "on".equals(post.get("indexMedia", "on"));
                env.setConfig("indexMedia", indexMedia);

                boolean storeHTCache = "on".equals(post.get("storeHTCache", "on"));
                if (crawlingStartURL!= null &&(crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false;
                env.setConfig("storeHTCache", storeHTCache);

                CacheStrategy cachePolicy = CacheStrategy.parse(post.get("cachePolicy", "iffresh"));
                if (cachePolicy == null) cachePolicy = CacheStrategy.IFFRESH;

                final boolean xsstopw = "on".equals(post.get("xsstopw", "off"));
                env.setConfig("xsstopw", xsstopw);

                final boolean xdstopw = "on".equals(post.get("xdstopw", "off"));
                env.setConfig("xdstopw", xdstopw);

                final boolean xpstopw = "on".equals(post.get("xpstopw", "off"));
                env.setConfig("xpstopw", xpstopw);

                final String crawlingMode = post.get("crawlingMode","url");
                if (crawlingStart != null && crawlingStart.startsWith("ftp")) {
                    try {
                        // check if the crawl filter works correctly
                        Pattern.compile(newcrawlingMustMatch);
                        final CrawlProfile profile = new CrawlProfile(
                                crawlingStart,
                                crawlingStartURL,
                                newcrawlingMustMatch,
                                newcrawlingMustNotMatch,
                                ipMustMatch,
                                ipMustNotMatch,
                                countryMustMatch,
                                newcrawlingdepth,
                                directDocByURL,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                crawlingQ,
                                indexText,
                                indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(profile.handle().getBytes(), profile);
                        sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                        final DigestURI url = crawlingStartURL;
                        sb.crawlStacker.enqueueEntriesFTP(sb.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), false);
                    } catch (final PatternSyntaxException e) {
                        prop.put("info", "4"); // crawlfilter does not match url
                        prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                        prop.putHTML("info_error", e.getMessage());
                    } catch (final Exception e) {
                        // mist
                        prop.put("info", "7"); // Error with file
                        prop.putHTML("info_crawlingStart", crawlingStart);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }
                    sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                } else if ("url".equals(crawlingMode)) {

                    // check if pattern matches
                    if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
                        // print error message
                        prop.put("info", "4"); //crawlfilter does not match url
                        prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                        prop.putHTML("info_crawlingStart", crawlingStart);
                    } else try {


                        // check if the crawl filter works correctly
                        Pattern.compile(newcrawlingMustMatch);

                        // stack request
                        // first delete old entry, if exists
                        final DigestURI url = new DigestURI(crawlingStart);
                        final byte[] urlhash = url.hash();
                        indexSegment.urlMetadata().remove(urlhash);
                        sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
                        sb.crawlQueues.errorURL.remove(urlhash);

                        // get a scraper to get the title
                        final ContentScraper scraper = sb.loader.parseResource(url, CacheStrategy.IFFRESH);
                        final String title = scraper == null ? url.toNormalform(true, true) : scraper.getTitle();
                        final String description = scraper.getDescription();

                        // stack url
                        sb.crawler.removePassive(crawlingStartURL.hash()); // if there is an old entry, delete it
                        final CrawlProfile pe = new CrawlProfile(
                                (crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true, false) : crawlingStartURL.getHost(),
                                crawlingStartURL,
                                newcrawlingMustMatch,
                                newcrawlingMustNotMatch,
                                ipMustMatch,
                                ipMustNotMatch,
                                countryMustMatch,
                                newcrawlingdepth,
                                directDocByURL,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                crawlingQ,
                                indexText, indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(pe.handle().getBytes(), pe);
                        final String reasonString = sb.crawlStacker.stackCrawl(new Request(
                                sb.peers.mySeed().hash.getBytes(),
                                url,
                                null,
                                "CRAWLING-ROOT",
                                new Date(),
                                pe.handle(),
                                0,
                                0,
                                0,
                                0
                                ));

                        if (reasonString == null) {
                            // create a bookmark from crawl start url
                            //final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
                            final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
                            tags.add("crawlStart");
                            final String[] keywords = scraper.getKeywords();
                            if (keywords != null) {
                                for (final String k: keywords) {
                                    final String kk = BookmarkHelper.cleanTagsString(k);
                                    if (kk.length() > 0) tags.add(kk);
                                }
                            }
                            String tagStr = tags.toString();
                            if (tagStr.length() > 2 && tagStr.startsWith("[") && tagStr.endsWith("]")) tagStr = tagStr.substring(1, tagStr.length() - 2);

                            // we will create always a bookmark to use this to track crawled hosts
                            final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
                            if (bookmark != null) {
                                bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title);
                                bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_DESCRIPTION, description);
                                bookmark.setOwner("admin");
                                bookmark.setPublic(false);
                                bookmark.setTags(tags, true);
                                sb.bookmarksDB.saveBookmark(bookmark);
                            }

                            // do the same for ymarks
                            // TODO: could a non admin user add crawls?
                            sb.tables.bookmarks.createBookmark(sb.loader, url, YMarkTables.USER_ADMIN, true, "crawlStart", "/Crawl Start");

                            // liftoff!
                            prop.put("info", "8");//start msg
                            prop.putHTML("info_crawlingURL", post.get("crawlingURL"));

                            // generate a YaCyNews if the global flag was set
                            if (!sb.isRobinsonMode() && crawlOrder) {
                                final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
                                m.remove("specificDepth");
                                m.remove("indexText");
                                m.remove("indexMedia");
                                m.remove("remoteIndexing");
                                m.remove("xsstopw");
                                m.remove("xpstopw");
                                m.remove("xdstopw");
                                m.remove("storeTXCache");
                                m.remove("storeHTCache");
                                m.remove("generalFilter");
                                m.remove("specificFilter");
                                m.put("intention", post.get("intention", "").replace(',', '/'));
                                sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), NewsPool.CATEGORY_CRAWL_START, m);
                            }
                        } else {
                            prop.put("info", "5"); //Crawling failed
                            prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
                            prop.putHTML("info_reasonString", reasonString);

                            sb.crawlQueues.errorURL.push(
                                new Request(
                                        sb.peers.mySeed().hash.getBytes(),
                                        crawlingStartURL,
                                        null,
                                        "",
                                        new Date(),
                                        pe.handle(),
                                        0,
                                        0,
                                        0,
                                        0),
                                sb.peers.mySeed().hash.getBytes(),
                                new Date(),
                                1,
                                FailCategory.FINAL_LOAD_CONTEXT,
                                reasonString, -1);
                        }
                    } catch (final PatternSyntaxException e) {
                        prop.put("info", "4"); // crawlfilter does not match url
                        prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                        prop.putHTML("info_error", e.getMessage());
                    } catch (final Exception e) {
                        // mist
                        prop.put("info", "6"); // Error with url
                        prop.putHTML("info_crawlingStart", crawlingStart);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }

                } else if ("file".equals(crawlingMode)) {
                    if (post.containsKey("crawlingFile")) {
                        final String crawlingFileContent = post.get("crawlingFile$file", "");
                        try {
                            // check if the crawl filter works correctly
                            Pattern.compile(newcrawlingMustMatch);
                            final ContentScraper scraper = new ContentScraper(new DigestURI(crawlingFile));
                            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
                            if (crawlingFile != null && crawlingFile.exists()) {
                                FileUtils.copy(new FileInputStream(crawlingFile), writer);
                            } else {
                                FileUtils.copy(crawlingFileContent, writer);
                            }
                            writer.close();

                            // get links and generate filter
                            final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                            if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());

                            final DigestURI crawlURL = new DigestURI("file://" + crawlingFile.toString());
                            final CrawlProfile profile = new CrawlProfile(
                                    crawlingFileName,
                                    crawlURL,
                                    newcrawlingMustMatch,
                                    CrawlProfile.MATCH_NEVER_STRING,
                                    ipMustMatch,
                                    ipMustNotMatch,
                                    countryMustMatch,
                                    newcrawlingdepth,
                                    false,
                                    crawlingIfOlder,
                                    crawlingDomMaxPages,
                                    crawlingQ,
                                    indexText,
                                    indexMedia,
                                    storeHTCache,
                                    crawlOrder,
                                    xsstopw,
                                    xdstopw,
                                    xpstopw,
                                    cachePolicy);
                            sb.crawler.putActive(profile.handle().getBytes(), profile);
                            sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                            sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, true);
                        } catch (final PatternSyntaxException e) {
                            prop.put("info", "4"); // crawlfilter does not match url
                            prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                            prop.putHTML("info_error", e.getMessage());
                        } catch (final Exception e) {
                            // mist
                            prop.put("info", "7"); // Error with file
                            prop.putHTML("info_crawlingStart", crawlingFileName);
                            prop.putHTML("info_error", e.getMessage());
                            Log.logException(e);
                        }
                        sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                    }
                } else if ("sitemap".equals(crawlingMode)) {
                    final String sitemapURLStr = post.get("sitemapURL","");
                  try {
                    final DigestURI sitemapURL = new DigestURI(sitemapURLStr);
                    final CrawlProfile pe = new CrawlProfile(
                        sitemapURLStr,
                        sitemapURL,
                        CrawlProfile.MATCH_ALL_STRING,
                        CrawlProfile.MATCH_NEVER_STRING,
                                ipMustMatch,
                                ipMustNotMatch,
                                countryMustMatch,
                        0,
                        false,
                        crawlingIfOlder,
                        crawlingDomMaxPages,
                        true,
                        indexText,
                        indexMedia,
                        storeHTCache,
                        crawlOrder,
                        xsstopw,
                        xdstopw,
                        xpstopw,
                        cachePolicy);
                    sb.crawler.putActive(pe.handle().getBytes(), pe);
                    final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
                    importer.start();
                  } catch (final Exception e) {
                    // mist
                    prop.put("info", "6");//Error with url
                    prop.putHTML("info_crawlingStart", sitemapURLStr);
                    prop.putHTML("info_error", e.getMessage());
                    Log.logException(e);
                  }
                } else if ("sitelist".equals(crawlingMode)) {
                    try {
                        final DigestURI sitelistURL = new DigestURI(crawlingStart);
                        // download document
                        ContentScraper scraper = null;
                        scraper = sb.loader.parseResource(sitelistURL, CacheStrategy.IFFRESH);
                        // String title = scraper.getTitle();
                        // String description = scraper.getDescription();

                        // get links and generate filter
                        final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                        if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());

                        // put links onto crawl queue
                        final CrawlProfile profile = new CrawlProfile(
                                sitelistURL.getHost(),
                                sitelistURL,
                                newcrawlingMustMatch,
                                CrawlProfile.MATCH_NEVER_STRING,
                                ipMustMatch,
                                ipMustNotMatch,
                                countryMustMatch,
                                newcrawlingdepth,
                                directDocByURL,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                crawlingQ,
                                indexText,
                                indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(profile.handle().getBytes(), profile);
                        sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                        final Iterator<Map.Entry<MultiProtocolURI, Properties>> linkiterator = hyperlinks.entrySet().iterator();
                        DigestURI nexturl;
                        while (linkiterator.hasNext()) {
                            final Map.Entry<MultiProtocolURI, Properties> e = linkiterator.next();
                            if (e.getKey() == null) continue;
                            nexturl = new DigestURI(e.getKey());
                            // remove the url from the database to be prepared to crawl them again
                            final byte[] urlhash = nexturl.hash();
                            indexSegment.urlMetadata().remove(urlhash);
                            sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
                            sb.crawlQueues.errorURL.remove(urlhash);
                            sb.crawlStacker.enqueueEntry(new Request(
                                    sb.peers.mySeed().hash.getBytes(),
                                    nexturl,
View Full Code Here

            final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"), "text.urlmd", false, false);

            final int cacheMem = (int)(MemoryControl.maxMemory() - MemoryControl.total());
            if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");

            final Segment wordIndex = new Segment(
                    log,
                    new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"),
                    10000,
                    (long) Integer.MAX_VALUE, false, false);
            final Iterator<ReferenceContainer<WordReference>> indexContainerIterator = wordIndex.termIndex().referenceContainerIterator("AAAAAAAAAAAA".getBytes(), false, false);

            long urlCounter = 0, wordCounter = 0;
            long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0;
            String wordChunkStartHash = "AAAAAAAAAAAA", wordChunkEndHash;

            while (indexContainerIterator.hasNext()) {
                ReferenceContainer<WordReference> wordIdxContainer = null;
                try {
                    wordCounter++;
                    wordIdxContainer = indexContainerIterator.next();

                    // the combined container will fit, read the container
                    final Iterator<WordReference> wordIdxEntries = wordIdxContainer.entries();
                    Reference iEntry;
                    while (wordIdxEntries.hasNext()) {
                        iEntry = wordIdxEntries.next();
                        final byte[] urlHash = iEntry.urlhash();
                        if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
                            final URIMetadataRow urlEntry = currentUrlDB.load(urlHash);
                            urlCounter++;
                            minimizedUrlDB.store(urlEntry);
                            if (urlCounter % 500 == 0) {
                                log.logInfo(urlCounter + " URLs found so far.");
                            }
                        } catch (final IOException e) {}
                    }

                    if (wordCounter%500 == 0) {
                        wordChunkEndHash = ASCII.String(wordIdxContainer.getTermHash());
                        wordChunkEnd = System.currentTimeMillis();
                        final long duration = wordChunkEnd - wordChunkStart;
                        log.logInfo(wordCounter + " words scanned " +
                                "[" + wordChunkStartHash + " .. " + wordChunkEndHash + "]\n" +
                                "Duration: "+ 500*1000/duration + " words/s" +
                                " | Free memory: " + MemoryControl.free() +
                                " | Total memory: " + MemoryControl.total());
                        wordChunkStart = wordChunkEnd;
                        wordChunkStartHash = wordChunkEndHash;
                    }

                    // we have read all elements, now we can close it
                    wordIdxContainer = null;

                } catch (final Exception e) {
                    log.logSevere("Exception", e);
                } finally {
                    if (wordIdxContainer != null) try { wordIdxContainer = null; } catch (final Exception e) {}
                }
            }
            log.logInfo("current LURL DB contains " + currentUrlDB.size() + " entries.");
            log.logInfo("mimimized LURL DB contains " + minimizedUrlDB.size() + " entries.");

            currentUrlDB.close();
            minimizedUrlDB.close();
            wordIndex.close();

            // TODO: rename the mimimized UrlDB to the name of the previous UrlDB

            log.logInfo("FINISHED URL CLEANUP, WAIT FOR DUMP");
            log.logInfo("You can now backup your old URL DB and rename minimized/urlHash.db to urlHash.db");
View Full Code Here

        currentUrlDB.deadlinkCleaner();
        currentUrlDB.close();
    }

    private static void RWIHashList(final File dataHome, final File appHome, final String targetName, final String resource, final String format) {
        Segment WordIndex = null;
        final Log log = new Log("HASHLIST");
        final File indexPrimaryRoot = new File(dataHome, "DATA/INDEX");
        final String wordChunkStartHash = "AAAAAAAAAAAA";
        try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
        log.logInfo("STARTING CREATION OF RWI-HASHLIST");
        final File root = dataHome;
        try {
            Iterator<ReferenceContainer<WordReference>> indexContainerIterator = null;
            if (resource.equals("all")) {
                WordIndex = new Segment(
                        log,
                        new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"),
                        10000,
                        (long) Integer.MAX_VALUE, false, false);
                indexContainerIterator = WordIndex.termIndex().referenceContainerIterator(wordChunkStartHash.getBytes(), false, false);
            }
            int counter = 0;
            ReferenceContainer<WordReference> container = null;
            if (format.equals("zip")) {
                log.logInfo("Writing Hashlist to ZIP-file: " + targetName + ".zip");
                final ZipEntry zipEntry = new ZipEntry(targetName + ".txt");
                final File file = new File(root, targetName + ".zip");
                final ZipOutputStream bos = new ZipOutputStream(new FileOutputStream(file));
                bos.putNextEntry(zipEntry);
                if(indexContainerIterator != null) {
                    while (indexContainerIterator.hasNext()) {
                        counter++;
                        container = indexContainerIterator.next();
                        bos.write(container.getTermHash());
                        bos.write(serverCore.CRLF);
                        if (counter % 500 == 0) {
                            log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + ASCII.String(container.getTermHash()));
                        }
                    }
                }
                bos.flush();
                bos.close();
            } else {
                log.logInfo("Writing Hashlist to TXT-file: " + targetName + ".txt");
                final File file = new File(root, targetName + ".txt");
                final BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
                if(indexContainerIterator != null) {
                    while (indexContainerIterator.hasNext()) {
                        counter++;
                        container = indexContainerIterator.next();
                        bos.write(container.getTermHash());
                        bos.write(serverCore.CRLF);
                        if (counter % 500 == 0) {
                            log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + ASCII.String(container.getTermHash()));
                        }
                    }
                }
                bos.flush();
                bos.close();
            }
            log.logInfo("Total number of Hashs: " + counter + ". Last found Hash: " + (container == null ? "null" : ASCII.String(container.getTermHash())));
        } catch (final IOException e) {
            log.logSevere("IOException", e);
        }
        if (WordIndex != null) {
            WordIndex.close();
            WordIndex = null;
        }
    }
View Full Code Here

            return "no DHT distribution: disabled by network.unit.dht";
        }
        if (getConfig(SwitchboardConstants.INDEX_DIST_ALLOW, "false").equalsIgnoreCase("false")) {
            return "no DHT distribution: not enabled (per setting)";
        }
        final Segment indexSegment = this.indexSegments.segment(segment);
        if (indexSegment.urlMetadata().size() < 10) {
            return "no DHT distribution: loadedURL.size() = " + indexSegment.urlMetadata().size();
        }
        if (indexSegment.termIndex().sizesMax() < 100) {
            return "no DHT distribution: not enough words - wordIndex.size() = " + indexSegment.termIndex().sizesMax();
        }
        if ((getConfig(SwitchboardConstants.INDEX_DIST_ALLOW_WHILE_CRAWLING, "false").equalsIgnoreCase("false")) && (this.crawlQueues.noticeURL.notEmptyLocal())) {
            return "no DHT distribution: crawl in progress: noticeURL.stackSize() = " + this.crawlQueues.noticeURL.size() + ", sbQueue.size() = " + getIndexingProcessorsQueueSize();
        }
        if ((getConfig(SwitchboardConstants.INDEX_DIST_ALLOW_WHILE_INDEXING, "false").equalsIgnoreCase("false")) && (getIndexingProcessorsQueueSize() > 1)) {
View Full Code Here

        for (final String s: sb.indexSegments.segmentNames()) {
            prop.put("segments_" + i + "_name", s);
            prop.put("segments_" + i + "_selected", (segmentName.equals(s)) ? 1 : 0);
            i++;
        }
        Segment segment = sb.indexSegments.segment(segmentName);
        prop.put("segments", i);

        // switch off all optional forms/lists
        prop.put("searchresult", 0);
        prop.put("keyhashsimilar", 0);
        prop.put("genUrlList", 0);

        // clean up all search events
        SearchEventCache.cleanupEvents(true);

        if (post != null) {
            // default values
            segmentName = post.get("segment", segmentName).trim();
            i= 0;
            for (final String s: sb.indexSegments.segmentNames()) {
                prop.put("segments_" + i + "_name", s);
                prop.put("segments_" + i + "_selected", (segmentName.equals(s)) ? 1 : 0);
                i++;
            }
            prop.put("segments", i);
            segment = sb.indexSegments.segment(segmentName);

            final String keystring = post.get("keystring", "").trim();
            byte[] keyhash = post.get("keyhash", "").trim().getBytes();
            prop.putHTML("keystring", keystring);
            prop.putHTML("keyhash", ASCII.String(keyhash));

            // read values from checkboxes
            final String[] urls = post.getAll("urlhx.*");
            HandleSet urlb = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, urls.length);
            if (urls != null) for (final String s: urls) try { urlb.put(s.getBytes()); } catch (final RowSpaceExceededException e) { Log.logException(e); }
            final boolean delurl    = post.containsKey("delurl");
            final boolean delurlref = post.containsKey("delurlref");

            if (post.containsKey("keystringsearch")) {
                keyhash = Word.word2hash(keystring);
                prop.put("keyhash", keyhash);
                final RWIProcess ranking = genSearchresult(prop, sb, segment, keyhash, null);
                if (ranking.filteredCount() == 0) {
                    prop.put("searchresult", 1);
                    prop.putHTML("searchresult_word", keystring);
                }
            }

            if (post.containsKey("keyhashsearch")) {
                if (keystring.length() == 0 || !ByteBuffer.equals(Word.word2hash(keystring), keyhash)) {
                    prop.put("keystring", "&lt;not possible to compute word from hash&gt;");
                }
                final RWIProcess ranking = genSearchresult(prop, sb, segment, keyhash, null);
                if (ranking.filteredCount() == 0) {
                    prop.put("searchresult", 2);
                    prop.putHTML("searchresult_wordhash", ASCII.String(keyhash));
                }
            }

         // delete everything
            if (post.containsKey("deletecomplete")) {
                if (post.get("deleteIndex", "").equals("on")) {
                    segment.clear();
                }
                if (post.get("deleteSolr", "").equals("on") && sb.getConfigBool("federated.service.solr.indexing.enabled", false)) try {
                    sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().clear();
                } catch (final Exception e) {
                    Log.logException(e);
                }
                if (post.get("deleteCrawlQueues", "").equals("on")) {
                    sb.crawlQueues.clear();
                    sb.crawlStacker.clear();
                    ResultURLs.clearStacks();
                }
                if (post.get("deleteCache", "").equals("on")) {
                    Cache.clear();
                }
                if (post.get("deleteRobots", "").equals("on")) {
                    sb.robots.clear();
                }
                if (post.get("deleteSearchFl", "").equals("on")) {
                    sb.tables.clear(WorkTables.TABLE_SEARCH_FAILURE_NAME);
                }
                post.remove("deletecomplete");
            }

            // set reference limitation
            if (post.containsKey("maxReferencesLimit")) {
                if (post.get("maxReferencesRadio", "").equals("on")) {
                    ReferenceContainer.maxReferences = post.getInt("maxReferences", 0);
                } else {
                    ReferenceContainer.maxReferences = 0;
                }
                sb.setConfig("index.maxReferences", ReferenceContainer.maxReferences);
            }

            // delete word
            if (post.containsKey("keyhashdeleteall")) try {
                if (delurl || delurlref) {
                    // generate urlx: an array of url hashes to be deleted
                    ReferenceContainer<WordReference> index = null;
                    index = segment.termIndex().get(keyhash, null);
                    final Iterator<WordReference> en = index.entries();
                    urlb = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, index.size());
                    while (en.hasNext()) try { urlb.put(en.next().urlhash()); } catch (final RowSpaceExceededException e) { Log.logException(e); }
                    index = null;
                }
                if (delurlref) {
                    segment.removeAllUrlReferences(urlb, sb.loader, CacheStrategy.IFEXIST);
                }
                // delete the word first because that is much faster than the deletion of the urls from the url database
                segment.termIndex().delete(keyhash);
                // now delete all urls if demanded
                if (delurl || delurlref) {
                    for (final byte[] b: urlb) sb.urlRemove(segment, b);
                }
                post.remove("keyhashdeleteall");
                post.put("urllist", "generated");
            } catch (final IOException e) {
                Log.logException(e);
            }

            // delete selected URLs
            if (post.containsKey("keyhashdelete")) try {
                if (delurlref) {
                    segment.removeAllUrlReferences(urlb, sb.loader, CacheStrategy.IFEXIST);
                }
                if (delurl || delurlref) {
                    for (final byte[] b: urlb) sb.urlRemove(segment, b);
                }
                final HandleSet urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
                for (final byte[] b: urlb) try { urlHashes.put(b); } catch (final RowSpaceExceededException e) { Log.logException(e); }
                segment.termIndex().remove(keyhash, urlHashes);
                // this shall lead to a presentation of the list; so handle that the remaining program
                // thinks that it was called for a list presentation
                post.remove("keyhashdelete");
                post.put("urllist", "generated");
            } catch (final IOException e) {
                Log.logException(e);
            }

            if (post.containsKey("urllist")) {
                if (keystring.length() == 0 || !ByteBuffer.equals(Word.word2hash(keystring), keyhash)) {
                    prop.put("keystring", "&lt;not possible to compute word from hash&gt;");
                }
                final Bitfield flags = compileFlags(post);
                final int count = (post.get("lines", "all").equals("all")) ? -1 : post.getInt("lines", -1);
                final RWIProcess ranking = genSearchresult(prop, sb, segment, keyhash, flags);
                genURLList(prop, keyhash, keystring, ranking, flags, count);
            }

            // transfer to other peer
            if (post.containsKey("keyhashtransfer")) try {
                if (keystring.length() == 0 || !ByteBuffer.equals(Word.word2hash(keystring), keyhash)) {
                    prop.put("keystring", "&lt;not possible to compute word from hash&gt;");
                }

                // find host & peer
                String host = post.get("host", ""); // get host from input field
                Seed seed = null;
                if (host.length() != 0) {
                    if (host.length() == 12) {
                        // the host string is a peer hash
                        seed = sb.peers.getConnected(host);
                    } else {
                        // the host string can be a host name
                        seed = sb.peers.lookupByName(host);
                    }
                } else {
                    host = post.get("hostHash", ""); // if input field is empty, get from select box
                    seed = sb.peers.getConnected(host);
                }

                // prepare index
                ReferenceContainer<WordReference> index;
                final long starttime = System.currentTimeMillis();
                index = segment.termIndex().get(keyhash, null);
                // built urlCache
                final Iterator<WordReference> urlIter = index.entries();
                final TreeMap<byte[], URIMetadataRow> knownURLs = new TreeMap<byte[], URIMetadataRow>(Base64Order.enhancedCoder);
                final HandleSet unknownURLEntries = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, index.size());
                Reference iEntry;
                URIMetadataRow lurl;
                while (urlIter.hasNext()) {
                    iEntry = urlIter.next();
                    lurl = segment.urlMetadata().load(iEntry.urlhash());
                    if (lurl == null) {
                        try {
                            unknownURLEntries.put(iEntry.urlhash());
                        } catch (final RowSpaceExceededException e) {
                            Log.logException(e);
                        }
                        urlIter.remove();
                    } else {
                        knownURLs.put(iEntry.urlhash(), lurl);
                    }
                }

                // make an indexContainerCache
                final ReferenceContainerCache<WordReference> icc = new ReferenceContainerCache<WordReference>(Segment.wordReferenceFactory, Segment.wordOrder, Word.commonHashLength);
                try {
                    icc.add(index);
                } catch (final RowSpaceExceededException e) {
                    Log.logException(e);
                }

                // transport to other peer
                final boolean gzipBody = sb.getConfigBool("indexControl.gzipBody", false);
                final int timeout = (int) sb.getConfigLong("indexControl.timeout", 60000);
                final String error = Protocol.transferIndex(
                             seed,
                             icc,
                             knownURLs,
                             gzipBody,
                             timeout);
                prop.put("result", (error == null) ? ("Successfully transferred " + knownURLs.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds, " + unknownURLEntries.size() + " URL not found") : "error: " + error);
                index = null;
            } catch (final IOException e) {
                Log.logException(e);
            }

            // generate list
            if (post.containsKey("keyhashsimilar")) try {
                final Iterator<ReferenceContainer<WordReference>> containerIt = segment.termIndex().referenceContainer(keyhash, true, 256, false).iterator();
                    ReferenceContainer<WordReference> container;
                    i = 0;
                    int rows = 0, cols = 0;
                    prop.put("keyhashsimilar", "1");
                    while (containerIt.hasNext() && i < 256) {
                        container = containerIt.next();
                        prop.put("keyhashsimilar_rows_"+rows+"_cols_"+cols+"_wordHash", container.getTermHash());
                        cols++;
                        if (cols==8) {
                            prop.put("keyhashsimilar_rows_"+rows+"_cols", cols);
                            cols = 0;
                            rows++;
                        }
                        i++;
                    }
                    prop.put("keyhashsimilar_rows_"+rows+"_cols", cols);
                    prop.put("keyhashsimilar_rows", rows + 1);
                    prop.put("result", "");
            } catch (final IOException e) {
                Log.logException(e);
            }

            if (post.containsKey("blacklist")) {
                final String blacklist = post.get("blacklist", "");
                final HandleSet urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, urlb.size());
                if (post.containsKey("blacklisturls")) {
                    PrintWriter pw;
                    try {
                        final String[] supportedBlacklistTypes = env.getConfig("BlackLists.types", "").split(",");
                        pw = new PrintWriter(new FileWriter(new File(ListManager.listsPath, blacklist), true));
                        DigestURI url;
                        for (final byte[] b: urlb) {
                            try {
                                urlHashes.put(b);
                            } catch (final RowSpaceExceededException e) {
                                Log.logException(e);
                            }
                            final URIMetadataRow e = segment.urlMetadata().load(b);
                            segment.urlMetadata().remove(b);
                            if (e != null) {
                                url = e.metadata().url();
                                pw.println(url.getHost() + "/" + url.getFile());
                                for (final String supportedBlacklistType : supportedBlacklistTypes) {
                                    if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklist)) {
                                        Switchboard.urlBlacklist.add(
                                                supportedBlacklistType,
                                                url.getHost(),
                                                url.getFile());
                                    }
                                }
                                SearchEventCache.cleanupEvents(true);
                            }
                        }
                        pw.close();
                    } catch (final IOException e) {
                    }
                }

                if (post.containsKey("blacklistdomains")) {
                    PrintWriter pw;
                    try {
                        final String[] supportedBlacklistTypes = Blacklist.BLACKLIST_TYPES_STRING.split(",");
                        pw = new PrintWriter(new FileWriter(new File(ListManager.listsPath, blacklist), true));
                        DigestURI url;
                        for (final byte[] b: urlb) {
                            try {
                                urlHashes.put(b);
                            } catch (final RowSpaceExceededException e) {
                                Log.logException(e);
                            }
                            final URIMetadataRow e = segment.urlMetadata().load(b);
                            segment.urlMetadata().remove(b);
                            if (e != null) {
                                url = e.metadata().url();
                                pw.println(url.getHost() + "/.*");
                                for (final String supportedBlacklistType : supportedBlacklistTypes) {
                                    if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklist)) {
                                        Switchboard.urlBlacklist.add(
                                                supportedBlacklistType,
                                                url.getHost(), ".*");
                                    }
                                }
                            }
                        }
                        pw.close();
                    } catch (final IOException e) {
                    }
                }
                try {
                    segment.termIndex().remove(keyhash, urlHashes);
                } catch (final IOException e) {
                    Log.logException(e);
                }
            }

            if (prop.getInt("searchresult", 0) == 3) listHosts(prop, keyhash, sb);
        }


        // insert constants
        prop.putNum("wcount", segment.termIndex().sizesMax());
        prop.put("cleanup_maxReferencesRadioChecked", ReferenceContainer.maxReferences > 0 ? 1 : 0);
        prop.put("cleanup_maxReferences", ReferenceContainer.maxReferences > 0 ? ReferenceContainer.maxReferences : 100000);

        // return rewrite properties
        return prop;
View Full Code Here

        final serverObjects prop = new serverObjects();
        final Switchboard sb = (Switchboard) env;

        // get segment
        Segment indexSegment = null;
        if (post != null && post.containsKey("segment")) {
            final String segmentName = post.get("segment");
            if (sb.indexSegments.segmentExist(segmentName)) {
                indexSegment = sb.indexSegments.segment(segmentName);
            }
        } else {
            // take default segment
            indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }

        if (post == null) {
            // send back usage example
            prop.put("mode", "0");

            // get the http host header
            final String hostSocket = header.get(HeaderFramework.CONNECTION_PROP_HOST);

            //String host = hostSocket;
            int port = 80;
            final int pos = hostSocket.indexOf(':',0);
            if (pos != -1) {
                port = Integer.parseInt(hostSocket.substring(pos + 1));
                //host = hostSocket.substring(0, pos);
            }

            prop.put("mode_host", "localhost");
            prop.put("mode_port", port);

            return prop;
        }
        prop.put("mode", "1");

        // get the URL
        String crawlingStart = post.get("url",null);
        crawlingStart = UTF8.decodeURL(crawlingStart);

        // get the browser title
        final String title = post.get("title",null);

        // get other parameters if set
        final String crawlingMustMatch  = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING);
        final String crawlingMustNotMatch  = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
        final int CrawlingDepth      = post.getInt("crawlingDepth", 0);
        final boolean crawlDynamic   = post.get("crawlingQ", "").equals("on");
        final boolean indexText      = post.get("indexText", "on").equals("on");
        final boolean indexMedia     = post.get("indexMedia", "on").equals("on");
        final boolean storeHTCache   = post.get("storeHTCache", "").equals("on");
        final boolean remoteIndexing = post.get("crawlOrder", "").equals("on");
        final boolean xsstopw        = post.get("xsstopw", "").equals("on");
        final boolean xdstopw        = post.get("xdstopw", "").equals("on");
        final boolean xpstopw        = post.get("xpstopw", "").equals("on");

        prop.put("mode_url", (crawlingStart == null) ? "unknown" : crawlingStart);
        prop.putHTML("mode_title", (title == null) ? "unknown" : title);

        if (crawlingStart != null) {
            crawlingStart = crawlingStart.trim();
            try {crawlingStart = new DigestURI(crawlingStart).toNormalform(true, true);} catch (final MalformedURLException e1) {}

            // check if url is proper
            DigestURI crawlingStartURL = null;
            try {
                crawlingStartURL = new DigestURI(crawlingStart);
            } catch (final MalformedURLException e) {
                prop.put("mode_status", "1");
                prop.put("mode_code", "1");
                return prop;
            }

            final byte[] urlhash = crawlingStartURL.hash();
            indexSegment.urlMetadata().remove(urlhash);
            sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
            sb.crawlQueues.errorURL.remove(urlhash);

            // create crawling profile
            CrawlProfile pe = null;
View Full Code Here

TOP

Related Classes of net.yacy.search.index.Segment$ReferenceCleaner

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.