Package net.yacy.search.index

Examples of net.yacy.search.index.Segment$ReferenceCleaner


    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        // return variable that accumulates replacements
        final Switchboard sb = (Switchboard) env;
        final serverObjects prop = new serverObjects();
        Segment segment = null;
        final boolean html = post != null && post.containsKey("html");
        prop.setLocalized(html);
        if (post != null && post.containsKey("segment") && sb.verifyAuthentication(header)) {
            segment = sb.indexSegments.segment(post.get("segment"));
        }
        if (segment == null) segment = sb.indexSegments.segment(Segments.Process.PUBLIC);

        prop.put("rejected", "0");
        sb.updateMySeed();
        final int cacheMaxSize = (int) sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 10000);
        prop.putNum("ppm", sb.currentPPM());
        prop.putNum("qpm", sb.peers.mySeed().getQPM());
        prop.putNum("wordCacheSize", segment.termIndex().getBufferSize());
        prop.putNum("wordCacheMaxSize", cacheMaxSize);

        // crawl queues
        prop.putNum("localCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL).getJobCount());
        prop.putNum("limitCrawlSize", sb.crawlQueues.limitCrawlJobSize());
View Full Code Here


    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        // return variable that accumulates replacements
        final Switchboard sb = (Switchboard) env;

        final serverObjects prop = new serverObjects();
        final Segment segment;
        final boolean html = post != null && post.containsKey("html");
        prop.setLocalized(html);
        final boolean authorized = sb.verifyAuthentication(header);
        if (post != null && post.containsKey("segment") && authorized) {
            segment = sb.indexSegments.segment(post.get("segment"));
        } else {
            segment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }

        prop.put("dc_title", "");
        prop.put("dc_creator", "");
        prop.put("dc_description", "");
        prop.put("dc_subject", "");
        prop.put("dc_publisher", "");
        prop.put("dc_contributor", "");
        prop.put("dc_date", "");
        prop.put("dc_type", "");
        prop.put("dc_identifier", "");
        prop.put("dc_language", "");

        if (post == null) return prop;

        final String urlstring = post.get("url", "").trim();
        String urlhash = post.get("urlhash", "").trim();
        if (urlstring.length() == 0 && urlhash.length() == 0) return prop;

        if (urlstring.length() > 0 && urlhash.length() == 0) {
            try {
                final DigestURI url = new DigestURI(urlstring);
                urlhash = ASCII.String(url.hash());
            } catch (final MalformedURLException e) {
                Log.logException(e);
            }
        }
        if (urlhash == null || urlhash.length() == 0) return prop;

        final URIMetadataRow entry = segment.urlMetadata().load(urlhash.getBytes());
        if (entry == null) return prop;

        final URIMetadataRow.Components metadata = entry.metadata();
        if (metadata.url() == null) {
            return prop;
        }
        final URIMetadataRow le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash());

        prop.putXML("dc_title", metadata.dc_title());
        prop.putXML("dc_creator", metadata.dc_creator());
        prop.putXML("dc_description", ""); // this is the fulltext part in the surrogate
        prop.putXML("dc_subject", metadata.dc_subject());
View Full Code Here

        final Switchboard sb = (Switchboard) env;
        final serverObjects prop = new serverObjects();
        File defaultSettingsFile = new File(sb.getAppPath(), "defaults/yacy.init");
       
        // get segment
        Segment indexSegment = null;
        if (post != null && post.containsKey("segment")) {
            String segmentName = post.get("segment");
            if (sb.indexSegments.segmentExist(segmentName)) {
                indexSegment = sb.indexSegments.segment(segmentName);
            }
        } else {
            // take default segment
            indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }
       
        if(post != null) {
          if(post.containsKey("defaultFile")){
              // TODO check file-path!
              final File value = new File(sb.getAppPath(), post.get("defaultFile", "defaults/yacy.init"));
              // check if value is readable file
              if(value.exists() && value.isFile() && value.canRead()) {
                  defaultSettingsFile = value;
              }
          }
            if (post.containsKey("Xmx")) {
                int xmx = post.getInt("Xmx", 500); // default maximum heap size
                if (OS.isWin32) xmx = Math.min(2000, xmx);
                int xms = xmx; // take all.. if this is not used the os will manage that. if not reserved at the beginning the OS may reject to give away more memory
              sb.setConfig("javastart_Xmx", "Xmx" + xmx + "m");
              sb.setConfig("javastart_Xms", "Xms" + xms + "m");
              prop.put("setStartupCommit", "1");
            }
            if(post.containsKey("diskFree")) {
              sb.setConfig(SwitchboardConstants.DISK_FREE, post.getInt("diskFree", 3000));
            }
            if(post.containsKey("diskFreeHardlimit")) {
              sb.setConfig(SwitchboardConstants.DISK_FREE_HARDLIMIT, post.getInt("diskFreeHardlimit", 1000));
            }
            if(post.containsKey("memoryAcceptDHT")) {
              sb.setConfig(SwitchboardConstants.MEMORY_ACCEPTDHT, post.getInt("memoryAcceptDHT", 50));
            }
            if(post.containsKey("resetObserver")) {
              MemoryControl.resetProperState();
            }
        }
        final Map<String, String> defaultSettings = ((post == null) || (!(post.containsKey("submitdefault")))) ? null : FileUtils.loadMap(defaultSettingsFile);
        Iterator<String> threads = sb.threadNames();
        String threadName;
        BusyThread thread;
       
        final boolean xml = (header.get(HeaderFramework.CONNECTION_PROP_PATH)).endsWith(".xml");
        prop.setLocalized(!xml);
       
        // calculate totals
        long blocktime_total = 0, sleeptime_total = 0, exectime_total = 0;
        while (threads.hasNext()) {
            threadName = threads.next();
            thread = sb.getThread(threadName);
            blocktime_total += thread.getBlockTime();
            sleeptime_total += thread.getSleepTime();
            exectime_total += thread.getExecTime();
        }  
        if (blocktime_total == 0) blocktime_total = 1;
        if (sleeptime_total == 0) sleeptime_total = 1;
        if (exectime_total == 0) exectime_total = 1;
       
        // set templates for latest news from the threads
        long blocktime, sleeptime, exectime;
        long idlesleep, busysleep, memuse, memprereq;
        int queuesize;
        threads = sb.threadNames();
        int c = 0;
        long idleCycles, busyCycles, memshortageCycles;
        // set profile?
        final double multiplier = (post != null) && post.containsKey("profileSpeed") ? 100.0 / post.getFloat("profileSpeed", 100.0f) : 1.0;
        final boolean setProfile = (post != null && post.containsKey("submitdefault"));
        final boolean setDelay = (post != null) && (post.containsKey("submitdelay"));
        // save used settings file to config
        if (setProfile && post != null){
          sb.setConfig("performanceProfile", post.get("defaultFile", "defaults/yacy.init"));
          sb.setConfig("performanceSpeed", post.getInt("profileSpeed", 100));
        }
       
        while (threads.hasNext()) {
            threadName = threads.next();
            thread = sb.getThread(threadName);
           
            // set values to templates
            prop.put("table_" + c + "_threadname", threadName);

      prop.putHTML("table_" + c + "_hasurl_shortdescr", thread.getShortDescription());
      if(thread.getMonitorURL() == null) {
        prop.put("table_"+c+"_hasurl", "0");
      }else{
        prop.put("table_"+c+"_hasurl", "1");
        prop.put("table_" + c + "_hasurl_url", thread.getMonitorURL());
      }
            prop.putHTML("table_" + c + "_longdescr", thread.getLongDescription());
            queuesize = thread.getJobCount();
            prop.put("table_" + c + "_queuesize", (queuesize == Integer.MAX_VALUE) ? "unlimited" : Formatter.number(queuesize, !xml));
           
            blocktime = thread.getBlockTime();
            sleeptime = thread.getSleepTime();
            exectime = thread.getExecTime();
            memuse = thread.getMemoryUse();
            idleCycles = thread.getIdleCycles();
            busyCycles = thread.getBusyCycles();
            memshortageCycles = thread.getOutOfMemoryCycles();
            prop.putNum("table_" + c + "_blocktime", blocktime / 1000);
            prop.putNum("table_" + c + "_blockpercent", 100 * blocktime / blocktime_total);
            prop.putNum("table_" + c + "_sleeptime", sleeptime / 1000);
            prop.putNum("table_" + c + "_sleeppercent", 100 * sleeptime / sleeptime_total);
            prop.putNum("table_" + c + "_exectime", exectime / 1000);
            prop.putNum("table_" + c + "_execpercent", 100 * exectime / exectime_total);
            prop.putNum("table_" + c + "_totalcycles", idleCycles + busyCycles + memshortageCycles);
            prop.putNum("table_" + c + "_idlecycles", idleCycles);
            prop.putNum("table_" + c + "_busycycles", busyCycles);
            prop.putNum("table_" + c + "_memscycles", memshortageCycles);
            prop.putNum("table_" + c + "_sleeppercycle", ((idleCycles + busyCycles) == 0) ? -1 : sleeptime / (idleCycles + busyCycles));
            prop.putNum("table_" + c + "_execpercycle", (busyCycles == 0) ? -1 : exectime / busyCycles);
            prop.putNum("table_" + c + "_memusepercycle", (busyCycles == 0) ? -1 : memuse / busyCycles / 1024);
           
            // load with old values
            idlesleep = sb.getConfigLong(threadName + "_idlesleep" , 1000);
            busysleep = sb.getConfigLong(threadName + "_busysleep",   100);
            memprereq = sb.getConfigLong(threadName + "_memprereq",     0);
            if (setDelay && post != null) {
                // load with new values
                idlesleep = post.getLong(threadName + "_idlesleep", idlesleep);
                busysleep = post.getLong(threadName + "_busysleep", busysleep);
                memprereq = post.getLong(threadName + "_memprereq", memprereq) * 1024;
                if (memprereq == 0) memprereq = sb.getConfigLong(threadName + "_memprereq", 0);
                   
                // check values to prevent short-cut loops
                if (idlesleep < 1000) idlesleep = 1000;
                if (threadName.equals("10_httpd")) { idlesleep = 0; busysleep = 0; memprereq = 0; }
               
                sb.setThreadPerformance(threadName, idlesleep, busysleep, memprereq);
                idlesleep = sb.getConfigLong(threadName + "_idlesleep", idlesleep);
                busysleep = sb.getConfigLong(threadName + "_busysleep", busysleep);
            }
            if (setProfile) {
                if (threadName.equals(SwitchboardConstants.PEER_PING) ||
                    threadName.equals(SwitchboardConstants.SEED_UPLOAD) ||
                    threadName.equals(SwitchboardConstants.CLEANUP)) {
                    /* do not change any values */
                } else {
                    // load with new values
                    idlesleep = (long) (Long.parseLong(d(defaultSettings.get(threadName + "_idlesleep"), String.valueOf(idlesleep))) * multiplier);
                    busysleep = (long) (Long.parseLong(d(defaultSettings.get(threadName + "_busysleep"), String.valueOf(busysleep))) * multiplier);
                    //memprereq = (long) (Long.parseLong(d(defaultSettings.get(threadName + "_memprereq"), String.valueOf(memprereq))) * multiplier);

                    // check values to prevent short-cut loops
                    if (idlesleep < 1000) idlesleep = 1000;
                    if (threadName.equals("10_httpd")) { idlesleep = 0; busysleep = 0; memprereq = 0; }
                    //if (threadName.equals(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) && (busysleep < 50)) busysleep = 50;
                    sb.setThreadPerformance(threadName, idlesleep, busysleep, memprereq);
                }
            }
            prop.put("table_" + c + "_idlesleep", idlesleep);
            prop.put("table_" + c + "_busysleep", busysleep);
            prop.put("table_" + c + "_memprereq", memprereq / 1024);
            // disallow setting of memprereq for indexer to prevent db from throwing OOMs
            prop.put("table_" + c + "_disabled", /*(threadName.endsWith("_indexing")) ? 1 :*/ "0");
            prop.put("table_" + c + "_recommendation", threadName.endsWith("_indexing") ? "1" : "0");
            prop.putNum("table_" + c + "_recommendation_value", threadName.endsWith("_indexing") ? (indexSegment.termIndex().minMem() / 1024) : 0);
            c++;
        }
        prop.put("table", c);
       
        // performance profiles
        c = 0;
        final String usedfile = sb.getConfig("performanceProfile", "defaults/yacy.init");
        for(final String filename: performanceProfiles.keySet()) {
            prop.put("profile_" + c + "_filename", filename);
            prop.put("profile_" + c + "_description", performanceProfiles.get(filename));
            prop.put("profile_" + c + "_used", usedfile.equalsIgnoreCase(filename) ? "1" : "0");
            c++;
        }
        prop.put("profile", c);
       
        c = 0;
        final int[] speedValues = {200,150,100,50,25,10};
        final int usedspeed = sb.getConfigInt("performanceSpeed", 100);
        for(final int speed: speedValues){
          prop.put("speed_" + c + "_value", speed);
          prop.put("speed_" + c + "_label", speed + " %");
          prop.put("speed_" + c + "_used", (speed == usedspeed) ? "1" : "0");
          c++;
        }
        prop.put("speed", c);
       
        if ((post != null) && (post.containsKey("cacheSizeSubmit"))) {
            final int wordCacheMaxCount = post.getInt("wordCacheMaxCount", 20000);
            sb.setConfig(SwitchboardConstants.WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount));
            indexSegment.termIndex().setBufferMaxWordCount(wordCacheMaxCount);
        }
       
        if ((post != null) && (post.containsKey("poolConfig"))) {
           
            /*
             * configuring the crawler pool
             */
            // get the current crawler pool configuration
            int maxBusy = post.getInt("Crawler Pool_maxActive", 8);
           
            // storing the new values into configfile
            sb.setConfig(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX,maxBusy);
            //switchboard.setConfig("crawler.MinIdleThreads",minIdle);
           
            /*
             * configuring the http pool
             */
            final WorkflowThread httpd = sb.getThread("10_httpd");
            try {
                maxBusy = post.getInt("httpd Session Pool_maxActive", 8);
            } catch (final NumberFormatException e) {
                maxBusy = 8;
            }

            ((serverCore)httpd).setMaxSessionCount(maxBusy);   
           
            // storing the new values into configfile
            sb.setConfig("httpdMaxBusySessions",maxBusy);

        }       
       
        if ((post != null) && (post.containsKey("PrioritySubmit"))) {
          sb.setConfig("javastart_priority",post.get("YaCyPriority","0"));
        }
       
        if ((post != null) && (post.containsKey("onlineCautionSubmit"))) {
            sb.setConfig(SwitchboardConstants.PROXY_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseProxy", 30000)));
            sb.setConfig(SwitchboardConstants.LOCALSEACH_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseLocalsearch", 30000)));
            sb.setConfig(SwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseRemotesearch", 30000)));
        }
       
        if ((post != null) && (post.containsKey("minimumDeltaSubmit"))) {
            final long minimumLocalDelta = post.getLong("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta());
            final long minimumGlobalDelta = post.getLong("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());
            sb.setConfig("minimumLocalDelta", minimumLocalDelta);
            sb.setConfig("minimumGlobalDelta", minimumGlobalDelta);
            sb.crawlQueues.noticeURL.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
        }
       
        // delta settings
        prop.put("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta());
        prop.put("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());
       
        // table cache settings
        prop.putNum("urlCacheSize", indexSegment.urlMetadata().writeCacheSize())
        prop.putNum("wordCacheSize", indexSegment.termIndex().getBufferSize());
        prop.putNum("wordCacheSizeKBytes", indexSegment.termIndex().getBufferSizeBytes()/1024);
        prop.putNum("maxURLinCache", indexSegment.termIndex().getBufferMaxReferences());
        prop.putNum("maxAgeOfCache", indexSegment.termIndex().getBufferMaxAge() / 1000 / 60); // minutes
        prop.putNum("minAgeOfCache", indexSegment.termIndex().getBufferMinAge() / 1000 / 60); // minutes
        prop.putNum("maxWaitingWordFlush", sb.getConfigLong("maxWaitingWordFlush", 180));
        prop.put("wordCacheMaxCount", sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000));
        prop.put("crawlPauseProxy", sb.getConfigLong(SwitchboardConstants.PROXY_ONLINE_CAUTION_DELAY, 30000));
        prop.put("crawlPauseLocalsearch", sb.getConfigLong(SwitchboardConstants.LOCALSEACH_ONLINE_CAUTION_DELAY, 30000));
        prop.put("crawlPauseRemotesearch", sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, 30000));
View Full Code Here

        for (final String s: sb.indexSegments.segmentNames()) {
            prop.put("segments_" + i + "_name", s);
            prop.put("segments_" + i + "_selected", (segmentName.equals(s)) ? 1 : 0);
            i++;
        }
        Segment segment = sb.indexSegments.segment(segmentName);
        prop.put("segments", i);
        prop.putNum("ucount", segment.urlMetadata().size());
        prop.put("otherHosts", "");
        prop.put("genUrlProfile", 0);
        prop.put("statistics", 1);
        prop.put("statistics_lines", 100);
        prop.put("statisticslines", 0);
        prop.put("reload", 0);

        // do segment selection
        if (post != null && post.containsKey("segment")) {
            // default values
            segmentName = post.get("segment", segmentName).trim();
            i= 0;
            for (final String s: sb.indexSegments.segmentNames()) {
                prop.put("segments_" + i + "_name", s);
                prop.put("segments_" + i + "_selected", (segmentName.equals(s)) ? 1 : 0);
                i++;
            }
            prop.put("segments", i);
            segment = sb.indexSegments.segment(segmentName);
        }

        // show export messages
        final MetadataRepository.Export export = segment.urlMetadata().export();
        if ((export != null) && (export.isAlive())) {
          // there is currently a running export
            prop.put("lurlexport", 2);
            prop.put("lurlexportfinished", 0);
        prop.put("lurlexporterror", 0);
        prop.put("lurlexport_exportfile", export.file().toString());
            prop.put("lurlexport_urlcount", export.count());
            prop.put("reload", 1);
        } else {
            prop.put("lurlexport", 1);
            prop.put("lurlexport_exportfile", sb.getDataPath() + "/DATA/EXPORT/" + GenericFormatter.SHORT_SECOND_FORMATTER.format());
            if (export == null) {
                // there has never been an export
                prop.put("lurlexportfinished", 0);
                prop.put("lurlexporterror", 0);
            } else {
                // an export was running but has finished
                prop.put("lurlexportfinished", 1);
                prop.put("lurlexportfinished_exportfile", export.file().toString());
                prop.put("lurlexportfinished_urlcount", export.count());
                if (export.failed() == null) {
                    prop.put("lurlexporterror", 0);
                } else {
                    prop.put("lurlexporterror", 1);
                    prop.put("lurlexporterror_exportfile", export.file().toString());
                    prop.put("lurlexporterror_exportfailmsg", export.failed());
                }
            }
        }

        if (post == null || env == null) {
            return prop; // nothing to do
        }

        // post values that are set on numerous input fields with same name
        String urlstring = post.get("urlstring", "").trim();
        String urlhash = post.get("urlhash", "").trim();
        if (urlhash.length() == 0 && urlstring.length() > 0) {
            try {
                urlhash = ASCII.String(new DigestURI(urlstring, null).hash());
            } catch (final MalformedURLException e) {
            }
        }

        if (!urlstring.startsWith("http://") &&
            !urlstring.startsWith("https://") &&
            !urlstring.startsWith("ftp://") &&
            !urlstring.startsWith("smb://") &&
            !urlstring.startsWith("file://")) { urlstring = "http://" + urlstring; }

        prop.putHTML("urlstring", urlstring);
        prop.putHTML("urlhash", urlhash);
        prop.put("result", " ");

        if (post.containsKey("urlhashdeleteall")) {
            i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, CacheStrategy.IFEXIST);
            prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        if (post.containsKey("urlhashdelete")) {
            final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
            if (entry == null) {
                prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
            } else {
                urlstring = entry.metadata().url().toNormalform(false, true);
                prop.put("urlstring", "");
                sb.urlRemove(segment, urlhash.getBytes());
                prop.putHTML("result", "Removed URL " + urlstring);
            }
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        if (post.containsKey("urldelete")) {
            try {
                urlhash = ASCII.String((new DigestURI(urlstring)).hash());
            } catch (final MalformedURLException e) {
                urlhash = null;
            }
            if ((urlhash == null) || (urlstring == null)) {
                prop.put("result", "No input given; nothing deleted.");
            } else {
                sb.urlRemove(segment, urlhash.getBytes());
                prop.putHTML("result", "Removed URL " + urlstring);
            }
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        if (post.containsKey("urlstringsearch")) {
            try {
                final DigestURI url = new DigestURI(urlstring);
                urlhash = ASCII.String(url.hash());
                prop.put("urlhash", urlhash);
                final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
                if (entry == null) {
                    prop.putHTML("result", "No Entry for URL " + url.toNormalform(true, true));
                    prop.putHTML("urlstring", urlstring);
                    prop.put("urlhash", "");
                } else {
                    prop.putAll(genUrlProfile(segment, entry, urlhash));
                    prop.put("statistics", 0);
                }
            } catch (final MalformedURLException e) {
                prop.putHTML("result", "bad url: " + urlstring);
                prop.put("urlhash", "");
            }
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        if (post.containsKey("urlhashsearch")) {
            final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
            if (entry == null) {
                prop.putHTML("result", "No Entry for URL hash " + urlhash);
            } else {
                prop.putHTML("urlstring", entry.metadata().url().toNormalform(false, true));
                prop.putAll(genUrlProfile(segment, entry, urlhash));
                prop.put("statistics", 0);
            }
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        // generate list
        if (post.containsKey("urlhashsimilar")) {
            try {
                final Iterator<URIMetadataRow> entryIt = new RotateIterator<URIMetadataRow>(segment.urlMetadata().entries(true, urlhash), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), segment.termIndex().sizesMax());
                final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
                URIMetadataRow entry;
                i = 0;
                int rows = 0, cols = 0;
                prop.put("urlhashsimilar", "1");
                while (entryIt.hasNext() && i < 256) {
                    entry = entryIt.next();
                    if (entry == null) break;
                    prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", ASCII.String(entry.hash()));
                    cols++;
                    if (cols==8) {
                        prop.put("urlhashsimilar_rows_"+rows+"_cols", cols);
                        cols = 0;
                        rows++;
                    }
                    i++;
                }
                prop.put("statistics", 0);
                prop.put("urlhashsimilar_rows", rows);
                prop.put("result", result.toString());
            } catch (final IOException e) {
                prop.putHTML("result", "No Entries for URL hash " + urlhash);
            }
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        if (post.containsKey("lurlexport")) {
            // parse format
            int format = 0;
            final String fname = post.get("format", "url-text");
            final boolean dom = fname.startsWith("dom"); // if dom== false complete urls are exported, otherwise only the domain
            if (fname.endsWith("text")) format = 0;
            if (fname.endsWith("html")) format = 1;
            if (fname.endsWith("rss")) format = 2;

            // extend export file name
      String s = post.get("exportfile", "");
      if (s.indexOf('.',0) < 0) {
        if (format == 0) s = s + ".txt";
        if (format == 1) s = s + ".html";
        if (format == 2) s = s + ".xml";
      }
          final File f = new File(s);
      f.getParentFile().mkdirs();
      final String filter = post.get("exportfilter", ".*");
      final MetadataRepository.Export running = segment.urlMetadata().export(f, filter, null, format, dom);

      prop.put("lurlexport_exportfile", s);
      prop.put("lurlexport_urlcount", running.count());
      if ((running != null) && (running.failed() == null)) {
        prop.put("lurlexport", 2);
      }
      prop.put("reload", 1);
        }

        if (post.containsKey("deletedomain")) {
            final String hp = post.get("hashpart");
            try {
                segment.urlMetadata().deleteDomain(hp);
            } catch (final IOException e) {
                // TODO Auto-generated catch block
                Log.logException(e);
            }
            // trigger the loading of the table
            post.put("statistics", "");
            prop.put("reload", 0);
        }

        if (post.containsKey("statistics")) {
            final int count = post.getInt("lines", 100);
            Iterator<MetadataRepository.HostStat> statsiter;
            prop.put("statistics_lines", count);
            int cnt = 0;
            try {
                final MetadataRepository metadata = segment.urlMetadata();
                statsiter = metadata.statistics(count, metadata.urlSampleScores(metadata.domainSampleCollector()));
                boolean dark = true;
                MetadataRepository.HostStat hs;
                while (statsiter.hasNext() && cnt < count) {
                    hs = statsiter.next();
                    prop.put("statisticslines_domains_" + cnt + "_dark", (dark) ? "1" : "0");
                    prop.put("statisticslines_domains_" + cnt + "_domain", hs.hostname + ((hs.port == 80) ? "" : ":" + hs.port));
                    prop.put("statisticslines_domains_" + cnt + "lines", count);
                    prop.put("statisticslines_domains_" + cnt + "_hashpart", hs.hosthash);
                    prop.put("statisticslines_domains_" + cnt + "_count", hs.count);
                    dark = !dark;
                    cnt++;
                }
            } catch (final IOException e) {
                Log.logException(e);
            }
            prop.put("statisticslines_domains", cnt);
            prop.put("statisticslines", 1);
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        // insert constants
        prop.putNum("ucount", segment.urlMetadata().size());
        // return rewrite properties
        return prop;
    }
View Full Code Here

        QueryParams theQuery = null;
        SearchEvent theSearch = null;
        ArrayList<WeakPriorityBlockingQueue.Element<ResultEntry>> accu = null;
        if (query.length() == 0 && abstractSet != null) {
            // this is _not_ a normal search, only a request for index abstracts
            final Segment indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
            theQuery = new QueryParams(
                    null,
                    abstractSet,
                    new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0),
                    null,
                    snippetPattern,
                    null,
                    modifier,
                    maxdist,
                    prefer,
                    ContentDomain.contentdomParser(contentdom),
                    language,
                    "", // no navigation
                    CacheStrategy.CACHEONLY,
                    count,
                    0,
                    filter,
                    QueryParams.Searchdom.LOCAL,
                    -1,
                    null,
                    false,
                    sitehash,
                    authorhash,
                    DigestURI.TLD_any_zone_filter,
                    client,
                    false,
                    indexSegment,
                    rankingProfile,
                    header.get(RequestHeader.USER_AGENT, ""),
                    false
                    );
            Network.log.logInfo("INIT HASH SEARCH (abstracts only): " + QueryParams.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");

            final long timer = System.currentTimeMillis();
            //final Map<byte[], ReferenceContainer<WordReference>>[] containers = sb.indexSegment.index().searchTerm(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2StringSet(urls));
            final TreeMap<byte[], ReferenceContainer<WordReference>> incc = indexSegment.termIndex().searchConjunction(theQuery.queryHashes, QueryParams.hashes2Handles(urls));

            EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(theQuery.id(true), SearchEvent.Type.COLLECTION, "", incc.size(), System.currentTimeMillis() - timer), false);
            if (incc != null) {
                final Iterator<Map.Entry<byte[], ReferenceContainer<WordReference>>> ci = incc.entrySet().iterator();
                Map.Entry<byte[], ReferenceContainer<WordReference>> entry;
View Full Code Here

       
        final serverObjects prop = new serverObjects();
        if ((post == null) || (env == null)) return prop;
        final boolean authenticated = sb.adminAuthenticated(header) >= 2;
       
        Segment segment = null;
        if (post.containsKey("segment") && authenticated) {
            segment = sb.indexSegments.segment(post.get("segment"));
        } else {
            segment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }
       
        final String  querystring = post.get("query", "")// a string of word hashes that shall be searched and combined
        final int     count  = Math.min((authenticated) ? 1000 : 10, post.getInt("maximumRecords", 1000)); // SRU syntax
        final int     maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
        String  language = post.get("language", "");
        if (!ISO639.exists(language)) {
            // take language from the user agent
            String agent = header.get("User-Agent");
            if (agent == null) agent = System.getProperty("user.language");
            language = (agent == null) ? "en" : ISO639.userAgentLanguageDetection(agent);
            if (language == null) language = "en";
        }
        final TreeSet<String>[] query = QueryParams.cleanQuery(querystring); // converts also umlaute
        HandleSet q = Word.words2hashesHandles(query[0]);
       
        // tell all threads to do nothing for a specific time
        sb.intermissionAllThreads(3000);

        // prepare search
        final long timestamp = System.currentTimeMillis();
       
        // prepare an abstract result
        int indexabstractContainercount = 0;
        int joincount = 0;

        // retrieve index containers
        //yacyCore.log.logInfo("INIT TIMELINE SEARCH: " + plasmaSearchQuery.anonymizedQueryHashes(query[0]) + " - " + count + " links");
       
        // get the index container with the result vector
        TermSearch<WordReference> search = null;
        try {
            search = segment.termIndex().query(q, Word.words2hashesHandles(query[1]), null, Segment.wordReferenceFactory, maxdist);
        } catch (RowSpaceExceededException e) {
            Log.logException(e);
        }
        ReferenceContainer<WordReference> index = search.joined();
       
View Full Code Here

    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {

      final Log log = new Log("TERMLIST");
        final serverObjects prop = new serverObjects();
        final Switchboard sb = (Switchboard) env;
        Segment segment = null;
        final boolean delete = post != null && post.containsKey("delete");
        final long mincount = post == null ? 10000 : post.getLong("mincount", 10000);
        if (post != null && post.containsKey("segment") && sb.verifyAuthentication(header)) {
            segment = sb.indexSegments.segment(post.get("segment"));
        }
        if (segment == null) segment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        final Iterator<Rating<byte[]>> i = segment.termIndex().referenceCountIterator(null, false);
        Rating<byte[]> e;
        int c = 0, termnumber = 0;
        byte[] termhash, maxterm = null;
        long count, mem, maxcount = 0, totalmemory = 0;
        String hstring;
        final Row referenceRow = segment.termIndex().referenceRow();
        final int rowsize = referenceRow.objectsize;
        final ArrayList<byte[]> deleteterms = new ArrayList<byte[]>();
        long over1000 = 0, over10000 = 0, over100000 = 0, over1000000 = 0, over10000000 = 0, over100000000 = 0;
        while (i.hasNext()) {
            e = i.next();
            termnumber++;
            count = e.getScore();
            if (count >= 1000) over1000++;
            if (count >= 10000) over10000++;
            if (count >= 100000) over100000++;
            if (count >= 1000000) over1000000++;
            if (count >= 10000000) over10000000++;
            if (count >= 100000000) over100000000++;
            if (count > maxcount) {
                maxcount = count;
                maxterm = e.getObject();
            }
            if (count < mincount) continue;
            termhash = e.getObject();
            if (delete) deleteterms.add(termhash);
            hstring = ASCII.String(termhash);
            mem = 20 + count * rowsize;
            prop.put("terms_" + c + "_termhash", hstring);
            prop.put("terms_" + c + "_count", count);
            prop.put("terms_" + c + "_memory", mem);
            //log.logWarning("termhash: " + hstring + " | count: " + count + " | memory: " + mem);
            c++;
            totalmemory += mem;
        }
        if (delete) {
            for (final byte[] t: deleteterms) {
                try {
                    segment.termIndex().delete(t);
                } catch (final IOException e1) {
                  log.logWarning("Error deleting " + ASCII.String(t), e1);
                    e1.printStackTrace();
                }
            }
View Full Code Here

        final String querystring =  originalquerystring.replace('+', ' ');
        final int timeout = (post == null) ? 300 : post.getInt("timeout", 300);
        final int count = (post == null) ? 20 : post.getInt("count", 20);

        // get segment
        final Segment indexSegment;
        if (post != null && post.containsKey("segment")) {
            final String segmentName = post.get("segment");
            if (sb.indexSegments.segmentExist(segmentName)) {
                indexSegment = sb.indexSegments.segment(segmentName);
            } else {
                // take default segment
                indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
            }
        } else {
            // take default segment
            indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }

        int c = 0;
        if (more ||
                (indexSegment != null &&
                !indexSegment.termIndex().has(Word.word2hash(querystring))))
        {
            final DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex(), new StringBuilder(querystring));
            final Iterator<StringBuilder> meanIt = didYouMean.getSuggestions(timeout, count).iterator();
            String suggestion;
            //[#[query]#,[#{suggestions}##[text]##(eol)#,::#(/eol)##{/suggestions}#]]
            while (c < meanMax && meanIt.hasNext()) {
                suggestion = meanIt.next().toString();
View Full Code Here

      // return variable that accumulates replacements
        final Switchboard sb = (Switchboard) env;
        final serverObjects prop = new serverObjects();

        // get segment
        Segment indexSegment = null;
        if (post != null && post.containsKey("segment")) {
            String segmentName = post.get("segment");
            if (sb.indexSegments.segmentExist(segmentName)) {
                indexSegment = sb.indexSegments.segment(segmentName);
            }
        } else {
            // take default segment
            indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }
       
        if (post == null) {
            prop.put("linkfreq", sb.getConfigLong("defaultLinkReceiveFrequency",30));
            prop.put("wordfreq", sb.getConfigLong("defaultWordReceiveFrequency",10));
            prop.put("dtable", "");
            prop.put("rtable", "");
            prop.putNum("wcount", indexSegment.termIndex().sizesMax());
            prop.putNum("ucount", indexSegment.urlMetadata().size());
            return prop; // be save
        }
       
        if (post.containsKey("indexsharesetting")) {
            sb.setConfig(SwitchboardConstants.INDEX_DIST_ALLOW, post.containsKey("distribute"));
            sb.setConfig("allowReceiveIndex", post.containsKey("receive"));
            sb.setConfig("defaultLinkReceiveFrequency", post.getInt("linkfreq", 30));
            sb.setConfig("defaultWordReceiveFrequency", post.getInt("wordfreq", 10));
        }

        // insert constants
        prop.putNum("wcount", indexSegment.termIndex().sizesMax());
        prop.putNum("ucount", indexSegment.urlMetadata().size());
       
        // return rewrite properties
        return prop;
    }
View Full Code Here

        CacheStrategy snippetFetchStrategy = (post == null) ? null : CacheStrategy.parse(post.get("verify", "cacheonly"));
        final servletProperties prop = new servletProperties();
        prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 1 : 0);

        // get segment
        Segment indexSegment = null;
        if (post != null && post.containsKey("segment")) {
            final String segmentName = post.get("segment");
            if (sb.indexSegments.segmentExist(segmentName)) {
                indexSegment = sb.indexSegments.segment(segmentName);
            }
        } else {
            // take default segment
            indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }

        final String EXT = header.get("EXT", "");
        final boolean rss = EXT.equals("rss");
        final boolean json = EXT.equals("json");
        prop.put("promoteSearchPageGreeting", promoteSearchPageGreeting);
        prop.put("promoteSearchPageGreeting.homepage", sb.getConfig(SwitchboardConstants.GREETING_HOMEPAGE, ""));
        prop.put("promoteSearchPageGreeting.smallImage", sb.getConfig(SwitchboardConstants.GREETING_SMALL_IMAGE, ""));
        if (post == null || indexSegment == null || env == null || !searchAllowed) {
            // we create empty entries for template strings
            prop.put("searchagain", "0");
            prop.put("former", "");
            prop.put("count", "10");
            prop.put("offset", "0");
            prop.put("resource", "global");
            prop.put("urlmaskfilter", (post == null) ? ".*" : post.get("urlmaskfilter", ".*"));
            prop.put("prefermaskfilter", (post == null) ? "" : post.get("prefermaskfilter", ""));
            prop.put("tenant", (post == null) ? "" : post.get("tenant", ""));
            prop.put("indexof", "off");
            prop.put("constraint", "");
            prop.put("cat", "href");
            prop.put("depth", "0");
            prop.put("search.verify", (post == null) ? sb.getConfig("search.verify", "iffresh") : post.get("verify", "iffresh"));
            prop.put("search.navigation", (post == null) ? sb.getConfig("search.navigation", "all") : post.get("nav", "all"));
            prop.put("contentdom", "text");
            prop.put("contentdomCheckText", "1");
            prop.put("contentdomCheckAudio", "0");
            prop.put("contentdomCheckVideo", "0");
            prop.put("contentdomCheckImage", "0");
            prop.put("contentdomCheckApp", "0");
            prop.put("excluded", "0");
            prop.put("results", "");
            prop.put("resultTable", "0");
            prop.put("num-results", searchAllowed ? "0" : "4");
            prop.put("num-results_totalcount", 0);
            prop.put("num-results_offset", 0);
            prop.put("num-results_itemsPerPage", 10);
            prop.put("geoinfo", "0");
            prop.put("rss_queryenc", "");
            prop.put("meanCount", 5);
            return prop;
        }

        // check for JSONP
        if (post.containsKey("callback")) {
          final String jsonp = post.get("callback")+ "([";
          prop.put("jsonp-start", jsonp);
          prop.put("jsonp-end", "])");
        } else {
          prop.put("jsonp-start", "");
          prop.put("jsonp-end", "");
        }

        // Adding CORS Access header for yacysearch.rss output
        if (rss) {
            final ResponseHeader outgoingHeader = new ResponseHeader();
            outgoingHeader.put(HeaderFramework.CORS_ALLOW_ORIGIN, "*");
            prop.setOutgoingHeader(outgoingHeader);
        }

        // collect search attributes

        int maximumRecords = Math.min((authenticated) ? (snippetFetchStrategy != null && snippetFetchStrategy.isAllowedToFetchOnline() ? 100 : 5000) : (snippetFetchStrategy != null && snippetFetchStrategy.isAllowedToFetchOnline() ? 20 : 1000), post.getInt("maximumRecords", post.getInt("count", 10))); // SRU syntax with old property as alternative
        int startRecord = post.getInt("startRecord", post.getInt("offset", 0));

        boolean global = post.get("resource", "local").equals("global") && sb.peers.sizeConnected() > 0;
        final boolean indexof = (post != null && post.get("indexof","").equals("on"));

        final String originalUrlMask;
        if (post.containsKey("urlmaskfilter")) {
            originalUrlMask = post.get("urlmaskfilter", ".*");
        } else {
            originalUrlMask = ".*";
        }

        String prefermask = (post == null) ? "" : post.get("prefermaskfilter", "");
        if (!prefermask.isEmpty() && prefermask.indexOf(".*",0) < 0) {
            prefermask = ".*" + prefermask + ".*";
        }

        Bitfield constraint = (post != null && post.containsKey("constraint") && !post.get("constraint", "").isEmpty()) ? new Bitfield(4, post.get("constraint", "______")) : null;
        if (indexof) {
            constraint = new Bitfield(4);
            constraint.set(Condenser.flag_cat_indexof, true);
        }

        // SEARCH
        final boolean clustersearch = sb.isRobinsonMode() && (sb.getConfig("cluster.mode", "").equals("privatecluster") || sb.getConfig("cluster.mode", "").equals("publiccluster"));
        final boolean indexReceiveGranted = sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW, true) || sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_AUTODISABLED, true) || clustersearch;
        global = global && indexReceiveGranted; // if the user does not want indexes from remote peers, it cannot be a global searchnn

        // increase search statistic counter
        if (!global) {
            // we count only searches on the local peer here, because global searches
            // are counted on the target peer to preserve privacy of the searcher
            if (authenticated) {
                // local or authenticated search requests are counted separately
                // because they are not part of a public available peer statistic
                sb.searchQueriesRobinsonFromLocal++;
            } else {
                // robinson-searches from non-authenticated requests are public
                // and may be part of the public available statistic
                sb.searchQueriesRobinsonFromRemote++;
            }
        }

        // find search domain
        final ContentDomain contentdom = ContentDomain.contentdomParser(post == null ? "text" : post.get("contentdom", "text"));

        // patch until better search profiles are available
        if (contentdom == ContentDomain.TEXT) {
            if (maximumRecords > 50) maximumRecords = 10;
        } else {
            if (maximumRecords <= 32) maximumRecords = 64;
        }

        // check the search tracker
        TreeSet<Long> trackerHandles = sb.localSearchTracker.get(client);
        if (trackerHandles == null) {
            trackerHandles = new TreeSet<Long>();
        }
        boolean block = false;
        if (Domains.matchesList(client, sb.networkBlacklist)) {
            global = false;
            if (snippetFetchStrategy != null) {
                snippetFetchStrategy = null;
            }
            block = true;
            Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: BLACKLISTED CLIENT FROM " + client + " gets no permission to search");
        } else if (Domains.matchesList(client, sb.networkWhitelist)) {
            Log.logInfo("LOCAL_SEARCH", "ACCESS CONTROL: WHITELISTED CLIENT FROM " + client + " gets no search restrictions");
        } else if (!authenticated && !localhostAccess) {
            // in case that we do a global search or we want to fetch snippets, we check for DoS cases
            synchronized (trackerHandles) {
                final int accInThreeSeconds = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 3000)).size();
                final int accInOneMinute = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 60000)).size();
                final int accInTenMinutes = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size();
                // protections against too strong YaCy network load, reduces remote search
                if (global) {
                    if (accInTenMinutes >= 60 || accInOneMinute >= 6 || accInThreeSeconds >= 1) {
                        global = false;
                        Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " + client + ": " + accInThreeSeconds + "/3s, " + accInOneMinute + "/60s, " + accInTenMinutes + "/600s, " + " requests, disallowed global search");
                    }
                }
                // protection against too many remote server snippet loads (protects traffic on server)
                if (snippetFetchStrategy != null && snippetFetchStrategy.isAllowedToFetchOnline()) {
                    if (accInTenMinutes >= 20 || accInOneMinute >= 4 || accInThreeSeconds >= 1) {
                        snippetFetchStrategy = CacheStrategy.CACHEONLY;
                        Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " + client + ": " + accInThreeSeconds + "/3s, " + accInOneMinute + "/60s, " + accInTenMinutes + "/600s, " + " requests, disallowed remote snippet loading");
                    }
                }
                // general load protection
                if (accInTenMinutes >= 3000 || accInOneMinute >= 600 || accInThreeSeconds >= 60) {
                    block = true;
                    Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " + client + ": " + accInThreeSeconds + "/3s, " + accInOneMinute + "/60s, " + accInTenMinutes + "/600s, " + " requests, disallowed search");
                }
            }
        }

        if (!block && (post == null || post.get("cat", "href").equals("href"))) {
            String urlmask = null;

            // check available memory and clean up if necessary
            if (!MemoryControl.request(8000000L, false)) {
                indexSegment.urlMetadata().clearCache();
                SearchEventCache.cleanupEvents(true);
            }

            final RankingProfile ranking = sb.getRanking();
            final StringBuilder modifier = new StringBuilder(20);

            if (querystring.indexOf("/near",0) >= 0) {
              querystring = querystring.replace("/near", "");
              ranking.coeff_worddistance = RankingProfile.COEFF_MAX;
              modifier.append("/near ");
            }
            if (querystring.indexOf("/date",0) >= 0) {
                querystring = querystring.replace("/date", "");
                ranking.coeff_date = RankingProfile.COEFF_MAX;
                modifier.append("/date ");
            }
            if (querystring.indexOf("/http",0) >= 0) {
                querystring = querystring.replace("/http", "");
                urlmask = "https?://.*";
                modifier.append("/http ");
            }
            if (querystring.indexOf("/https",0) >= 0) {
                querystring = querystring.replace("/https", "");
                urlmask = "https?://.*";
                modifier.append("/https ");
            }
            if (querystring.indexOf("/ftp",0) >= 0) {
                querystring = querystring.replace("/ftp", "");
                urlmask = "ftp://.*";
                modifier.append("/ftp ");
            }
            if (querystring.indexOf("/smb",0) >= 0) {
                querystring = querystring.replace("/smb", "");
                urlmask = "smb://.*";
                modifier.append("/smb ");
            }
            if (querystring.indexOf("/file",0) >= 0) {
                querystring = querystring.replace("/file", "");
                urlmask = "file://.*";
                modifier.append("/file ");
            }
            if (querystring.indexOf("/location",0) >= 0) {
                querystring = querystring.replace("/location", "");
                if (constraint == null) {
                    constraint = new Bitfield(4);
                }
                constraint.set(Condenser.flag_cat_haslocation, true);
                modifier.append("/location ");
            }
            final int lrp = querystring.indexOf("/language/",0);
            String language = "";
            if (lrp >= 0) {
                if (querystring.length() >= (lrp + 12)) {
                    language = querystring.substring(lrp + 10, lrp + 12);
                }
                querystring = querystring.replace("/language/" + language, "");
                language = language.toLowerCase();
                modifier.append("/language/").append(language).append(" ");
            }
            final int inurl = querystring.indexOf("inurl:",0);
            if (inurl >= 0) {
                int ftb = querystring.indexOf(' ', inurl);
                if (ftb == -1) {
                    ftb = querystring.length();
                }
                final String urlstr = querystring.substring(inurl + 6, ftb);
                querystring = querystring.replace("inurl:" + urlstr, "");
                if (!urlstr.isEmpty()) {
                    urlmask = urlmask == null ? ".*" + urlstr + ".*" : urlmask + urlstr + ".*";
                }
                modifier.append("inurl:").append(urlstr).append(" ");
            }
            final int filetype = querystring.indexOf("filetype:",0);
            if (filetype >= 0) {
                int ftb = querystring.indexOf(' ', filetype);
                if (ftb == -1) {
                    ftb = querystring.length();
                }
                String ft = querystring.substring(filetype + 9, ftb);
                querystring = querystring.replace("filetype:" + ft, "");
                while (!ft.isEmpty() && ft.charAt(0) == '.') ft = ft.substring(1);
                if (!ft.isEmpty()) {
                    if (urlmask == null) {
                        urlmask = ".*\\." + ft;
                    } else {
                        urlmask = urlmask + ".*\\." + ft;
                    }
                }
                modifier.append("filetype:").append(ft).append(" ");
            }
            String tenant = null;
            if (post.containsKey("tenant")) {
                tenant = post.get("tenant");
                if (tenant != null && tenant.isEmpty()) {
                    tenant = null;
                }
                if (tenant != null) {
                    if (urlmask == null) {
                        urlmask = ".*" + tenant + ".*";
                    } else urlmask = ".*" + tenant + urlmask;
                }
            }
            final int site = querystring.indexOf("site:",0);
            String sitehash = null;
            String sitehost = null;
            if (site >= 0) {
                int ftb = querystring.indexOf(' ', site);
                if (ftb == -1) {
                    ftb = querystring.length();
                }
                sitehost = querystring.substring(site + 5, ftb);
                querystring = querystring.replace("site:" + sitehost, "");
                while (sitehost.length() > 0 && sitehost.charAt(0) == '.') {
                    sitehost = sitehost.substring(1);
                }
                while (sitehost.endsWith(".")) {
                    sitehost = sitehost.substring(0, sitehost.length() - 1);
                }
                sitehash = DigestURI.hosthash(sitehost);
                modifier.append("site:").append(sitehost).append(" ");
            }

            final int heuristicScroogle = querystring.indexOf("/heuristic/scroogle",0);
            if (heuristicScroogle >= 0) {
                querystring = querystring.replace("/heuristic/scroogle", "");
                modifier.append("/heuristic/scroogle ");
            }

            final int heuristicBlekko = querystring.indexOf("/heuristic/blekko",0);
            if (heuristicBlekko >= 0) {
                querystring = querystring.replace("/heuristic/blekko", "");
                modifier.append("/heuristic/blekko ");
            }

            final int authori = querystring.indexOf("author:",0);
          String authorhash = null;
            if (authori >= 0) {
              // check if the author was given with single quotes or without
              final boolean quotes = (querystring.charAt(authori + 7) == (char) 39);
              String author;
              if (quotes) {
                    int ftb = querystring.indexOf((char) 39, authori + 8);
                    if (ftb == -1) {
                        ftb = querystring.length() + 1;
                    }
                    author = querystring.substring(authori + 8, ftb);
                    querystring = querystring.replace("author:'" + author + "'", "");
                    modifier.append("author:'").append(author).append("' ");
              } else {
                    int ftb = querystring.indexOf(' ', authori);
                    if (ftb == -1) {
                        ftb = querystring.length();
                    }
                    author = querystring.substring(authori + 7, ftb);
                    querystring = querystring.replace("author:" + author, "");
                    modifier.append("author:").append(author).append(" ");
              }
              authorhash = ASCII.String(Word.word2hash(author));
            }
            final int tld = querystring.indexOf("tld:",0);
            if (tld >= 0) {
                int ftb = querystring.indexOf(' ', tld);
                if (ftb == -1) {
                    ftb = querystring.length();
                }
                String domain = querystring.substring(tld + 4, ftb);
                querystring = querystring.replace("tld:" + domain, "");
                modifier.append("tld:").append(domain).append(" ");
                while (domain.length() > 0 && domain.charAt(0) == '.') {
                    domain = domain.substring(1);
                }
                if (domain.indexOf('.',0) < 0) {
                    domain = "\\." + domain;
                } // is tld
                if (domain.length() > 0) {
                    urlmask = "[a-zA-Z]*://[^/]*" + domain + "/.*" + ((urlmask != null) ? urlmask : "");
                }
            }
            if (urlmask == null || urlmask.isEmpty()) {
                urlmask = originalUrlMask;
            } //if no urlmask was given

            // read the language from the language-restrict option 'lr'
            // if no one is given, use the user agent or the system language as default
            language = (post == null) ? language : post.get("lr", language);
            if (language.startsWith("lang_")) {
                language = language.substring(5);
            }
            if (!ISO639.exists(language)) {
                // find out language of the user by reading of the user-agent string
                String agent = header.get(HeaderFramework.ACCEPT_LANGUAGE);
                if (agent == null) {
                    agent = System.getProperty("user.language");
                }
                language = (agent == null) ? "en" : ISO639.userAgentLanguageDetection(agent);
                if (language == null) {
                    language = "en";
                }
            }

            // navigation
            final String navigation = (post == null) ? sb.getConfig("search.navigation", "all") : post.get("nav", "");

            // the query
            final TreeSet<String>[] query = QueryParams.cleanQuery(querystring.trim()); // converts also umlaute

            final int maxDistance = (querystring.indexOf('"',0) >= 0) ? query.length - 1 : Integer.MAX_VALUE;

            // filter out stopwords
            final SortedSet<String> filtered = SetTools.joinConstructive(query[0], Switchboard.stopwords);
            if (!filtered.isEmpty()) {
                SetTools.excludeDestructive(query[0], Switchboard.stopwords);
            }

            // if a minus-button was hit, remove a special reference first
            if (post != null && post.containsKey("deleteref")) {
                try {
                    if (!sb.verifyAuthentication(header)) {
                        prop.put("AUTHENTICATE", "admin log-in"); // force log-in
                        return prop;
                    }

                    // delete the index entry locally
                    final String delHash = post.get("deleteref", ""); // urlhash
                    indexSegment.termIndex().remove(Word.words2hashesHandles(query[0]), delHash.getBytes());

                    // make new news message with negative voting
                    if (!sb.isRobinsonMode()) {
                        final Map<String, String> map = new HashMap<String, String>();
                        map.put("urlhash", delHash);
                        map.put("vote", "negative");
                        map.put("refid", "");
                        sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), NewsPool.CATEGORY_SURFTIPP_VOTE_ADD, map);
                    }

                    // delete the search history since this still shows the entry
                    SearchEventCache.delete(delHash);
                } catch (final IOException e) {
                    Log.logException(e);
                }
            }

            // if a plus-button was hit, create new voting message
            if (post != null && post.containsKey("recommendref")) {
                if (!sb.verifyAuthentication(header)) {
                    prop.put("AUTHENTICATE", "admin log-in"); // force log-in
                    return prop;
                }
                final String recommendHash = post.get("recommendref", ""); // urlhash
                final URIMetadataRow urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(recommendHash));
                if (urlentry != null) {
                    final URIMetadataRow.Components metadata = urlentry.metadata();
                    Document[] documents = null;
                    try {
                        documents = sb.loader.loadDocuments(sb.loader.request(metadata.url(), true, false), CacheStrategy.IFEXIST, 5000, Integer.MAX_VALUE);
                    } catch (final IOException e) {
                    } catch (final Parser.Failure e) {
                    }
                    if (documents != null) {
                        // create a news message
                        final Map<String, String> map = new HashMap<String, String>();
                        map.put("url", metadata.url().toNormalform(false, true).replace(',', '|'));
                        map.put("title", metadata.dc_title().replace(',', ' '));
                        map.put("description", documents[0].dc_title().replace(',', ' '));
                        map.put("author", documents[0].dc_creator());
                        map.put("tags", documents[0].dc_subject(' '));
                        sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), NewsPool.CATEGORY_SURFTIPP_ADD, map);
                        documents[0].close();
                    }
                }
            }

            // if a bookmarks-button was hit, create new bookmark entry
            if (post != null && post.containsKey("bookmarkref")) {
                if (!sb.verifyAuthentication(header)) {
                    prop.put("AUTHENTICATE", "admin log-in"); // force log-in
                    return prop;
                }
                final String bookmarkHash = post.get("bookmarkref", ""); // urlhash
                final URIMetadataRow urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(bookmarkHash));
                if (urlentry != null) {
                    final URIMetadataRow.Components metadata = urlentry.metadata();
                    try {
                        sb.tables.bookmarks.createBookmark(sb.loader, metadata.url(), YMarkTables.USER_ADMIN, true, "searchresult", "/search");
                    } catch (final Throwable e) {
                    }
                }
            }

            // do the search
            final HandleSet queryHashes = Word.words2hashesHandles(query[0]);
            final Pattern snippetPattern = QueryParams.stringSearchPattern(originalquerystring);

            // check filters
            try {
                Pattern.compile(urlmask);
            } catch (final PatternSyntaxException ex) {
                Log.logWarning("SEARCH", "Illegal URL mask, not a valid regex: " + urlmask);
                prop.put("urlmaskerror", 1);
                prop.putHTML("urlmaskerror_urlmask", urlmask);
                urlmask = ".*";
            }

            try {
                Pattern.compile(prefermask);
            } catch (final PatternSyntaxException ex) {
                Log.logWarning("SEARCH", "Illegal prefer mask, not a valid regex: " + prefermask);
                prop.put("prefermaskerror", 1);
                prop.putHTML("prefermaskerror_prefermask", prefermask);
                prefermask = "";
            }

            final QueryParams theQuery = new QueryParams(
                    originalquerystring,
                    queryHashes,
                    Word.words2hashesHandles(query[1]),
                    Word.words2hashesHandles(query[2]),
                    snippetPattern,
                    tenant,
                    modifier.toString().trim(),
                    maxDistance,
                    prefermask,
                    contentdom,
                    language,
                    navigation,
                    snippetFetchStrategy,
                    maximumRecords,
                    startRecord,
                    urlmask,
                    clustersearch && global ? QueryParams.Searchdom.CLUSTER :
                    (global && indexReceiveGranted ? QueryParams.Searchdom.GLOBAL : QueryParams.Searchdom.LOCAL),
                    20,
                    constraint,
                    true,
                    sitehash,
                    authorhash,
                    DigestURI.TLD_any_zone_filter,
                    client,
                    authenticated,
                    indexSegment,
                    ranking,
                    header.get(RequestHeader.USER_AGENT, ""),
                    sb.getConfigBool(SwitchboardConstants.SEARCH_VERIFY_DELETE, false) && sb.getConfigBool(SwitchboardConstants.NETWORK_SEARCHVERIFY, false) && sb.peers.mySeed().getFlagAcceptRemoteIndex());
            EventTracker.delete(EventTracker.EClass.SEARCH);
            EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(theQuery.id(true), SearchEvent.Type.INITIALIZATION, "", 0, 0), false);

            // tell all threads to do nothing for a specific time
            sb.intermissionAllThreads(3000);

            // filter out words that appear in bluelist
            theQuery.filterOut(Switchboard.blueList);

            // log
            Log.logInfo("LOCAL_SEARCH", "INIT WORD SEARCH: " + theQuery.queryString + ":" + QueryParams.hashSet2hashString(theQuery.queryHashes) + " - " + theQuery.neededResults() + " links to be computed, " + theQuery.displayResults() + " lines to be displayed");
            EventChannel.channels(EventChannel.LOCALSEARCH).addMessage(new RSSMessage("Local Search Request", theQuery.queryString, ""));
            final long timestamp = System.currentTimeMillis();

            // create a new search event
            if (SearchEventCache.getEvent(theQuery.id(false)) == null) {
                theQuery.setOffset(0); // in case that this is a new search, always start without a offset
                startRecord = 0;
            }
            final SearchEvent theSearch = SearchEventCache.getEvent(
                theQuery, sb.peers, sb.tables, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader,
                (int) sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_MAXCOUNT_USER, sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_MAXCOUNT_DEFAULT, 10)),
                      sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_MAXTIME_USER, sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_MAXTIME_DEFAULT, 3000)),
                (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_ROBINSON, 0),
                (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0));

            if (startRecord == 0) {
                if (sitehost != null && sb.getConfigBool("heuristic.site", false) && authenticated) {
                    sb.heuristicSite(theSearch, sitehost);
                }
                if ((heuristicScroogle >= || sb.getConfigBool("heuristic.scroogle", false)) && authenticated) {
                    sb.heuristicScroogle(theSearch);
                }
                if ((heuristicBlekko >= || sb.getConfigBool("heuristic.blekko", false)) && authenticated) {
                    sb.heuristicRSS("http://blekko.com/ws/$+/rss", theSearch, "blekko");
                }
            }

            // log
            Log.logInfo("LOCAL_SEARCH", "EXIT WORD SEARCH: " + theQuery.queryString + " - " +
                    "local-unfiltered(" + theSearch.getRankingResult().getLocalIndexCount() + "), " +
                    "local_miss(" + theSearch.getRankingResult().getMissCount() + "), " +
                    "local_sortout(" + theSearch.getRankingResult().getSortOutCount() + "), " +
                    "remote(" + theSearch.getRankingResult().getRemoteResourceSize() + ") links found, " +
                    (System.currentTimeMillis() - timestamp) + " ms");

            // prepare search statistics
            theQuery.resultcount = theSearch.getRankingResult().getLocalIndexCount() - theSearch.getRankingResult().getMissCount() - theSearch.getRankingResult().getSortOutCount() + theSearch.getRankingResult().getRemoteIndexCount();
            theQuery.searchtime = System.currentTimeMillis() - timestamp;
            theQuery.urlretrievaltime = theSearch.result().getURLRetrievalTime();
            theQuery.snippetcomputationtime = theSearch.result().getSnippetComputationTime();
            AccessTracker.add(AccessTracker.Location.local, theQuery);

            // check suggestions
            final int meanMax = (post != null) ? post.getInt("meanCount", 0) : 0;

            prop.put("meanCount", meanMax);
            if (meanMax > 0 && !json && !rss) {
                final DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex(), new StringBuilder(querystring));
              final Iterator<StringBuilder> meanIt = didYouMean.getSuggestions(100, 5).iterator();
                int meanCount = 0;
                String suggestion;
                while( meanCount<meanMax && meanIt.hasNext()) {
                    suggestion = meanIt.next().toString();
View Full Code Here

TOP

Related Classes of net.yacy.search.index.Segment$ReferenceCleaner

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.