Package net.yacy.kelondro.data.word

Examples of net.yacy.kelondro.data.word.WordReferenceVars


        this.entry.setCol(col_lapp, Integer.parseInt(prop.getProperty("lapp", "0")));
        this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""), null);
        this.word = null;
        if (prop.containsKey("word")) throw new kelondroException("old database structure is not supported");
        if (prop.containsKey("wi")) {
            this.word = new WordReferenceVars(new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""))));
        }
        this.ranking = 0;
        this.comp = null;
    }
View Full Code Here


        timer = System.currentTimeMillis();
        final boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;

        // apply all constraints
        try {
            WordReferenceVars iEntry;
            final String pattern = this.query.urlMask.pattern();
            final boolean httpPattern = pattern.equals("http://.*");
            final boolean noHttpButProtocolPattern = pattern.equals("https://.*") || pattern.equals("ftp://.*") || pattern.equals("smb://.*") || pattern.equals("file://.*");
            pollloop: while (true) {
                iEntry = decodedEntries.poll(1, TimeUnit.SECONDS);
                if (iEntry == null || iEntry == WordReferenceVars.poison) break pollloop;
                assert (iEntry.urlhash().length == index.row().primaryKeyLength);
                //if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;

                // increase flag counts
                for (int j = 0; j < 32; j++) {
                    if (iEntry.flags().get(j)) {this.flagcount[j]++;}
                }

                // check constraints
                if (!testFlags(iEntry)) {
                    continue pollloop;
                }

                // check document domain
                if (this.query.contentdom != ContentDomain.TEXT) {
                    if ((this.query.contentdom == ContentDomain.AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) { continue pollloop; }
                    if ((this.query.contentdom == ContentDomain.VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) { continue pollloop; }
                    if ((this.query.contentdom == ContentDomain.IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) { continue pollloop; }
                    if ((this.query.contentdom == ContentDomain.APP  ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp  )))) { continue pollloop; }
                }

                // check tld domain
                /*
                if ((DigestURI.domDomain(iEntry.metadataHash()) & this.query.zonecode) == 0) {
                    // filter out all tld that do not match with wanted tld domain
                    this.sortout++;
                    continue;
                }
                */

                // count domZones
                //this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++;

                // check site constraints
                final String hosthash = iEntry.hosthash();
                if (this.query.sitehash == null) {
                    // no site constraint there; maybe collect host navigation information
                    if (nav_hosts && this.query.urlMask_isCatchall) {
                        this.hostNavigator.inc(hosthash);
                        this.hostResolver.put(hosthash, iEntry.urlhash());
                    }
                } else {
                    if (!hosthash.equals(this.query.sitehash)) {
                        // filter out all domains that do not match with the site constraint
                        continue pollloop;
                    }
                }

                // check protocol
                if (!this.query.urlMask_isCatchall) {
                    final boolean httpFlagSet = DigestURI.flag4HTTPset(iEntry.urlHash);
                    if (httpPattern && !httpFlagSet) continue pollloop;
                    if (noHttpButProtocolPattern && httpFlagSet) continue pollloop;
                }

                // finally make a double-check and insert result to stack
                if (this.urlhashes.add(iEntry.urlhash())) {
                    rankingtryloop: while (true) {
                        try {
                            this.stack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
                            break rankingtryloop;
                        } catch (final ArithmeticException e) {
View Full Code Here

        }

        public void run() {
            Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
            ReferenceContainer<WordReference> container = null;
            WordReferenceVars entry = null;
            DigestURI url = null;
            final HandleSet urlHashs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
            try {
                Iterator<ReferenceContainer<WordReference>> indexContainerIterator = Segment.this.termIndex.references(this.startHash, false, 100, false).iterator();
                while (indexContainerIterator.hasNext() && this.run) {
                    waiter();
                    container = indexContainerIterator.next();
                    final Iterator<WordReference> containerIterator = container.entries();
                    this.wordHashNow = container.getTermHash();
                    while (containerIterator.hasNext() && this.run) {
                        waiter();
                        entry = new WordReferenceVars(containerIterator.next());
                        // System.out.println("Wordhash: "+wordHash+" UrlHash:
                        // "+entry.getUrlHash());
                        final URIMetadataRow ue = Segment.this.urlMetadata.load(entry.urlhash());
                        if (ue == null) {
                            urlHashs.put(entry.urlhash());
                        } else {
                            url = ue.metadata().url();
                            if (url == null || Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, url)) {
                                urlHashs.put(entry.urlhash());
                            }
                        }
                    }
                    if (!urlHashs.isEmpty()) try {
                        final int removed = Segment.this.termIndex.remove(container.getTermHash(), urlHashs);
View Full Code Here

                worker[i] = new NormalizeWorker(this.out, termination);
                worker[i].start();
            }

            // fill the queue
            WordReferenceVars iEntry;
            int p = 0;
            try {
                while ((iEntry = vars.take()) != WordReferenceVars.poison) {
                    worker[p % this.threads].add(iEntry);
                    p++;
View Full Code Here

            }
        }

        public void run() {
            try {
                WordReferenceVars iEntry;
                final Map<String, Integer> doms0 = new HashMap<String, Integer>();
                String dom;
                Integer count;
                final Integer int1 = 1;
                while ((iEntry = this.decodedEntries.take()) != WordReferenceVars.poison) {
                    // find min/max
                    if (ReferenceOrder.this.min == null) ReferenceOrder.this.min = iEntry.clone(); else ReferenceOrder.this.min.min(iEntry);
                    if (ReferenceOrder.this.max == null) ReferenceOrder.this.max = iEntry.clone(); else ReferenceOrder.this.max.max(iEntry);
                    this.out.put(iEntry); // must be after the min/max check to prevent that min/max is null in cardinal()
                    // update domcount
                    dom = iEntry.hosthash();
                    count = doms0.get(dom);
                    if (count == null) {
                        doms0.put(dom, int1);
                    } else {
                        doms0.put(dom, LargeNumberCache.valueOf(count.intValue() + 1));
View Full Code Here

        this.entry.setCol(col_lapp, Integer.parseInt(prop.getProperty("lapp", "0")));
        this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""), null);
        this.word = null;
        if (prop.containsKey("word")) throw new kelondroException("old database structure is not supported");
        if (prop.containsKey("wi")) {
            this.word = new WordReferenceVars(new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""))));
        }
        this.ranking = 0;
        this.comp = null;
    }
View Full Code Here

        timer = System.currentTimeMillis();
        final boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts",0) >= 0;

        // apply all constraints
        try {
            WordReferenceVars iEntry;
            final String pattern = this.query.urlMask.pattern();
            final boolean httpPattern = pattern.equals("http://.*");
            final boolean noHttpButProtocolPattern = pattern.equals("https://.*") || pattern.equals("ftp://.*") || pattern.equals("smb://.*") || pattern.equals("file://.*");
            pollloop: while (true) {
                iEntry = decodedEntries.poll(1, TimeUnit.SECONDS);
                if (iEntry == null || iEntry == WordReferenceVars.poison) break pollloop;
                assert (iEntry.urlhash().length == index.row().primaryKeyLength);
                //if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;

                // increase flag counts
                for (int j = 0; j < 32; j++) {
                    if (iEntry.flags().get(j)) {this.flagcount[j]++;}
                }

                // check constraints
                if (!testFlags(iEntry)) {
                    continue pollloop;
                }

                // check document domain
                if (this.query.contentdom != ContentDomain.TEXT) {
                    if ((this.query.contentdom == ContentDomain.AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) { continue pollloop; }
                    if ((this.query.contentdom == ContentDomain.VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) { continue pollloop; }
                    if ((this.query.contentdom == ContentDomain.IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) { continue pollloop; }
                    if ((this.query.contentdom == ContentDomain.APP  ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp  )))) { continue pollloop; }
                }

                // check tld domain
                /*
                if ((DigestURI.domDomain(iEntry.metadataHash()) & this.query.zonecode) == 0) {
                    // filter out all tld that do not match with wanted tld domain
                    this.sortout++;
                    continue;
                }
                */

                // count domZones
                //this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++;

                // check site constraints
                final String hosthash = iEntry.hosthash();
                if (this.query.sitehash == null) {
                    // no site constraint there; maybe collect host navigation information
                    if (nav_hosts && this.query.urlMask_isCatchall) {
                        this.hostNavigator.inc(hosthash);
                        this.hostResolver.put(hosthash, iEntry.urlhash());
                    }
                } else {
                    if (!hosthash.equals(this.query.sitehash)) {
                        // filter out all domains that do not match with the site constraint
                        continue pollloop;
                    }
                }

                // check protocol
                if (!this.query.urlMask_isCatchall) {
                    final boolean httpFlagSet = DigestURI.flag4HTTPset(iEntry.urlHash);
                    if (httpPattern && !httpFlagSet) continue pollloop;
                    if (noHttpButProtocolPattern && httpFlagSet) continue pollloop;
                }

                // finally make a double-check and insert result to stack
                // the url hashes should be unique, no reason to check that
                //if (!this.urlhashes.has(iEntry.urlhash())) {
                    this.urlhashes.putUnique(iEntry.urlhash());
                    rankingtryloop: while (true) {
                        try {
                            this.stack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
                            break rankingtryloop;
                        } catch (final ArithmeticException e) {
View Full Code Here

        }

        public void run() {
            Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
            ReferenceContainer<WordReference> container = null;
            WordReferenceVars entry = null;
            DigestURI url = null;
            final HandleSet urlHashs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
            try {
                Iterator<ReferenceContainer<WordReference>> indexContainerIterator = Segment.this.termIndex.referenceContainer(this.startHash, false, 100, false).iterator();
                while (indexContainerIterator.hasNext() && this.run) {
                    waiter();
                    container = indexContainerIterator.next();
                    final Iterator<WordReference> containerIterator = container.entries();
                    this.wordHashNow = container.getTermHash();
                    while (containerIterator.hasNext() && this.run) {
                        waiter();
                        entry = new WordReferenceVars(containerIterator.next());
                        // System.out.println("Wordhash: "+wordHash+" UrlHash:
                        // "+entry.getUrlHash());
                        final URIMetadataRow ue = Segment.this.urlMetadata.load(entry.urlhash());
                        if (ue == null) {
                            urlHashs.put(entry.urlhash());
                        } else {
                            url = ue.metadata().url();
                            if (url == null || Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, url)) {
                                urlHashs.put(entry.urlhash());
                            }
                        }
                    }
                    if (!urlHashs.isEmpty()) try {
                        final int removed = Segment.this.termIndex.remove(container.getTermHash(), urlHashs);
View Full Code Here

                worker[i] = new NormalizeWorker(this.out, termination);
                worker[i].start();
            }

            // fill the queue
            WordReferenceVars iEntry;
            int p = 0;
            try {
                while ((iEntry = vars.take()) != WordReferenceVars.poison) {
                    worker[p % this.threads].add(iEntry);
                    p++;
View Full Code Here

            }
        }

        public void run() {
            try {
                WordReferenceVars iEntry;
                final Map<String, Integer> doms0 = new HashMap<String, Integer>();
                String dom;
                Integer count;
                final Integer int1 = 1;
                while ((iEntry = this.decodedEntries.take()) != WordReferenceVars.poison) {
                    // find min/max
                    if (ReferenceOrder.this.min == null) ReferenceOrder.this.min = iEntry.clone(); else ReferenceOrder.this.min.min(iEntry);
                    if (ReferenceOrder.this.max == null) ReferenceOrder.this.max = iEntry.clone(); else ReferenceOrder.this.max.max(iEntry);
                    this.out.put(iEntry); // must be after the min/max check to prevent that min/max is null in cardinal()
                    // update domcount
                    dom = iEntry.hosthash();
                    count = doms0.get(dom);
                    if (count == null) {
                        doms0.put(dom, int1);
                    } else {
                        doms0.put(dom, LargeNumberCache.valueOf(count.intValue() + 1));
View Full Code Here

TOP

Related Classes of net.yacy.kelondro.data.word.WordReferenceVars

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.