Package net.yacy.kelondro.index

Examples of net.yacy.kelondro.index.HandleSet


            this.removeDelayedURLs.put(termHash, r);
        }
    }

    public void removeDelayed() throws IOException {
        final HandleSet words = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); // a set of url hashes where a worker thread tried to work on, but failed.
        synchronized (this.removeDelayedURLs) {
            for (final byte[] b: this.removeDelayedURLs.keySet()) try {words.put(b);} catch (final RowSpaceExceededException e) {}
        }

        synchronized (this.removeDelayedURLs) {
            for (final byte[] b: words) {
                final HandleSet urls = this.removeDelayedURLs.remove(b);
                if (urls != null) remove(b, urls);
            }
        }
        this.countCache.clear();
    }
View Full Code Here


       
        private acceptRemoteIndexSeedEnum(SeedDB seedDB, final byte[] starthash, int max, boolean alsoMyOwn) {
            this.seedDB = seedDB;
            this.se = getDHTSeeds(seedDB, starthash, yacyVersion.YACY_HANDLES_COLLECTION_INDEX, alsoMyOwn);
            this.remaining = max;
            this.doublecheck = new HandleSet(12, Base64Order.enhancedCoder, 0);
            this.nextSeed = nextInternal();
            this.alsoMyOwn = alsoMyOwn;
        }
View Full Code Here

        /**
         * Creates a reducer that works on the given set of URL hashes.
         *
         * @param urlHashes the set to use; it is stored by reference, not copied
         */
        public RemoveReducer(final HandleSet urlHashes) {
            this.urlHashes = urlHashes;
        }

        public RemoveReducer(final byte[] urlHashBytes) {
            this.urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
            try {
                this.urlHashes.put(urlHashBytes);
            } catch (final RowSpaceExceededException e) {
                Log.logException(e);
            }
View Full Code Here

        public Chunk(final byte[] primaryTarget, final List<Seed> targets) {
            super();
            this.primaryTarget = primaryTarget;
            this.containers = new ReferenceContainerCache<WordReference>(Segment.wordReferenceFactory, Segment.wordOrder, Word.commonHashLength);
            this.references = new TreeMap<byte[], URIMetadataRow>(Base64Order.enhancedCoder);
            this.badReferences = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0);
            this.targets    = targets;
            this.hit = 0;
            this.miss = 0;
        }
View Full Code Here

            // find all hashes that appear in the sentence
            final Map<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null);
            final Iterator<byte[]> j = queryhashes.iterator();
            Integer pos;
            int p, minpos = sentence.length(), maxpos = -1;
            final HandleSet remainingHashes = new HandleSet(queryhashes.row().primaryKeyLength, queryhashes.comparator(), 0);
            while (j.hasNext()) {
                hash = j.next();
                pos = hs.get(hash);
                if (pos == null) {
                    try {
                        remainingHashes.put(hash);
                    } catch (RowSpaceExceededException e) {
                        Log.logException(e);
                    }
                } else {
                    p = pos.intValue();
View Full Code Here

        }

        // try to get the snippet from a document at the cache (or in the web)
        // this requires that the document is parsed after loading
        String textline = null;
        HandleSet remainingHashes = queryhashes;
        { //encapsulate potential expensive sentences
            Collection<StringBuilder> sentences = null;

            // try the solr text first
            if (solrText != null) {
                // compute sentences from solr query
                sentences = Document.getSentences(pre, new ByteArrayInputStream(UTF8.getBytes(solrText)));
            }

            // if no sentences were found, fall back to the content of the re-loaded document
            if (sentences == null) {
                final Document document = loadDocument(loader, comp, queryhashes, cacheStrategy, url, reindexing, source);
                if (document == null) {
                    return;
                }

                // compute sentences from parsed document
                sentences = document.getSentences(pre);
                document.close();

                if (sentences == null) {
                    init(url.hash(), null, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences");
                    return;
                }
            }

            if (this.resultStatus == ResultClass.SOURCE_METADATA) {
                // if we already know that there is a match then use the first lines from the text as snippet
                final StringBuilder s = new StringBuilder(snippetMaxLength);
                for (final StringBuilder t: sentences) {
                    s.append(t).append(' ');
                    if (s.length() >= snippetMaxLength / 4 * 3) break;
                }
                if (s.length() > snippetMaxLength) { s.setLength(snippetMaxLength); s.trimToSize(); }
                init(url.hash(), s.length() > 0 ? s.toString() : this.line, ResultClass.SOURCE_METADATA, null);
                return;
            }

            try {
                final SnippetExtractor tsr = new SnippetExtractor(sentences, queryhashes, snippetMaxLength);
                textline = tsr.getSnippet();
                remainingHashes =  tsr.getRemainingWords();
            } catch (final UnsupportedOperationException e) {
                init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found");
                return;
            }
        } //encapsulate potential expensive sentences END

        // compute snippet from media - attention document closed above!
        //String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
        //String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes);
        //String appline = computeMediaSnippet(document.getApplinks(), queryhashes);
        //String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes);
        //String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes);

        snippetLine = "";
        //if (audioline != null) line += (line.length() == 0) ? audioline : "<br />" + audioline;
        //if (videoline != null) line += (line.length() == 0) ? videoline : "<br />" + videoline;
        //if (appline   != null) line += (line.length() == 0) ? appline   : "<br />" + appline;
        //if (hrefline  != null) line += (line.length() == 0) ? hrefline  : "<br />" + hrefline;
        if (textline  != null) snippetLine += (snippetLine.length() == 0) ? textline  : "<br />" + textline;

        if (snippetLine == null || !remainingHashes.isEmpty()) {
            init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found");
            return;
        }
        if (snippetLine.length() > snippetMaxLength) snippetLine = snippetLine.substring(0, snippetMaxLength);

View Full Code Here

        }
        // then remove the container from the backend
        final ArrayList<ReferenceContainer<WordReference>> rc;
        if (ram) {
            // selection was only from ram, so we have to carefully remove only the selected entries
            final HandleSet urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
            Iterator<WordReference> it;
            for (final ReferenceContainer<WordReference> c: containers) {
                urlHashes.clear();
                it = c.entries();
                while (it.hasNext()) try { urlHashes.put(it.next().urlhash()); } catch (final RowSpaceExceededException e) { Log.logException(e); }
                if (this.log.isFine()) this.log.logFine("selected " + urlHashes.size() + " urls for word '" + ASCII.String(c.getTermHash()) + "'");
                if (!urlHashes.isEmpty()) this.segment.termIndex().remove(c.getTermHash(), urlHashes);
            }
            rc = containers;
        } else {
            // selection was from whole index, so we can just delete the whole container
            // but to avoid race conditions return the results from the deletes
View Full Code Here

    public static int diffurlcol(final String metadataPath, final String statisticFile, final String diffFile) throws IOException, RowSpaceExceededException {
        System.out.println("INDEX DIFF URL-COL startup");
        final HandleMap idx = new HandleMap(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 4, new File(statisticFile));
        final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false);
        final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1000000);
        System.out.println("INDEX DIFF URL-COL loaded dump, starting diff");
        final long start = System.currentTimeMillis();
        long update = start - 7000;
        int count = 0;
        for (final byte[] refhash: mr) {
            if (idx.get(refhash) == -1) {
                // the key exists as urlhash in the URL database, but not in the collection as referenced urlhash
                hs.put(refhash);
            }
            count++;
            if (System.currentTimeMillis() - update > 10000) {
                System.out.println("INDEX DIFF URL-COL running, checked " + count + ", found " + hs.size() + " missing references so far, " + (((System.currentTimeMillis() - start) * (mr.size() - count) / count) / 60000) + " minutes remaining");
                update = System.currentTimeMillis();
            }
        }
        idx.close();
        mr.close();
        System.out.println("INDEX DIFF URL-COL finished diff, starting dump to " + diffFile);
        count = hs.dump(new File(diffFile));
        System.out.println("INDEX DIFF URL-COL finished dump, wrote " + count + " references that occur in the URL-DB, but not in the collection-dump");
        return count;
    }
View Full Code Here

    public static void export(final String metadataPath, final int format, final String export, final String diffFile) throws IOException, RowSpaceExceededException {
        // format: 0=text, 1=html, 2=rss/xml
        System.out.println("URL EXPORT startup");
        final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false);
        final HandleSet hs = (diffFile == null) ? null : new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile));
        System.out.println("URL EXPORT loaded dump, starting export");
        final Export e = mr.export(new File(export), ".*", hs, format, false);
        try {
            e.join();
        } catch (final InterruptedException e1) {
            Log.logException(e1);
        }
        System.out.println("URL EXPORT finished export, wrote " + ((hs == null) ? mr.size() : hs.size()) + " entries");
    }
View Full Code Here

    public static void delete(final String metadataPath, final String diffFile) throws IOException, RowSpaceExceededException {
        System.out.println("URL DELETE startup");
        final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false);
        final int mrSize = mr.size();
        final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile));
        System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize);
        for (final byte[] refhash: hs) {
            mr.remove(refhash);
        }
        System.out.println("URL DELETE finished deletions, " + mr.size() + " entries left in URL database");
    }
View Full Code Here

TOP

Related Classes of net.yacy.kelondro.index.HandleSet

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.