Package net.yacy.kelondro.data.meta

Examples of net.yacy.kelondro.data.meta.URIMetadataRow


        int size = 0;
        boolean pre = false;

        // get the url hash from which the content should be loaded
        String urlHash = post.get("urlHash", "");
        URIMetadataRow urlEntry = null;
        // get the urlEntry that belongs to the url hash
        if (urlHash.length() > 0 && (urlEntry = indexSegment.urlMetadata().load(ASCII.getBytes(urlHash))) != null) {
            // get the url that belongs to the entry
            final URIMetadataRow.Components metadata = urlEntry.metadata();
            if ((metadata == null) || (metadata.url() == null)) {
                prop.put("error", "3");
                prop.put("viewMode", VIEW_MODE_NO_TEXT);
                return prop;
            }
            url = metadata.url();
            descr = metadata.dc_title();
            //urlEntry.wordCount();
            size = urlEntry.size();
            pre = urlEntry.flags().get(Condenser.flag_cat_indexof);
        }

        prop.put("error_inurldb", urlEntry == null ? 0 : 1);

        // alternatively, get the url simply from a url String
View Full Code Here


            int received = 0;
            int blocked = 0;
            final int sizeBefore = sb.indexSegments.urlMetadata(Segments.Process.DHTIN).size();
            // read the urls from the other properties and store
            String urls;
            URIMetadataRow lEntry;
            for (int i = 0; i < urlc; i++) {
                serverCore.checkInterruption();
               
                // read new lurl-entry
                urls = post.get("url" + i);
                if (urls == null) {
                    if (yacyCore.log.isFine()) yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName);
                    blocked++;
                    continue;
                }

                // parse new lurl-entry
                lEntry = URIMetadataRow.importEntry(urls);
                if (lEntry == null) {
                    yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
                    blocked++;
                    continue;
                }
               
                // check if entry is well-formed
                final URIMetadataRow.Components metadata = lEntry.metadata();
                if (metadata == null || metadata.url() == null) {
                    yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName + "\n\tURL Property: " + urls);
                    blocked++;
                    continue;
                }
               
                // check whether entry is too old
                if (lEntry.freshdate().getTime() <= freshdate) {
                    if (yacyCore.log.isFine()) yacyCore.log.logFine("transerURL: received too old URL from peer " + otherPeerName + ": " + lEntry.freshdate());
                    blocked++;
                    continue;
                }
               
                // check if the entry is blacklisted
                if ((blockBlacklist) && (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_DHT, metadata.url()))) {
                    if (yacyCore.log.isFine()) yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + metadata.url().toNormalform(false, true) + "' from peer " + otherPeerName);
                    lEntry = null;
                    blocked++;
                    continue;
                }
               
                // check if the entry is in our network domain
                final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(metadata.url());
                if (urlRejectReason != null) {
                    if (yacyCore.log.isFine()) yacyCore.log.logFine("transferURL: blocked URL '" + metadata.url() + "' (" + urlRejectReason + ") from peer " + otherPeerName);
                    lEntry = null;
                    blocked++;
                    continue;
                }
               
                // write entry to database
                if (yacyCore.log.isFine()) yacyCore.log.logFine("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.metadata().url().toNormalform(true, false));
                try {
                    sb.indexSegments.urlMetadata(Segments.Process.DHTIN).store(lEntry);
                    ResultURLs.stack(lEntry, iam.getBytes(), iam.getBytes(), EventOrigin.DHT_TRANSFER);
                    if (yacyCore.log.isFine()) yacyCore.log.logFine("transferURL: received URL '" + metadata.url().toNormalform(false, true) + "' from peer " + otherPeerName);
                    received++;
View Full Code Here

                if (!sb.verifyAuthentication(header, true)) {
                    prop.put("AUTHENTICATE", "admin log-in"); // force log-in
                    return prop;
                }
                final String recommendHash = post.get("recommendref", ""); // urlhash
                final URIMetadataRow urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(recommendHash));
                if (urlentry != null) {
                    final URIMetadataRow.Components metadata = urlentry.metadata();
                    Document[] documents = null;
                    try {
                        documents = sb.loader.loadDocuments(sb.loader.request(metadata.url(), true, false), CacheStrategy.IFEXIST, 5000, Long.MAX_VALUE);
                    } catch (final IOException e) {
                    } catch (final Parser.Failure e) {
View Full Code Here

          prop.put("delay", "9999");
          return prop;
      }
       
        // generating a new loaded URL entry
        final URIMetadataRow entry = URIMetadataRow.importEntry(propStr);
        if (entry == null) {
            if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr);
            prop.put("delay", "3600");
            return prop;
        }
       
        final URIMetadataRow.Components metadata = entry.metadata();
        if (metadata.url() == null) {
            if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + ASCII.String(entry.hash()) + " from peer " + iam + "\n\tURL properties: "+ propStr);
            prop.put("delay", "3600");
            return prop;
        }
       
        // check if the entry is in our network domain
        final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(metadata.url());
        if (urlRejectReason != null) {
            if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (" + urlRejectReason + ") for hash " + ASCII.String(entry.hash()) + " from peer " + iam + "\n\tURL properties: "+ propStr);
            prop.put("delay", "9999");
            return prop;
        }
       
        if ("fill".equals(result)) try {
            // put new entry into database
            sb.indexSegments.urlMetadata(Segments.Process.RECEIPTS).store(entry);
            ResultURLs.stack(entry, youare.getBytes(), iam.getBytes(), EventOrigin.REMOTE_RECEIPTS);
            sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work has been done
            if (log.isInfo()) log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + ASCII.String(entry.hash()) + ":" + metadata.url().toNormalform(false, true));

            // ready for more
            prop.put("delay", "10");
            return prop;
        } catch (final IOException e) {
            Log.logException(e);
            prop.put("delay", "3600");
            return prop;
        }

        sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work is transformed into an error case
        sb.crawlQueues.errorURL.push(
                entry.toBalancerEntry(iam),
                youare.getBytes(),
                null,
                0,
                FailCategory.FINAL_LOAD_CONTEXT,
                result + ":" + reason, -1);
View Full Code Here

            // retrieve a list of urls from the LURL-db by a given list of url hashes
            final String urlhashes = post.get("hashes", "");
            if (urlhashes.length() % 12 != 0) return prop;
            final int count = urlhashes.length() / 12;
          int c = 0;
          URIMetadataRow entry;
          URIMetadataRow.Components metadata;
            DigestURI referrer;
            for (int i = 0; i < count; i++) {
                entry = sb.indexSegments.urlMetadata(Segments.Process.PUBLIC).load(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1))));
                if (entry == null) continue;
                // find referrer, if there is one
                referrer = sb.getURL(Segments.Process.PUBLIC, entry.referrerHash());
                // create RSS entry
                metadata = entry.metadata();
                prop.put("item_" + c + "_title", metadata.dc_title());
                prop.putXML("item_" + c + "_link", metadata.url().toNormalform(true, false));
                prop.putXML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true, false));
                prop.putXML("item_" + c + "_description", metadata.dc_title());
                prop.put("item_" + c + "_author", metadata.dc_creator());
                prop.put("item_" + c + "_pubDate", GenericFormatter.SHORT_SECOND_FORMATTER.format(entry.moddate()));
                prop.put("item_" + c + "_guid", ASCII.String(entry.hash()));
                c++;
            }
            prop.put("item", c);
            prop.putXML("response", "ok");
        }
View Full Code Here

            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        if (post.containsKey("urlhashdelete")) {
            final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
            if (entry == null) {
                prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
            } else {
                urlstring = entry.metadata().url().toNormalform(false, true);
                prop.put("urlstring", "");
                sb.urlRemove(segment, urlhash.getBytes());
                prop.putHTML("result", "Removed URL " + urlstring);
            }
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        if (post.containsKey("urldelete")) {
            try {
                urlhash = ASCII.String((new DigestURI(urlstring)).hash());
            } catch (final MalformedURLException e) {
                urlhash = null;
            }
            if ((urlhash == null) || (urlstring == null)) {
                prop.put("result", "No input given; nothing deleted.");
            } else {
                sb.urlRemove(segment, urlhash.getBytes());
                prop.putHTML("result", "Removed URL " + urlstring);
            }
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        if (post.containsKey("urlstringsearch")) {
            try {
                final DigestURI url = new DigestURI(urlstring);
                urlhash = ASCII.String(url.hash());
                prop.put("urlhash", urlhash);
                final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
                if (entry == null) {
                    prop.putHTML("result", "No Entry for URL " + url.toNormalform(true, true));
                    prop.putHTML("urlstring", urlstring);
                    prop.put("urlhash", "");
                } else {
                    prop.putAll(genUrlProfile(segment, entry, urlhash));
                    prop.put("statistics", 0);
                }
            } catch (final MalformedURLException e) {
                prop.putHTML("result", "bad url: " + urlstring);
                prop.put("urlhash", "");
            }
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        if (post.containsKey("urlhashsearch")) {
            final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
            if (entry == null) {
                prop.putHTML("result", "No Entry for URL hash " + urlhash);
            } else {
                prop.putHTML("urlstring", entry.metadata().url().toNormalform(false, true));
                prop.putAll(genUrlProfile(segment, entry, urlhash));
                prop.put("statistics", 0);
            }
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        // generate list
        if (post.containsKey("urlhashsimilar")) {
            try {
                final Iterator<URIMetadataRow> entryIt = new RotateIterator<URIMetadataRow>(segment.urlMetadata().entries(true, urlhash), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), segment.termIndex().sizesMax());
                final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
                URIMetadataRow entry;
                i = 0;
                int rows = 0, cols = 0;
                prop.put("urlhashsimilar", "1");
                while (entryIt.hasNext() && i < 256) {
                    entry = entryIt.next();
                    if (entry == null) break;
                    prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", ASCII.String(entry.hash()));
                    cols++;
                    if (cols==8) {
                        prop.put("urlhashsimilar_rows_"+rows+"_cols", cols);
                        cols = 0;
                        rows++;
View Full Code Here

            prop.put("genUrlProfile", "1");
            prop.put("genUrlProfile_urlhash", urlhash);
            return prop;
        }
        final URIMetadataRow.Components metadata = entry.metadata();
        final URIMetadataRow le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash());
        if (metadata == null || metadata.url() == null) {
            prop.put("genUrlProfile", "1");
            prop.put("genUrlProfile_urlhash", urlhash);
            return prop;
        }
        prop.put("genUrlProfile", "2");
        prop.putHTML("genUrlProfile_urlNormalform", metadata.url().toNormalform(false, true));
        prop.put("genUrlProfile_urlhash", urlhash);
        prop.put("genUrlProfile_urlDescr", metadata.dc_title());
        prop.put("genUrlProfile_moddate", entry.moddate().toString());
        prop.put("genUrlProfile_loaddate", entry.loaddate().toString());
        prop.put("genUrlProfile_referrer", (le == null) ? 0 : 1);
        prop.putHTML("genUrlProfile_referrer_url", (le == null) ? "<unknown>" : le.metadata().url().toNormalform(false, true));
        prop.put("genUrlProfile_referrer_hash", (le == null) ? "" : ASCII.String(le.hash()));
        prop.put("genUrlProfile_doctype", String.valueOf(entry.doctype()));
        prop.put("genUrlProfile_language", entry.language());
        prop.put("genUrlProfile_size", entry.size());
        prop.put("genUrlProfile_wordCount", entry.wordCount());
        return prop;
View Full Code Here

                Log.logException(e);
            }
        }
        if (urlhash == null || urlhash.length() == 0) return prop;
       
        final URIMetadataRow entry = segment.urlMetadata().load(urlhash.getBytes());
        if (entry == null) return prop;

        final URIMetadataRow.Components metadata = entry.metadata();
        if (metadata.url() == null) {
            return prop;
        }
        final URIMetadataRow le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash());
       
        prop.putXML("dc_title", metadata.dc_title());
        prop.putXML("dc_creator", metadata.dc_creator());
        prop.putXML("dc_description", ""); // this is the fulltext part in the surrogate
        prop.putXML("dc_subject", metadata.dc_subject());
        prop.putXML("dc_publisher", metadata.dc_publisher());
        prop.putXML("dc_contributor", "");
        prop.putXML("dc_date", ISO8601Formatter.FORMATTER.format(entry.moddate()));
        prop.putXML("dc_type", String.valueOf(entry.doctype()));
        prop.putXML("dc_identifier", metadata.url().toNormalform(false, true));
        prop.putXML("dc_language", UTF8.String(entry.language()));
        prop.put("geo_lat", metadata.lat());
        prop.put("geo_long", metadata.lon());

        prop.put("yacy_urlhash", metadata.url().hash());
        prop.putXML("yacy_loaddate", entry.loaddate().toString());
        prop.putXML("yacy_referrer_hash", (le == null) ? "" : ASCII.String(le.hash()));
        prop.putXML("yacy_referrer_url", (le == null) ? "" : le.metadata().url().toNormalform(false, true));
        prop.put("yacy_size", entry.size());
        prop.put("yacy_words",entry.wordCount());
       
        // return rewrite properties
        return prop;
View Full Code Here

            prop.put("table_showURL", (showURL) ? "1" : "0");

            boolean dark = true;
            String urlstr, urltxt;
            yacySeed initiatorSeed, executorSeed;
            URIMetadataRow urle;
            URIMetadataRow.Components metadata;

            int cnt = 0;
            final Iterator<Map.Entry<String, InitExecEntry>> i = ResultURLs.results(tabletype);
            Map.Entry<String, InitExecEntry> entry;
            while (i.hasNext()) {
                entry = i.next();
                try {
                    urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(UTF8.getBytes(entry.getKey()));
                    if (urle == null) {
                        Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash " + entry.getKey());
                        urlstr = null;
                        urltxt = null;
                        metadata = null;
                        continue;
                    }
                    metadata = urle.metadata();
                    urlstr = metadata.url().toNormalform(false, true);
                    urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
                   
                    initiatorSeed = entry.getValue() == null || entry.getValue().initiatorHash == null ? null : sb.peers.getConnected(ASCII.String(entry.getValue().initiatorHash));
                    executorSeed = entry.getValue() == null || entry.getValue().executorHash == null ? null : sb.peers.getConnected(ASCII.String(entry.getValue().executorHash));

                    prop.put("table_indexed_" + cnt + "_dark", (dark) ? "1" : "0");
                    prop.put("table_indexed_" + cnt + "_feedbackpage", "CrawlResults.html");
                    prop.put("table_indexed_" + cnt + "_tabletype", tabletype.getCode());
                    prop.put("table_indexed_" + cnt + "_urlhash", entry.getKey());

                    if (showInit) {
                        prop.put("table_indexed_" + cnt + "_showInit", "1");
                        prop.put("table_indexed_" + cnt + "_showInit_initiatorSeed", (initiatorSeed == null) ? "unknown" : initiatorSeed.getName());
                    } else
                        prop.put("table_indexed_" + cnt + "_showInit", "0");

                    if (showExec) {
                        prop.put("table_indexed_" + cnt + "_showExec", "1");
                        prop.put("table_indexed_" + cnt + "_showExec_executorSeed", (executorSeed == null) ? "unknown" : executorSeed.getName());
                    } else
                        prop.put("table_indexed_" + cnt + "_showExec", "0");

                    if (showDate && urle != null) {
                        prop.put("table_indexed_" + cnt + "_showDate", "1");
                        prop.put("table_indexed_" + cnt + "_showDate_modified", daydate(urle.moddate()));
                    } else
                        prop.put("table_indexed_" + cnt + "_showDate", "0");

                    if (showWords && urle != null) {
                        prop.put("table_indexed_" + cnt + "_showWords", "1");
                        prop.put("table_indexed_" + cnt + "_showWords_count", urle.wordCount());
                    } else
                        prop.put("table_indexed_" + cnt + "_showWords", "0");

                    if (showTitle) {
                        prop.put("table_indexed_" + cnt + "_showTitle", (showTitle) ? "1" : "0");
View Full Code Here

                // build urlCache
                final Iterator<WordReference> urlIter = index.entries();
                final TreeMap<byte[], URIMetadataRow> knownURLs = new TreeMap<byte[], URIMetadataRow>(Base64Order.enhancedCoder);
                final HandleSet unknownURLEntries = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, index.size());
                Reference iEntry;
                URIMetadataRow lurl;
                while (urlIter.hasNext()) {
                    iEntry = urlIter.next();
                    lurl = segment.urlMetadata().load(iEntry.urlhash());
                    if (lurl == null) {
                        try {
                            unknownURLEntries.put(iEntry.urlhash());
                        } catch (final RowSpaceExceededException e) {
                            Log.logException(e);
                        }
                        urlIter.remove();
                    } else {
                        knownURLs.put(iEntry.urlhash(), lurl);
                    }
                }

                // make an indexContainerCache
                final ReferenceContainerCache<WordReference> icc = new ReferenceContainerCache<WordReference>(Segment.wordReferenceFactory, Segment.wordOrder, Word.commonHashLength);
                try {
                    icc.add(index);
                } catch (final RowSpaceExceededException e) {
                    Log.logException(e);
                }

                // transport to other peer
                final boolean gzipBody = sb.getConfigBool("indexControl.gzipBody", false);
                final int timeout = (int) sb.getConfigLong("indexControl.timeout", 60000);
                final String error = yacyClient.transferIndex(
                             seed,
                             icc,
                             knownURLs,
                             gzipBody,
                             timeout);
                prop.put("result", (error == null) ? ("Successfully transferred " + knownURLs.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds, " + unknownURLEntries.size() + " URL not found") : "error: " + error);
                index = null;
            } catch (final IOException e) {
                Log.logException(e);
            }

            // generate list
            if (post.containsKey("keyhashsimilar")) try {
                final Iterator<ReferenceContainer<WordReference>> containerIt = segment.termIndex().references(keyhash, true, 256, false).iterator();
                    ReferenceContainer<WordReference> container;
                    i = 0;
                    int rows = 0, cols = 0;
                    prop.put("keyhashsimilar", "1");
                    while (containerIt.hasNext() && i < 256) {
                        container = containerIt.next();
                        prop.put("keyhashsimilar_rows_"+rows+"_cols_"+cols+"_wordHash", container.getTermHash());
                        cols++;
                        if (cols==8) {
                            prop.put("keyhashsimilar_rows_"+rows+"_cols", cols);
                            cols = 0;
                            rows++;
                        }
                        i++;
                    }
                    prop.put("keyhashsimilar_rows_"+rows+"_cols", cols);
                    prop.put("keyhashsimilar_rows", rows + 1);
                    prop.put("result", "");
            } catch (final IOException e) {
                Log.logException(e);
            }

            if (post.containsKey("blacklist")) {
                final String blacklist = post.get("blacklist", "");
                final HandleSet urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, urlb.size());
                if (post.containsKey("blacklisturls")) {
                    PrintWriter pw;
                    try {
                        final String[] supportedBlacklistTypes = env.getConfig("BlackLists.types", "").split(",");
                        pw = new PrintWriter(new FileWriter(new File(ListManager.listsPath, blacklist), true));
                        DigestURI url;
                        for (final byte[] b: urlb) {
                            try {
                                urlHashes.put(b);
                            } catch (final RowSpaceExceededException e) {
                                Log.logException(e);
                            }
                            final URIMetadataRow e = segment.urlMetadata().load(b);
                            segment.urlMetadata().remove(b);
                            if (e != null) {
                                url = e.metadata().url();
                                pw.println(url.getHost() + "/" + url.getFile());
                                for (final String supportedBlacklistType : supportedBlacklistTypes) {
                                    if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklist)) {
                                        Switchboard.urlBlacklist.add(
                                                supportedBlacklistType,
                                                url.getHost(),
                                                url.getFile());
                                    }
                                }
                                SearchEventCache.cleanupEvents(true);
                            }
                        }
                        pw.close();
                    } catch (final IOException e) {
                    }
                }

                if (post.containsKey("blacklistdomains")) {
                    PrintWriter pw;
                    try {
                        final String[] supportedBlacklistTypes = Blacklist.BLACKLIST_TYPES_STRING.split(",");
                        pw = new PrintWriter(new FileWriter(new File(ListManager.listsPath, blacklist), true));
                        DigestURI url;
                        for (final byte[] b: urlb) {
                            try {
                                urlHashes.put(b);
                            } catch (final RowSpaceExceededException e) {
                                Log.logException(e);
                            }
                            final URIMetadataRow e = segment.urlMetadata().load(b);
                            segment.urlMetadata().remove(b);
                            if (e != null) {
                                url = e.metadata().url();
                                pw.println(url.getHost() + "/.*");
                                for (final String supportedBlacklistType : supportedBlacklistTypes) {
                                    if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklist)) {
                                        Switchboard.urlBlacklist.add(
                                                supportedBlacklistType,
View Full Code Here

TOP

Related Classes of net.yacy.kelondro.data.meta.URIMetadataRow

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.