Package net.yacy.cora.document

Examples of net.yacy.cora.document.MultiProtocolURI


            parts.put("wordh", UTF8.StringBody(wordhashes));
            parts.put("lurlEntry", UTF8.StringBody(((entry == null) ? "" : crypt.simpleEncode(entry.toString(), salt))));
            // send request
            // final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/crawlReceipt.html"), 10000, target.getHexHash() + ".yacyh", parts);
            final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), 10000);
            final byte[] content = httpClient.POSTbytes(new MultiProtocolURI("http://" + address + "/yacy/crawlReceipt.html"), target.getHexHash() + ".yacyh", parts, false);
            return FileUtils.table(content);
        } catch (final Exception e) {
            // most probably a network time-out exception
            yacyCore.log.logWarning("yacyClient.crawlReceipt error:" + e.getMessage());
            return null;
View Full Code Here


            parts.put("wordc", UTF8.StringBody(Integer.toString(indexes.size())));
            parts.put("entryc", UTF8.StringBody(Integer.toString(indexcount)));
            parts.put("indexes", UTF8.StringBody(entrypost.toString()));
            // final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/transferRWI.html"), timeout, targetSeed.getHexHash() + ".yacyh", parts, gzipBody);
            final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), timeout);
            final byte[] content = httpClient.POSTbytes(new MultiProtocolURI("http://" + address + "/yacy/transferRWI.html"), targetSeed.getHexHash() + ".yacyh", parts, gzipBody);
            final Iterator<String> v = FileUtils.strings(content);
            // this should return a list of urlhashes that are unknown

            final Map<String, String> result = FileUtils.table(v);
            // return the transfered index data in bytes (for debugging only)
View Full Code Here

        }
        try {
            parts.put("urlc", UTF8.StringBody(Integer.toString(urlc)));
            // final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/transferURL.html"), timeout, targetSeed.getHexHash() + ".yacyh", parts, gzipBody);
            final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), timeout);
            final byte[] content = httpClient.POSTbytes(new MultiProtocolURI("http://" + address + "/yacy/transferURL.html"), targetSeed.getHexHash() + ".yacyh", parts, gzipBody);
            final Iterator<String> v = FileUtils.strings(content);

            final Map<String, String> result = FileUtils.table(v);
            // return the transfered url data in bytes (for debugging only)
            result.put("urlPayloadSize", Integer.toString(urlPayloadSize));
View Full Code Here

        if (address == null) { address = "localhost:8090"; }
        try {
            final Map<String,ContentBody> parts = yacyNetwork.basicRequestParts(Switchboard.getSwitchboard(), targetSeed.hash, salt);
            // final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/profile.html"), 5000, targetSeed.getHexHash() + ".yacyh", parts);
            final HTTPClient httpclient = new HTTPClient(ClientIdentification.getUserAgent(), 5000);
            final byte[] content = httpclient.POSTbytes(new MultiProtocolURI("http://" + address + "/yacy/profile.html"), targetSeed.getHexHash() + ".yacyh", parts, false);
            return FileUtils.table(content);
        } catch (final Exception e) {
            yacyCore.log.logWarning("yacyClient.getProfile error:" + e.getMessage());
            return null;
        }
View Full Code Here

                }
            System.exit(0);
        } else if(args.length == 1) {
            System.out.println("wput Test");
            // connection params
            MultiProtocolURI url = null;
            try {
                url = new MultiProtocolURI(args[0]);
            } catch (final MalformedURLException e) {
                Log.logException(e);
            }
            if (url == null) {
                System.exit(1);
                return;
            }
            final String vhost = url.getHost();
            final int timeout = 10000;
            // new data
            final Map<String,ContentBody> newpost = new LinkedHashMap<String,ContentBody>();
            newpost.put("process", UTF8.StringBody("permission"));
            newpost.put("purpose", UTF8.StringBody("crcon"));
View Full Code Here

            if (secondarySearchSuperviser != null) parts.put("abstracts", UTF8.StringBody("auto"));
            // resultMap = FileUtils.table(HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + hostaddress + "/yacy/search.html"), 60000, hostname, parts));
            //resultMap = FileUtils.table(HTTPConnector.getConnector(MultiProtocolURI.crawlerUserAgent).post(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/search.html"), 60000, target.getHexHash() + ".yacyh", parts));

            final HTTPClient httpClient = new HTTPClient(ClientIdentification.getUserAgent(), 60000);
            resultMap = FileUtils.table(httpClient.POSTbytes(new MultiProtocolURI("http://" + hostaddress + "/yacy/search.html"), hostname, parts, false));

            // evaluate request result
            if (resultMap == null || resultMap.isEmpty()) throw new IOException("resultMap is NULL");
            try {
                this.searchtime = Integer.parseInt(resultMap.get("searchtime"));
View Full Code Here

            //System.out.println("*** Appended dot: " + b.toString());
        }
        // find http links inside text
        s = 0;
        String u;
        MultiProtocolURI url;
        while (s < b.length()) {
            p = find(b, dpssp, s);
            if (p == Integer.MAX_VALUE) break;
            s = Math.max(0, p - 5);
            p = find(b, protp, s);
            if (p == Integer.MAX_VALUE) break;
            q = b.indexOf(" ", p + 1);
            u = b.substring(p, q < 0 ? b.length() : q);
            if (u.endsWith(".")) u = u.substring(0, u.length() - 1); // remove the '.' that was appended above
            s = p + 6;
            try {
                url = new MultiProtocolURI(u);
                this.anchors.put(url, new Properties());
                continue;
            } catch (final MalformedURLException e) {}
        }
        // append string to content
View Full Code Here

            final String src = tagopts.getProperty("src", "");
            try {
                final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
                final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
                if (src.length() > 0) {
                    final MultiProtocolURI url = absolutePath(src);
                    if (url != null) {
                        final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", ""), width, height, -1);
                        addImage(this.images, ie);
                    }
                }
            } catch (final NumberFormatException e) {}
            this.evaluationScores.match(Element.imgpath, src);
        } else if(tagname.equalsIgnoreCase("base")) {
            try {
                this.root = new MultiProtocolURI(tagopts.getProperty("href", ""));
            } catch (final MalformedURLException e) {}
        } else if (tagname.equalsIgnoreCase("frame")) {
            final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", ""));
            this.anchors.put(src, tagopts /* with property "name" */);
            this.frames.add(src);
            this.evaluationScores.match(Element.framepath, src.toNormalform(true, false));
        } else if (tagname.equalsIgnoreCase("body")) {
            final String c = tagopts.getProperty("class", "");
            this.evaluationScores.match(Element.bodyclass, c);
        } else if (tagname.equalsIgnoreCase("div")) {
            final String id = tagopts.getProperty("id", "");
            this.evaluationScores.match(Element.divid, id);
        } else if (tagname.equalsIgnoreCase("meta")) {
            String name = tagopts.getProperty("name", "");
            final String content = tagopts.getProperty("content","");
            if (name.length() > 0) {
                this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
                if (name.equals("generator")) {
                    this.evaluationScores.match(Element.metagenerator, content);
                }
            } else {
                name = tagopts.getProperty("http-equiv", "");
                if (name.length() > 0) {
                    this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
                }
            }
        } else if (tagname.equalsIgnoreCase("area")) {
            final String areatitle = cleanLine(tagopts.getProperty("title",""));
            //String alt   = tagopts.getProperty("alt","");
            final String href  = tagopts.getProperty("href", "");
            final Properties p = new Properties(); p.put("name", areatitle);
            if (href.length() > 0) this.anchors.put(absolutePath(href), p);
        } else if (tagname.equalsIgnoreCase("link")) {
            final String href = tagopts.getProperty("href", "");
            final MultiProtocolURI newLink = absolutePath(href);

            if (newLink != null) {
                final String rel = tagopts.getProperty("rel", "");
                final String linktitle = tagopts.getProperty("title", "");
                final String type = tagopts.getProperty("type", "");
View Full Code Here

    /**
     * Processes a tag together with the text it encloses and records the findings
     * in this scraper's collections.
     * <p>
     * Branches visible in this excerpt (tag names are compared case-insensitively):
     * <ul>
     * <li>{@code a} with text shorter than 2048 chars: the {@code href} is resolved with
     *     {@code absolutePath}; links whose file extension is png, gif, jpg, jpeg, tiff or tif
     *     are stored as images, all other links are stored in {@code anchors}.</li>
     * <li>{@code h1} .. {@code h6} with text shorter than 1024 chars: non-empty parsed text is
     *     added to {@code headlines[0]} .. {@code headlines[5]}.</li>
     * <li>{@code title}: the parsed text replaces {@code this.title}.</li>
     * <li>{@code b}, {@code strong}, {@code i}, {@code li}: non-empty parsed text is counted
     *     in {@code bold} / {@code italic} or added to {@code li}.</li>
     * <li>{@code iframe}: the resolved {@code src} is stored in {@code anchors} and {@code iframes}.</li>
     * <li>{@code script}: a non-empty {@code src} is resolved and added to {@code script};
     *     otherwise the enclosed text is passed to the evaluation scores.</li>
     * </ul>
     * NOTE(review): this excerpt ends after the script branch; any further branches of the
     * method are not visible here and therefore not described.
     *
     * @param tagname name of the tag being processed
     * @param tagopts attributes of the tag; for non-image anchors its "name" property is set
     *                to the parsed text and the same instance is stored in {@code anchors}
     * @param text    characters enclosed by the tag
     */
    public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) {
        // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
        if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
            final String href = tagopts.getProperty("href", "");
            MultiProtocolURI url;
            // only continue when href is present and absolutePath() produced a URL (it may return null)
            if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
                final String f = url.getFile();
                final int p = f.lastIndexOf('.');
                // extension is everything after the last '.', or empty when there is no '.'
                final String type = (p < 0) ? "" : f.substring(p + 1);
                // note: the comparison is case-sensitive, so e.g. "PNG" is treated as a normal anchor
                if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg") || type.equals("tiff") || type.equals("tif")) {
                    // special handling of such urls: put them to the image urls
                    final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1);
                    addImage(this.images, ie);
                } else {
                    // the parsed link text becomes the "name" property of the stored anchor
                    tagopts.put("name", recursiveParse(text));
                    this.anchors.put(url, tagopts);
                }
            }
            // the raw href is scored even when it is empty or could not be resolved
            this.evaluationScores.match(Element.apath, href);
        }
        // h is assigned at most once, in whichever branch of the chain below matches
        final String h;
        if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
            h = recursiveParse(text);
            if (h.length() > 0) this.headlines[0].add(h);
        } else if((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
            h = recursiveParse(text);
            if (h.length() > 0) this.headlines[1].add(h);
        } else if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
            h = recursiveParse(text);
            if (h.length() > 0) this.headlines[2].add(h);
        } else if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
            h = recursiveParse(text);
            if (h.length() > 0) this.headlines[3].add(h);
        } else if ((tagname.equalsIgnoreCase("h5")) && (text.length < 1024)) {
            h = recursiveParse(text);
            if (h.length() > 0) this.headlines[4].add(h);
        } else if ((tagname.equalsIgnoreCase("h6")) && (text.length < 1024)) {
            h = recursiveParse(text);
            if (h.length() > 0) this.headlines[5].add(h);
        } else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
            // unlike the headline branches, the title is stored even when the parsed text is empty
            this.title = recursiveParse(text);
        } else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) {
            h = recursiveParse(text);
            if (h.length() > 0) this.bold.inc(h);
        } else if ((tagname.equalsIgnoreCase("strong")) && (text.length < 1024)) {
            // strong is counted together with b
            h = recursiveParse(text);
            if (h.length() > 0) this.bold.inc(h);
        } else if ((tagname.equalsIgnoreCase("i")) && (text.length < 1024)) {
            h = recursiveParse(text);
            if (h.length() > 0) this.italic.inc(h);
        } else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) {
            h = recursiveParse(text);
            if (h.length() > 0) this.li.add(h);
        } else if (tagname.equalsIgnoreCase("iframe")) {
            // NOTE(review): the 'a' branch above treats a null result of absolutePath() as possible,
            // but here src is stored and dereferenced without a check - confirm that a missing or
            // invalid src attribute cannot lead to a NullPointerException in toNormalform().
            final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", ""));
            this.anchors.put(src, tagopts /* with property "name" */);
            this.iframes.add(src);
            this.evaluationScores.match(Element.iframepath, src.toNormalform(true, false));
        } else if (tagname.equalsIgnoreCase("script")) {
            final String src = tagopts.getProperty("src", "");
            if (src.length() > 0) {
                // external script: remember its resolved location and score the raw src value
                // NOTE(review): the result of absolutePath(src) is added without a null check - confirm this is intended
                this.script.add(absolutePath(src));
                this.evaluationScores.match(Element.scriptpath, src);
            } else {
                // inline script: score the enclosed code itself
                this.evaluationScores.match(Element.scriptcode, text);
            }
View Full Code Here

        // load page
        final byte[] page = FileUtils.read(file);
        if (page == null) throw new IOException("no content in file " + file.toString());

        // scrape document to look up charset
        final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false);
        String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
        if(charset == null)
               charset = Charset.defaultCharset().toString();

        // scrape content
        final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost"));
        final Writer writer = new TransformerWriter(null, null, scraper, null, false);
        FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));

        return scraper;
    }
View Full Code Here

TOP

Related Classes of net.yacy.cora.document.MultiProtocolURI

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.