Examples of MultiProtocolURI


Examples of net.yacy.cora.document.MultiProtocolURI

                 if (this.cfos != null) {
                     // parse the file
                     Document[] theDocs;
                     // workaround for relative links in file, normally '#' shall be used behind the location, see
                     // below for reversion of the effects
                     final MultiProtocolURI url = MultiProtocolURI.newURL(doc.dc_source(), this.prefix + "/" + super.filePath);
                     final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
                     theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray());
                    
                     this.doc.addSubDocuments(theDocs);
                 }
View Full Code Here

Examples of net.yacy.cora.document.MultiProtocolURI

        return encoding;
    }
    public static void main(String[] args) {
        // test parsing of a url
        MultiProtocolURI url;
        try {
            url = new MultiProtocolURI(args[0]);
            byte[] content = url.get(ClientIdentification.getUserAgent(), 3000);
            Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
            String title = document[0].dc_title();
            System.out.println(title);
            System.out.println(CharacterCoding.unicode2html(title, false));
        } catch (MalformedURLException e) {
View Full Code Here

Examples of net.yacy.cora.document.MultiProtocolURI

        } catch (IOException e) {
            throw new Parser.Failure("Load error:" + e.getMessage(), url);
        }
       
        final List<Document> docs = new ArrayList<Document>();
        MultiProtocolURI uri;
        Document doc;
        for (final URLEntry item: sitemap) try {
            uri = new MultiProtocolURI(item.loc);
            doc = new Document(
                    uri,
                    TextParser.mimeOf(url),
                    charset,
                    this,
View Full Code Here

Examples of net.yacy.cora.document.MultiProtocolURI

            }

            // images
            final Iterator<ImageEntry> j = document.getImages().values().iterator();
            ImageEntry ientry;
            MultiProtocolURI url;
            while (j.hasNext()) {
                ientry = j.next();
                url = ientry.url();
                if (url == null) continue;
                insertTextToWords(url.toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false, meaningLib);
                insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true, meaningLib);
            }
       
            // finally check all words for missing flag entry
            final Iterator<Map.Entry<String, Word>> k = words.entrySet().iterator();
View Full Code Here

Examples of net.yacy.cora.document.MultiProtocolURI

   
    public static void main(String[] args) {
        try {
            byte[] b = FileUtils.read(new File(args[0]));
            torrentParser parser = new torrentParser();
            Document[] d = parser.parse(new MultiProtocolURI("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b));
            Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib);
            Map<String, Word> w = c.words();
            for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
        } catch (IOException e) {
            e.printStackTrace();
View Full Code Here

Examples of net.yacy.cora.document.MultiProtocolURI

           
        int importCount = 0;
       
        Map<MultiProtocolURI, Properties> links = new HashMap<MultiProtocolURI, Properties>();
        String title;
        MultiProtocolURI url;
        Bookmark bm;
        final Set<String> tags=ListManager.string2set(tag); //this allow multiple default tags
        try {
            //load the links
            final ContentScraper scraper = new ContentScraper(baseURL);        
            //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
            final Writer writer= new TransformerWriter(null,null,scraper, null, false);
            FileUtils.copy(input,writer);
            writer.close();
            links = scraper.getAnchors();          
        } catch (final IOException e) { Log.logWarning("BOOKMARKS", "error during load of links: "+ e.getClass() +" "+ e.getMessage());}
        for (final Entry<MultiProtocolURI, Properties> link: links.entrySet()) {
            url = link.getKey();
            title = link.getValue().getProperty("name", "");
            Log.logInfo("BOOKMARKS", "links.get(url)");
            if ("".equals(title)) {//cannot be displayed
                title = url.toString();
            }
            bm = db.new Bookmark(url.toString());
            bm.setProperty(Bookmark.BOOKMARK_TITLE, title);
            bm.setTags(tags);
            bm.setPublic(importPublic);
            db.saveBookmark(bm);
           
View Full Code Here

Examples of net.yacy.cora.document.MultiProtocolURI

                        }
                        parsedDataText.append("\r\n");
                        parsedData.clear();
                    } else if (key.toUpperCase().startsWith("URL")) {
                        try {
                            final MultiProtocolURI newURL = new MultiProtocolURI(value);
                            Properties p = new Properties();
                            p.put("name", newURL.toString());
                            anchors.put(newURL, p);  
                            //parsedData.put(key,value);
                        } catch (final MalformedURLException ex) {/* ignore this */}                                               
                    } else if (
                            !key.equalsIgnoreCase("BEGIN") &&
View Full Code Here

Examples of net.yacy.cora.document.MultiProtocolURI

    private void resortLinks() {
        if (this.resorted) return;
        synchronized (this) {
            if (this.resorted) return;
            // extract hyperlinks, medialinks and emaillinks from anchorlinks
            MultiProtocolURI url;
            String u;
            int extpos, qpos;
            String ext = null;
            final String thishost = this.source.getHost();
            this.inboundlinks = new HashMap<MultiProtocolURI, String>();
            this.outboundlinks = new HashMap<MultiProtocolURI, String>();
            this.hyperlinks = new HashMap<MultiProtocolURI, String>();
            this.videolinks = new HashMap<MultiProtocolURI, String>();
            this.audiolinks = new HashMap<MultiProtocolURI, String>();
            this.applinks   = new HashMap<MultiProtocolURI, String>();
            this.emaillinks = new HashMap<String, String>();
            final Map<MultiProtocolURI, ImageEntry> collectedImages = new HashMap<MultiProtocolURI, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
            for (final Map.Entry<MultiProtocolURI, ImageEntry> entry: collectedImages.entrySet()) {
                if (entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image");
            }
            for (final Map.Entry<MultiProtocolURI, Properties> entry: this.anchors.entrySet()) {
                url = entry.getKey();
                if (url == null) continue;
                final boolean noindex = entry.getValue().getProperty("rel", "").toLowerCase().indexOf("noindex") >= 0;
                final boolean nofollow = entry.getValue().getProperty("rel", "").toLowerCase().indexOf("nofollow") >= 0;
                if ((thishost == null && url.getHost() == null) ||
                    ((thishost != null && url.getHost() != null) &&
                     (url.getHost().endsWith(thishost) ||
                      (thishost.startsWith("www.") && url.getHost().endsWith(thishost.substring(4)))))) {
                    this.inboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? " nofollow" : ""));
                } else {
                    this.outboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? " nofollow" : ""));
                }
                u = url.toNormalform(true, false);
                final String name = entry.getValue().getProperty("name", "");
                if (u.startsWith("mailto:")) {
                    this.emaillinks.put(u.substring(7), name);
                } else {
                    extpos = u.lastIndexOf('.');
View Full Code Here

Examples of net.yacy.cora.document.MultiProtocolURI

        // links is either a Set of Strings (urls) or a Set of
        // htmlFilterImageEntries
        final Set<String> h = new HashSet<String>();
        Iterator<?> i = links.iterator();
        Object o;
        MultiProtocolURI url;
        String u;
        int pos;
        int l;
        while (i.hasNext())
            try {
                o = i.next();
                if (o instanceof MultiProtocolURI) url = (MultiProtocolURI) o;
                else if (o instanceof String) url = new MultiProtocolURI((String) o);
                else if (o instanceof ImageEntry) url = ((ImageEntry) o).url();
                else {
                    assert false;
                    continue;
                }
                u = url.toNormalform(true, true);
                if (u.endsWith("/"))
                    u = u.substring(0, u.length() - 1);
                pos = u.lastIndexOf('/');
                while (pos > 8) {
                    l = u.length();
                    u = u.substring(0, pos + 1);
                    h.add(u);
                    u = u.substring(0, pos);
                    assert (u.length() < l) : "u = " + u;
                    pos = u.lastIndexOf('/');
                }
            } catch (final MalformedURLException e) { }
        // now convert the strings to yacyURLs
        i = h.iterator();
        final Map<MultiProtocolURI, String> v = new HashMap<MultiProtocolURI, String>();
        while (i.hasNext()) {
            u = (String) i.next();
            try {
                url = new MultiProtocolURI(u);
                v.put(url, "sub");
            } catch (final MalformedURLException e) {
            }
        }
        return v;
View Full Code Here

Examples of net.yacy.cora.document.MultiProtocolURI

        // htmlFilterImageEntries
        // we find all links that are part of a reference inside a url
        final Map<MultiProtocolURI, String> v = new HashMap<MultiProtocolURI, String>();
        final Iterator<?> i = links.iterator();
        Object o;
        MultiProtocolURI url = null;
        String u;
        int pos;
        loop: while (i.hasNext())
            try {
                o = i.next();
                if (o instanceof MultiProtocolURI)
                    url = (MultiProtocolURI) o;
                else if (o instanceof String)
                    url = new MultiProtocolURI((String) o);
                else if (o instanceof ImageEntry)
                    url = ((ImageEntry) o).url();
                else {
                    assert false;
                    continue loop;
                }
                if (url == null) continue loop;
                u = url.toNormalform(true, true);
                if ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) {
                    i.remove();
                    u = u.substring(pos);
                    while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0)
                        u = u.substring(pos);
                    url = new MultiProtocolURI(u);
                    if (!(v.containsKey(url)))
                        v.put(url, "ref");
                    continue loop;
                }
                if ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0) {
                    i.remove();
                    u = "http:/" + u.substring(pos);
                    while ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0)
                        u = "http:/" + u.substring(pos);
                    url = new MultiProtocolURI(u);
                    if (!(v.containsKey(url)))
                        v.put(url, "ref");
                    continue loop;
                }
            } catch (final MalformedURLException e) {
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.