Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.Outlink


            Parse parse = new ParseUtil(this.conf).parse(content);
            ParseData theParseData = parse.getData();
            Outlink[] theOutlinks = theParseData.getOutlinks();
           
            for(int count = 0; count < theOutlinks.length; count++) {
              outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor(), this.conf));
            }
           
            resultText += entry.getName() + " " + parse.getText() + " ";
          } catch (ParseException e) {
            if (LOG.isInfoEnabled()) {
View Full Code Here


      setAlbum(value);
    if (name.equals("TPE1-Text"))
      setArtist(value);

    if (name.indexOf("URL Link") > -1) {
      links.add(new Outlink(value, "", this.conf));
    } else if (name.indexOf("Text") > -1) {
      text += value + "\n";
    }

    metadata.set(name, value);
View Full Code Here

                if (r.getLink() != null) {
                    try {
                        // get the outlink
      if (r.getDescription()!= null ) {
          theOutlinks.add(new Outlink(r.getLink(), r.getDescription(), getConf()));
      } else {
          theOutlinks.add(new Outlink(r.getLink(), "", getConf()));
      }
                    } catch (MalformedURLException e) {
                        if (LOG.isWarnEnabled()) {
                          LOG.warn("MalformedURL: " + r.getLink());
                          LOG.warn("Attempting to continue processing outlinks");
                          e.printStackTrace(LogUtil.getWarnStream(LOG));
                        }
                        continue;
                    }
                }

                // now get the descriptions of all the underlying RSS Items and
                // then index them too
                for (int j = 0; j < r.getItems().size(); j++) {
                    RSSItem theRSSItem = (RSSItem) r.getItems().get(j);
                    indexText.append(theRSSItem.getDescription());
                    indexText.append(" ");

                    String whichLink = null;

                    if (theRSSItem.getPermalink() != null)
                        whichLink = theRSSItem.getPermalink();
                    else
                        whichLink = theRSSItem.getLink();

                    if (whichLink != null) {
                        try {
          if (theRSSItem.getDescription()!=null) {
        theOutlinks.add(new Outlink(whichLink, theRSSItem.getDescription(), getConf()));
          } else {
        theOutlinks.add(new Outlink(whichLink, "", getConf()));
          }
                        } catch (MalformedURLException e) {
                            if (LOG.isWarnEnabled()) {
                              LOG.warn("MalformedURL: " + whichLink);
                              LOG.warn("Attempting to continue processing outlinks");
View Full Code Here

        }
        url = url.replaceAll("&amp;", "&");
        if (LOG.isTraceEnabled()) {
          LOG.trace(" - outlink from JS: '" + url + "'");
        }
        outlinks.add(new Outlink(url, anchor, getConf()));
      }
    } catch (Exception ex) {
      // if it is a malformed URL we just throw it away and continue with
      // extraction.
      if (LOG.isErrorEnabled()) { LOG.error("getJSLinks", ex); }
View Full Code Here

      testDOMs[i]= node;
    }
    try {
     answerOutlinks = new Outlink[][]{
         {
           new Outlink("http://www.nutch.org", "anchor", conf),
         },
         {
           new Outlink("http://www.nutch.org/", "home", conf),
           new Outlink("http://www.nutch.org/docs/bot.html", "bots", conf),
         },
         {
           new Outlink("http://www.nutch.org/", "separate this", conf),
           new Outlink("http://www.nutch.org/docs/ok", "from this", conf),
         },
         {
           new Outlink("http://www.nutch.org/", "home", conf),
           new Outlink("http://www.nutch.org/docs/1", "1", conf),
           new Outlink("http://www.nutch.org/docs/2", "2", conf),
         },
         {
           new Outlink("http://www.nutch.org/frames/top.html", "", conf),
           new Outlink("http://www.nutch.org/frames/left.html", "", conf),
           new Outlink("http://www.nutch.org/frames/invalid.html", "", conf),
           new Outlink("http://www.nutch.org/frames/right.html", "", conf),
         },
         {
           new Outlink("http://www.nutch.org/maps/logo.gif", "", conf),
           new Outlink("http://www.nutch.org/index.html", "", conf),
           new Outlink("http://www.nutch.org/maps/#bottom", "", conf),
           new Outlink("http://www.nutch.org/bot.html", "", conf),
           new Outlink("http://www.nutch.org/docs/index.html", "", conf),
         },
         {
             new Outlink("http://www.nutch.org/index.html", "whitespace test", conf),
         },
         {
         },
         {
           new Outlink("http://www.nutch.org/dummy.jsp", "test2", conf),
         },
         {
         },
         {
           new Outlink("http://www.nutch.org/;x", "anchor1", conf),
           new Outlink("http://www.nutch.org/g;x", "anchor2", conf),
           new Outlink("http://www.nutch.org/g;x?y#s", "anchor3", conf)
         },
         {
           new Outlink("http://www.nutch.org/g;something", "anchor1", conf),
           new Outlink("http://www.nutch.org/g;something?y#s", "anchor2", conf),
           new Outlink("http://www.nutch.org/;something?y=1", "anchor3", conf),
           new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4", conf),
           new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5", conf)
         }
      };
  
    } catch (MalformedURLException e) {
       
View Full Code Here

          if (target != null && !noFollow && !post)
            try {
             
              URL url = (base.toString().indexOf(';') > 0) ?
                fixEmbeddedParams(base, target) : new URL(base, target);
              outlinks.add(new Outlink(url.toString(),
                                       linkText.toString().trim(), conf));
            } catch (MalformedURLException e) {
              // don't care
            }
        }
View Full Code Here

                if (r.getLink() != null) {
                    try {
                        // get the outlink
      if (r.getDescription()!= null ) {
          theOutlinks.add(new Outlink(r.getLink(), r.getDescription()));
      } else {
          theOutlinks.add(new Outlink(r.getLink(), ""));
      }
                    } catch (MalformedURLException e) {
                        if (LOG.isWarnEnabled()) {
                          LOG.warn("MalformedURL: " + r.getLink());
                          LOG.warn("Attempting to continue processing outlinks");
                          e.printStackTrace(LogUtil.getWarnStream(LOG));
                        }
                        continue;
                    }
                }

                // now get the descriptions of all the underlying RSS Items and
                // then index them too
                for (int j = 0; j < r.getItems().size(); j++) {
                    RSSItem theRSSItem = (RSSItem) r.getItems().get(j);
                    indexText.append(theRSSItem.getDescription());
                    indexText.append(" ");

                    String whichLink = null;

                    if (theRSSItem.getPermalink() != null)
                        whichLink = theRSSItem.getPermalink();
                    else
                        whichLink = theRSSItem.getLink();

                    if (whichLink != null) {
                        try {
          if (theRSSItem.getDescription()!=null) {
        theOutlinks.add(new Outlink(whichLink, theRSSItem.getDescription()));
          } else {
        theOutlinks.add(new Outlink(whichLink, ""));
          }
                        } catch (MalformedURLException e) {
                            if (LOG.isWarnEnabled()) {
                              LOG.warn("MalformedURL: " + whichLink);
                              LOG.warn("Attempting to continue processing outlinks");
View Full Code Here

        if (url.startsWith("www.")) {
            url = "http://" + url;
        } else url = new URL(baseURL, url).toString();
        url = url.replaceAll("&amp;", "&");
        LOG.fine(" - outlink from JS: '" + url + "'");
        outlinks.add(new Outlink(url, anchor));
      }
    } catch (Exception ex) {
      // if it is a malformed URL we just throw it away and continue with
      // extraction.
      LOG.throwing(JSParseFilter.class.getName(), "getJSLinks", ex);
View Full Code Here

                if (r.getLink() != null) {
                    try {
                        // get the outlink
      if (r.getDescription()!= null ) {
          theOutlinks.add(new Outlink(r.getLink(), r.getDescription()));
      } else {
          theOutlinks.add(new Outlink(r.getLink(), ""));
      }
                    } catch (MalformedURLException e) {
                        LOG.info("nutch:parse-rss:RSSParser Exception: MalformedURL: "
                                        + r.getLink()
                                        + ": Attempting to continue processing outlinks");
                        e.printStackTrace();
                        continue;
                    }
                }

                // now get the descriptions of all the underlying RSS Items and
                // then index them too
                for (int j = 0; j < r.getItems().size(); j++) {
                    RSSItem theRSSItem = (RSSItem) r.getItems().get(j);
                    indexText.append(theRSSItem.getDescription());
                    indexText.append(" ");

                    String whichLink = null;

                    if (theRSSItem.getPermalink() != null)
                        whichLink = theRSSItem.getPermalink();
                    else
                        whichLink = theRSSItem.getLink();

                    if (whichLink != null) {
                        try {
          if (theRSSItem.getDescription()!=null) {
        theOutlinks.add(new Outlink(whichLink, theRSSItem.getDescription()));
          } else {
        theOutlinks.add(new Outlink(whichLink, ""));
          }
                        } catch (MalformedURLException e) {
                            LOG.info("nutch:parse-rss:RSSParser Exception: MalformedURL: "
                                            + whichLink
                                            + ": Attempting to continue processing outlinks");
View Full Code Here

      setAlbum(value);
    if (name.equals("TPE1-Text"))
      setArtist(value);

    if (name.indexOf("URL Link") > -1) {
      links.add(new Outlink(value, ""));
    } else if (name.indexOf("Text") > -1) {
      text += value + "\n";
    }

    metadata.setProperty(name, value);
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.Outlink

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.