Package org.apache.nutch.scoring.webgraph.Loops

Examples of org.apache.nutch.scoring.webgraph.Loops.LoopSet


    loopReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb,
      Loops.LOOPS_DIR), conf);

    // get the loopset for a given url, if any
    Text key = new Text(url);
    LoopSet loop = new LoopSet();
    MapFileOutputFormat.getEntry(loopReaders,
      new HashPartitioner<Text, LoopSet>(), key, loop);

    // print out each loop url in the set
    System.out.println(url + ":");
    for (String loopUrl : loop.getLoopSet()) {
      System.out.println("  " + loopUrl);
    }

    // close the readers
    FSUtils.closeReaders(loopReaders);
View Full Code Here


      throws IOException {

      String fromUrl = key.toString();
      List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
      Node node = null;
      LoopSet loops = null;

      // loop through all values aggregating outlinks, saving node and loopset
      while (values.hasNext()) {
        ObjectWritable write = values.next();
        Object obj = write.get();
        if (obj instanceof Node) {
          node = (Node)obj;
        }
        else if (obj instanceof LinkDatum) {
          outlinks.add(WritableUtils.clone((LinkDatum)obj, conf));
        }
        else if (obj instanceof LoopSet) {
          loops = (LoopSet)obj;
        }
      }

      // only collect if there are outlinks
      int numOutlinks = node.getNumOutlinks();
      if (numOutlinks > 0) {

        Set<String> loopSet = (loops != null) ? loops.getLoopSet() : null;
        for (int i = 0; i < outlinks.size(); i++) {
          LinkDatum outlink = outlinks.get(i);
          String toUrl = outlink.getUrl();
         
          // remove any url that is in the loopset, same as LinkRank
View Full Code Here

      throws IOException {

      String fromUrl = key.toString();
      List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
      Node node = null;
      LoopSet loops = null;

      // aggregate outlinks, assign other values
      while (values.hasNext()) {
        ObjectWritable write = values.next();
        Object obj = write.get();
        if (obj instanceof Node) {
          node = (Node)obj;
        }
        else if (obj instanceof LinkDatum) {
          outlinks.add(WritableUtils.clone((LinkDatum)obj, conf));
        }
        else if (obj instanceof LoopSet) {
          loops = (LoopSet)obj;
        }
      }

      // Check for the possibility of a LoopSet object without Node and LinkDatum objects. This can happen
      // with webgraphs that receive deletes (e.g. link.delete.gone and/or URL filters or normalizers) but
      // without an updated Loops database.
      // See: https://issues.apache.org/jira/browse/NUTCH-1299
      if (node == null && loops != null) {
        // Nothing to do
        LOG.warn("LoopSet without Node object received for " + key.toString() + " . You should either not use Loops as input of the LinkRank program or rerun the Loops program over the WebGraph.");
        return;
      }

      // get the number of outlinks and the current inlink and outlink scores
      // from the node of the url
      int numOutlinks = node.getNumOutlinks();
      float inlinkScore = node.getInlinkScore();
      float outlinkScore = node.getOutlinkScore();
      LOG.debug(fromUrl + ": num outlinks " + numOutlinks);

      // can't invert if no outlinks
      if (numOutlinks > 0) {

        Set<String> loopSet = (loops != null) ? loops.getLoopSet() : null;
        for (int i = 0; i < outlinks.size(); i++) {
          LinkDatum outlink = outlinks.get(i);
          String toUrl = outlink.getUrl();

          // remove any url that is contained in the loopset
View Full Code Here

    loopReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb,
      Loops.LOOPS_DIR), getConf());

    // get the loopset for a given url, if any
    Text key = new Text(url);
    LoopSet loop = new LoopSet();
    MapFileOutputFormat.getEntry(loopReaders,
      new HashPartitioner<Text, LoopSet>(), key, loop);

    // print out each loop url in the set
    System.out.println(url + ":");
    for (String loopUrl : loop.getLoopSet()) {
      System.out.println("  " + loopUrl);
    }

    // close the readers
    FSUtils.closeReaders(loopReaders);
View Full Code Here

      throws IOException {

      String fromUrl = key.toString();
      List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
      Node node = null;
      LoopSet loops = null;

      // aggregate outlinks, assign other values
      while (values.hasNext()) {
        ObjectWritable write = values.next();
        Object obj = write.get();
        if (obj instanceof Node) {
          node = (Node)obj;
        }
        else if (obj instanceof LinkDatum) {
          outlinks.add((LinkDatum)WritableUtils.clone((LinkDatum)obj, conf));
        }
        else if (obj instanceof LoopSet) {
          loops = (LoopSet)obj;
        }
      }

      // get the number of outlinks and the current inlink and outlink scores
      // from the node of the url
      int numOutlinks = node.getNumOutlinks();
      float inlinkScore = node.getInlinkScore();
      float outlinkScore = node.getOutlinkScore();
      LOG.debug(fromUrl + ": num outlinks " + numOutlinks);

      // can't invert if no outlinks
      if (numOutlinks > 0) {

        Set<String> loopSet = (loops != null) ? loops.getLoopSet() : null;
        for (int i = 0; i < outlinks.size(); i++) {
          LinkDatum outlink = outlinks.get(i);
          String toUrl = outlink.getUrl();

          // remove any url that is contained in the loopset
View Full Code Here

      throws IOException {

      String fromUrl = key.toString();
      List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
      Node node = null;
      LoopSet loops = null;

      // loop through all values aggregating outlinks, saving node and loopset
      while (values.hasNext()) {
        ObjectWritable write = values.next();
        Object obj = write.get();
        if (obj instanceof Node) {
          node = (Node)obj;
        }
        else if (obj instanceof LinkDatum) {
          outlinks.add((LinkDatum)WritableUtils.clone((LinkDatum)obj, conf));
        }
        else if (obj instanceof LoopSet) {
          loops = (LoopSet)obj;
        }
      }

      // only collect if there are outlinks
      int numOutlinks = node.getNumOutlinks();
      if (numOutlinks > 0) {

        Set<String> loopSet = (loops != null) ? loops.getLoopSet() : null;
        for (int i = 0; i < outlinks.size(); i++) {
          LinkDatum outlink = outlinks.get(i);
          String toUrl = outlink.getUrl();
         
          // remove any url that is in the loopset, same as LinkRank
View Full Code Here

      throws IOException {

      String fromUrl = key.toString();
      List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
      Node node = null;
      LoopSet loops = null;

      // aggregate outlinks, assign other values
      while (values.hasNext()) {
        ObjectWritable write = values.next();
        Object obj = write.get();
        if (obj instanceof Node) {
          node = (Node)obj;
        }
        else if (obj instanceof LinkDatum) {
          outlinks.add((LinkDatum)WritableUtils.clone((LinkDatum)obj, conf));
        }
        else if (obj instanceof LoopSet) {
          loops = (LoopSet)obj;
        }
      }

      // get the number of outlinks and the current inlink and outlink scores
      // from the node of the url
      int numOutlinks = node.getNumOutlinks();
      float inlinkScore = node.getInlinkScore();
      float outlinkScore = node.getOutlinkScore();
      LOG.debug(fromUrl + ": num outlinks " + numOutlinks);

      // can't invert if no outlinks
      if (numOutlinks > 0) {

        Set<String> loopSet = (loops != null) ? loops.getLoopSet() : null;
        for (int i = 0; i < outlinks.size(); i++) {
          LinkDatum outlink = outlinks.get(i);
          String toUrl = outlink.getUrl();

          // remove any url that is contained in the loopset
View Full Code Here

      throws IOException {

      String fromUrl = key.toString();
      List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
      Node node = null;
      LoopSet loops = null;

      // loop through all values aggregating outlinks, saving node and loopset
      while (values.hasNext()) {
        ObjectWritable write = values.next();
        Object obj = write.get();
        if (obj instanceof Node) {
          node = (Node)obj;
        }
        else if (obj instanceof LinkDatum) {
          outlinks.add((LinkDatum)WritableUtils.clone((LinkDatum)obj, conf));
        }
        else if (obj instanceof LoopSet) {
          loops = (LoopSet)obj;
        }
      }

      // only collect if there are outlinks
      int numOutlinks = node.getNumOutlinks();
      if (numOutlinks > 0) {

        Set<String> loopSet = (loops != null) ? loops.getLoopSet() : null;
        for (int i = 0; i < outlinks.size(); i++) {
          LinkDatum outlink = outlinks.get(i);
          String toUrl = outlink.getUrl();
         
          // remove any url that is in the loopset, same as LinkRank
View Full Code Here

      throws IOException {

      String fromUrl = key.toString();
      List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
      Node node = null;
      LoopSet loops = null;

      // aggregate outlinks, assign other values
      while (values.hasNext()) {
        ObjectWritable write = values.next();
        Object obj = write.get();
        if (obj instanceof Node) {
          node = (Node)obj;
        }
        else if (obj instanceof LinkDatum) {
          outlinks.add((LinkDatum)WritableUtils.clone((LinkDatum)obj, conf));
        }
        else if (obj instanceof LoopSet) {
          loops = (LoopSet)obj;
        }
      }

      // get the number of outlinks and the current inlink and outlink scores
      // from the node of the url
      int numOutlinks = node.getNumOutlinks();
      float inlinkScore = node.getInlinkScore();
      float outlinkScore = node.getOutlinkScore();
      LOG.debug(fromUrl + ": num outlinks " + numOutlinks);

      // can't invert if no outlinks
      if (numOutlinks > 0) {

        Set<String> loopSet = (loops != null) ? loops.getLoopSet() : null;
        for (int i = 0; i < outlinks.size(); i++) {
          LinkDatum outlink = outlinks.get(i);
          String toUrl = outlink.getUrl();

          // remove any url that is contained in the loopset
View Full Code Here

    loopReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb,
      Loops.LOOPS_DIR), getConf());

    // get the loopset for a given url, if any
    Text key = new Text(url);
    LoopSet loop = new LoopSet();
    MapFileOutputFormat.getEntry(loopReaders,
      new HashPartitioner<Text, LoopSet>(), key, loop);

    // print out each loop url in the set
    System.out.println(url + ":");
    for (String loopUrl : loop.getLoopSet()) {
      System.out.println("  " + loopUrl);
    }

    // close the readers
    FSUtils.closeReaders(loopReaders);
View Full Code Here

TOP

Related Classes of org.apache.nutch.scoring.webgraph.Loops.LoopSet

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.