Package org.apache.nutch.scoring.webgraph

Examples of org.apache.nutch.scoring.webgraph.Node


     */
    public void reduce(Text key, Iterator<ObjectWritable> values,
      OutputCollector<Text, FieldsWritable> output, Reporter reporter)
      throws IOException {

      Node nodeDb = null;
      List<CrawlDatum> fetchDatums = new ArrayList<CrawlDatum>();
      ParseData parseData = null;
      ParseText parseText = null;
      List<FieldWritable> fieldsList = new ArrayList<FieldWritable>();

View Full Code Here


     */
    public void reduce(Text key, Iterator<ObjectWritable> values,
      OutputCollector<Text, LinkDatum> output, Reporter reporter)
      throws IOException {

      Node node = null;
      List<String> urls = new ArrayList<String>();

      while (values.hasNext()) {
        ObjectWritable objWrite = values.next();
        Object obj = objWrite.get();
        if (obj instanceof Node) {
          node = (Node)obj;
        }
        else if (obj instanceof Text) {
          urls.add(obj.toString());
        }
      }

      if (urls.size() > 0) {
        float score = (node != null) ? node.getInlinkScore() : 0.0f;
        for (String url : urls) {
          LinkDatum datum = new LinkDatum(key.toString());
          datum.setScore(score);
          output.collect(new Text(url), datum);
        }
View Full Code Here

    public void reduce(Text key, Iterator<ObjectWritable> values,
      OutputCollector<Text, LinkDatum> output, Reporter reporter)
      throws IOException {

      List<LinkDatum> outlinkList = new ArrayList<LinkDatum>();
      Node node = null;

      // collect the outlinks while ignoring links with empty anchor text, also
      // assign the node
      while (values.hasNext()) {
        ObjectWritable objWrite = values.next();
        Object obj = objWrite.get();
        if (obj instanceof LinkDatum) {
          LinkDatum next = (LinkDatum)obj;
          String anchor = next.getAnchor();
          if (anchor != null) {
            anchor = anchor.trim();
          }
          if (ignoreEmptyAnchors && (anchor == null || anchor.length() == 0)) {
            continue;
          }
          outlinkList.add(next);
        }
        else if (obj instanceof Node) {
          node = (Node)obj;
        }
      }

      // has to have outlinks to index
      if (node != null && outlinkList.size() > 0) {
        String fromUrl = key.toString();
        float outlinkScore = node.getInlinkScore();
        for (LinkDatum datum : outlinkList) {
          String toUrl = datum.getUrl();
          datum.setUrl(fromUrl);
          datum.setScore(outlinkScore);
          datum.setLinkType(LinkDatum.INLINK);
View Full Code Here

  public void reduce(Text key, Iterator<CrawlDatum> values,
    OutputCollector<Text, CrawlDatum> output, Reporter reporter)
    throws IOException {

    String url = key.toString();
    Node node = null;
    List<CrawlDatum> datums = new ArrayList<CrawlDatum>();

    // get all crawl datums for a given url key, fetch for instance can have
    // more than one under a given key if there are multiple redirects to a
    // given url
View Full Code Here

     */
    public void reduce(Text key, Iterator<ObjectWritable> values,
      OutputCollector<Text, FieldsWritable> output, Reporter reporter)
      throws IOException {

      Node nodeDb = null;
      List<CrawlDatum> fetchDatums = new ArrayList<CrawlDatum>();
      ParseData parseData = null;
      ParseText parseText = null;
      List<FieldWritable> fieldsList = new ArrayList<FieldWritable>();

View Full Code Here

     */
    public void reduce(Text key, Iterator<ObjectWritable> values,
      OutputCollector<Text, LinkDatum> output, Reporter reporter)
      throws IOException {

      Node node = null;
      List<String> urls = new ArrayList<String>();

      while (values.hasNext()) {
        ObjectWritable objWrite = values.next();
        Object obj = objWrite.get();
        if (obj instanceof Node) {
          node = (Node)obj;
        }
        else if (obj instanceof Text) {
          urls.add(obj.toString());
        }
      }

      if (urls.size() > 0) {
        float score = (node != null) ? node.getInlinkScore() : 0.0f;
        for (String url : urls) {
          LinkDatum datum = new LinkDatum(key.toString());
          datum.setScore(score);
          output.collect(new Text(url), datum);
        }
View Full Code Here

  public void reduce(Text key, Iterator<CrawlDatum> values,
    OutputCollector<Text, CrawlDatum> output, Reporter reporter)
    throws IOException {

    String url = key.toString();
    Node node = null;
    List<CrawlDatum> datums = new ArrayList<CrawlDatum>();

    // get all crawl datums for a given url key, fetch for instance can have
    // more than one under a given key if there are multiple redirects to a
    // given url
View Full Code Here

    public void reduce(Text key, Iterator<ObjectWritable> values,
      OutputCollector<Text, LinkDatum> output, Reporter reporter)
      throws IOException {

      List<LinkDatum> outlinkList = new ArrayList<LinkDatum>();
      Node node = null;

      // collect the outlinks while ignoring links with empty anchor text, also
      // assign the node
      while (values.hasNext()) {
        ObjectWritable objWrite = values.next();
        Object obj = objWrite.get();
        if (obj instanceof LinkDatum) {
          LinkDatum next = (LinkDatum)obj;
          String anchor = next.getAnchor();
          if (anchor != null) {
            anchor = anchor.trim();
          }
          if (ignoreEmptyAnchors && (anchor == null || anchor.length() == 0)) {
            continue;
          }
          outlinkList.add(next);
        }
        else if (obj instanceof Node) {
          node = (Node)obj;
        }
      }

      // has to have outlinks to index
      if (node != null && outlinkList.size() > 0) {
        String fromUrl = key.toString();
        float outlinkScore = node.getInlinkScore();
        for (LinkDatum datum : outlinkList) {
          String toUrl = datum.getUrl();
          datum.setUrl(fromUrl);
          datum.setScore(outlinkScore);
          datum.setLinkType(LinkDatum.INLINK);
View Full Code Here

TOP

Related Classes of org.apache.nutch.scoring.webgraph.Node

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.