Package datafu.pig.linkanalysis

Source Code of datafu.pig.linkanalysis.PageRank

/*
* Copyright 2010 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package datafu.pig.linkanalysis;

import it.unimi.dsi.fastutil.ints.Int2IntMap;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.pig.Accumulator;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;

import datafu.linkanalysis.PageRank.ProgressIndicator;


/**
* A UDF which implements {@link <a href="http://en.wikipedia.org/wiki/PageRank" target="_blank">PageRank</a>}. 
* Each graph is stored in memory while running the algorithm, with edges optionally
* spilled to disk to conserve memory.  This can be used to distribute the execution of PageRank on a large number of
* reasonable sized graphs.  It does not distribute execuion of PageRank on a single graph.  Each graph is identified
* by an integer valued topic ID.
* <p>
* Example:
* <pre>
* {@code
*
* topic_edges = LOAD 'input_edges' as (topic:INT,source:INT,dest:INT,weight:DOUBLE);
*
* topic_edges_grouped = GROUP topic_edges by (topic, source) ;
* topic_edges_grouped = FOREACH topic_edges_grouped GENERATE
*    group.topic as topic,
*    group.source as source,
*    topic_edges.(dest,weight) as edges;
*
* topic_edges_grouped_by_topic = GROUP topic_edges_grouped BY topic;
*
* topic_ranks = FOREACH topic_edges_grouped_by_topic GENERATE
*    group as topic,
*    FLATTEN(PageRank(topic_edges_grouped.(source,edges))) as (source,rank);
*
* skill_ranks = FOREACH skill_ranks GENERATE
*    topic, source, rank;
*
* }
* </pre>
*/
public class PageRank extends EvalFunc<DataBag> implements Accumulator<DataBag>
{
  private final datafu.linkanalysis.PageRank graph = new datafu.linkanalysis.PageRank();

  private int maxNodesAndEdges = 100000000;
  private int maxEdgesInMemory = 30000000;
  private double tolerance = 1e-16;
  private int maxIters = 150;
  private boolean useEdgeDiskStorage = false;
  private boolean enableDanglingNodeHandling = false;
  private boolean aborted = false;

  TupleFactory tupleFactory = TupleFactory.getInstance();
  BagFactory bagFactory = BagFactory.getInstance();
 
  public PageRank()
  {
    initialize();
  }

  public PageRank(String... parameters)
  {
    if (parameters.length % 2 != 0)
    {
      throw new RuntimeException("Invalid parameters list");
    }

    for (int i=0; i<parameters.length; i+=2)
    {
      String parameterName = parameters[i];
      String value = parameters[i+1];
      if (parameterName.equals("max_nodes_and_edges"))
      {
        maxNodesAndEdges = Integer.parseInt(value);
      }
      else if (parameterName.equals("max_edges_in_memory"))
      {
        maxEdgesInMemory = Integer.parseInt(value);
      }
      else if (parameterName.equals("tolerance"))
      {
        tolerance = Double.parseDouble(value);
      }
      else if (parameterName.equals("max_iters"))
      {
        maxIters = Integer.parseInt(value);
      }
      else if (parameterName.equals("spill_to_edge_disk_storage"))
      {
        useEdgeDiskStorage = Boolean.parseBoolean(value);
      }
      else if (parameterName.equals("dangling_nodes"))
      {
        enableDanglingNodeHandling = Boolean.parseBoolean(value);
      }
    }

    initialize();
  }

  private void initialize()
  {
    long heapSize = Runtime.getRuntime().totalMemory();
    long heapMaxSize = Runtime.getRuntime().maxMemory();
    long heapFreeSize = Runtime.getRuntime().freeMemory();
//    System.out.println(String.format("Heap size: %d, Max heap size: %d, Heap free size: %d", heapSize, heapMaxSize, heapFreeSize));

    if (useEdgeDiskStorage)
    {
      this.graph.enableEdgeDiskCaching();
    }
    else
    {
      this.graph.disableEdgeDiskCaching();
    }

    if (enableDanglingNodeHandling)
    {
      this.graph.enableDanglingNodeHandling();
    }
    else
    {
      this.graph.disableDanglingNodeHandling();
    }

    this.graph.setEdgeCachingThreshold(maxEdgesInMemory);
  }

  @Override
  public void accumulate(Tuple t) throws IOException
  {
    if (aborted)
    {
      return;
    }
   
    DataBag bag = (DataBag) t.get(0);
    if (bag == null || bag.size() == 0)
      return;
   
    for (Tuple sourceTuple : bag)
    {
      Integer sourceId = (Integer)sourceTuple.get(0);
      DataBag edges = (DataBag)sourceTuple.get(1);

      ArrayList<Map<String,Object>> edgesMapList = new ArrayList<Map<String, Object>>();

      for (Tuple edgeTuple : edges)
      {
        Integer destId = (Integer)edgeTuple.get(0);
        Double weight = (Double)edgeTuple.get(1);
        HashMap<String,Object> edgeMap = new HashMap<String, Object>();
        edgeMap.put("dest",destId);
        edgeMap.put("weight",weight);
        edgesMapList.add(edgeMap);
      }

      graph.addEdges(sourceId, edgesMapList);

      if (graph.nodeCount() + graph.edgeCount() > maxNodesAndEdges)
      {
        System.out.println(String.format("There are too many nodes and edges (%d + %d > %d). Aborting.", graph.nodeCount(), graph.edgeCount(), maxNodesAndEdges));
        aborted = true;
      }

      reporter.progress();
    }
  }

  @Override
  public DataBag getValue()
  {
    if (aborted)
    {
      return null;
    }
   
    System.out.println(String.format("Nodes: %d, Edges: %d", graph.nodeCount(), graph.edgeCount()));
   
    ProgressIndicator progressIndicator = getProgressIndicator();
    System.out.println("Finished loading graph.");
    long startTime = System.nanoTime();
    System.out.println("Initializing.");
    try
    {
      graph.init(progressIndicator);
    }
    catch (IOException e)
    {
      e.printStackTrace();
      return null;
    }
    System.out.println(String.format("Done, took %f ms", (System.nanoTime() - startTime)/10.0e6));

    float totalDiff;
    int iter = 0;

    System.out.println("Beginning iterations");
    startTime = System.nanoTime();
    do
    {
      // TODO log percentage complete every 5 minutes
      try
      {
        totalDiff = graph.nextIteration(progressIndicator);
      }
      catch (IOException e)
      {
        e.printStackTrace();
        return null;
      }
      iter++;
    } while(iter < maxIters && totalDiff > tolerance);
    System.out.println(String.format("Done, %d iterations took %f ms", iter, (System.nanoTime() - startTime)/10.0e6));

    DataBag output = bagFactory.newDefaultBag();

    for (Int2IntMap.Entry node : graph.getNodeIds())
    {
      int nodeId = node.getIntKey();
      float rank = graph.getNodeRank(nodeId);
      List nodeData = new ArrayList(2);
      nodeData.add(nodeId);
      nodeData.add(rank);
      output.add(tupleFactory.newTuple(nodeData));
    }

    return output;
  }

  @Override
  public void cleanup()
  {
    try
    {
      aborted = false;
      this.graph.clear();
    }
    catch (IOException e)
    {
      e.printStackTrace();
    }
  }

  @Override
  public DataBag exec(Tuple input) throws IOException
  {
    try
    {
      accumulate(input);
     
      return getValue();
    }
    finally
    {
      cleanup();
    }
  }

  private ProgressIndicator getProgressIndicator()
  {
    return new ProgressIndicator()
        {
          @Override
          public void progress()
          {
            reporter.progress();
          }
        };
  }

  @Override
  public Schema outputSchema(Schema input)
  {
    try
    {
      Schema.FieldSchema inputFieldSchema = input.getField(0);

      if (inputFieldSchema.type != DataType.BAG)
      {
        throw new RuntimeException("Expected a BAG as input");
      }

      Schema inputBagSchema = inputFieldSchema.schema;

      if (inputBagSchema.getField(0).type != DataType.TUPLE)
      {
        throw new RuntimeException(String.format("Expected input bag to contain a TUPLE, but instead found %s",
                                                 DataType.findTypeName(inputBagSchema.getField(0).type)));
      }
     
      Schema inputTupleSchema = inputBagSchema.getField(0).schema;
     
      if (inputTupleSchema.getField(0).type != DataType.INTEGER)
      {
        throw new RuntimeException(String.format("Expected source to be an INTEGER, but instead found %s",
                                                 DataType.findTypeName(inputTupleSchema.getField(0).type)));
      }

      if (inputTupleSchema.getField(1).type != DataType.BAG)
      {
        throw new RuntimeException(String.format("Expected edges to be represented with a BAG"));
      }

      Schema.FieldSchema edgesFieldSchema = inputTupleSchema.getField(1);

      if (edgesFieldSchema.schema.getField(0).type != DataType.TUPLE)
      {
        throw new RuntimeException(String.format("Expected edges field to contain a TUPLE, but instead found %s",
                                                 DataType.findTypeName(edgesFieldSchema.schema.getField(0).type)));
      }
     
      Schema edgesTupleSchema = edgesFieldSchema.schema.getField(0).schema;
     
      if (edgesTupleSchema.getField(0).type != DataType.INTEGER)
      {
        throw new RuntimeException(String.format("Expected destination edge ID to an INTEGER, but instead found %s",
                                                 DataType.findTypeName(edgesFieldSchema.schema.getField(0).type)));
      }

      if (edgesTupleSchema.getField(1).type != DataType.DOUBLE)
      {
        throw new RuntimeException(String.format("Expected destination edge weight to a DOUBLE, but instead found %s",
                                                 DataType.findTypeName(edgesFieldSchema.schema.getField(1).type)));
      }

      Schema tupleSchema = new Schema();
      tupleSchema.add(new Schema.FieldSchema("node",DataType.INTEGER));
      tupleSchema.add(new Schema.FieldSchema("rank",DataType.FLOAT));

      return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass()
                                                                 .getName()
                                                                 .toLowerCase(), input),
                                               tupleSchema,
                                               DataType.BAG));
    }
    catch (FrontendException e)
    {
      throw new RuntimeException(e);
    }
  }
}
TOP

Related Classes of datafu.pig.linkanalysis.PageRank

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.