Package org.apache.giraph.hive

Source Code of org.apache.giraph.hive.HiveGiraphRunner

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.giraph.hive;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.giraph.conf.GiraphClasses;
import org.apache.giraph.conf.GiraphConfiguration;
import org.apache.giraph.graph.Vertex;
import org.apache.giraph.hive.input.edge.HiveEdgeInputFormat;
import org.apache.giraph.hive.input.edge.HiveToEdge;
import org.apache.giraph.hive.input.vertex.HiveToVertex;
import org.apache.giraph.hive.input.vertex.HiveVertexInputFormat;
import org.apache.giraph.hive.output.HiveVertexOutputFormat;
import org.apache.giraph.hive.output.HiveVertexWriter;
import org.apache.giraph.hive.output.VertexToHive;
import org.apache.giraph.job.GiraphJob;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.apache.thrift.TException;

import com.facebook.giraph.hive.input.HiveApiInputFormat;
import com.facebook.giraph.hive.input.HiveInputDescription;
import com.facebook.giraph.hive.output.HiveApiOutputFormat;
import com.facebook.giraph.hive.output.HiveOutputDescription;
import com.facebook.giraph.hive.schema.HiveTableSchemas;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import java.io.File;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;

import static org.apache.giraph.hive.common.GiraphHiveConstants.HIVE_EDGE_SPLITS;
import static org.apache.giraph.hive.common.GiraphHiveConstants.HIVE_TO_EDGE_CLASS;
import static org.apache.giraph.hive.common.GiraphHiveConstants.HIVE_TO_VERTEX_CLASS;
import static org.apache.giraph.hive.common.GiraphHiveConstants.HIVE_VERTEX_SPLITS;
import static org.apache.giraph.hive.common.HiveProfiles.EDGE_INPUT_PROFILE_ID;
import static org.apache.giraph.hive.common.HiveProfiles.VERTEX_INPUT_PROFILE_ID;
import static org.apache.giraph.hive.common.HiveProfiles.VERTEX_OUTPUT_PROFILE_ID;

/**
* Hive Giraph Runner
*/
public class HiveGiraphRunner implements Tool {
  /** Logger for this class. */
  private static final Logger LOG = Logger.getLogger(HiveGiraphRunner.class);
  /** Prefix (a tab) prepended to each option line written when logging. */
  private static final String LOG_PREFIX = "\t";

  /** Number of workers, parsed from the -w/-workers option. */
  protected int workers;
  /** Whether -v/-verbose was given; passed to the job when it is run. */
  protected boolean isVerbose;

  /** Giraph vertex class, set by the setter or the -vertexClass option. */
  private Class<? extends Vertex> vertexClass;

  /** Vertex creator from hive records; null means no vertex input. */
  private Class<? extends HiveToVertex> hiveToVertexClass;
  /** Hive vertex input information (database, table, partition filter). */
  private  final HiveInputDescription hiveVertexInputDescription;

  /** Edge creator from hive records; null means no edge input. */
  private Class<? extends HiveToEdge> hiveToEdgeClass;
  /** Hive edge input information (database, table, partition filter). */
  private final HiveInputDescription hiveEdgeInputDescription;

  /** Class that writes vertices to Hive; null means none configured. */
  private Class<? extends VertexToHive> vertexToHiveClass;
  /** Hive output information (database, table, partition values). */
  private final HiveOutputDescription hiveOutputDescription;
  /** Skip output? (Useful for testing without writing) */
  private boolean skipOutput = false;

  /** Configuration; starts as a HiveConf, replaceable via setConf. */
  private Configuration conf;

  /** Create a new runner */
  public HiveGiraphRunner() {
    conf = new HiveConf(getClass());
    hiveVertexInputDescription = new HiveInputDescription();
    hiveEdgeInputDescription = new HiveInputDescription();
    hiveOutputDescription = new HiveOutputDescription();
  }

  public Class<? extends Vertex> getVertexClass() {
    return vertexClass;
  }

  /**
   * Set the Giraph vertex class. Only the field is updated here; the class
   * is applied to the job configuration when the job is set up in run().
   *
   * @param vertexClass Vertex class to run
   */
  public void setVertexClass(Class<? extends Vertex> vertexClass) {
    this.vertexClass = vertexClass;
  }

  public HiveInputDescription getHiveVertexInputDescription() {
    return hiveVertexInputDescription;
  }

  public HiveOutputDescription getHiveOutputDescription() {
    return hiveOutputDescription;
  }

  public HiveInputDescription getHiveEdgeInputDescription() {
    return hiveEdgeInputDescription;
  }

  public Class<? extends HiveToVertex> getHiveToVertexClass() {
    return hiveToVertexClass;
  }

  /**
   * Set HiveToVertex used with HiveVertexInputFormat
   *
   * @param hiveToVertexClass HiveToVertex
   */
  public void setHiveToVertexClass(
      Class<? extends HiveToVertex> hiveToVertexClass) {
    this.hiveToVertexClass = hiveToVertexClass;
    HIVE_TO_VERTEX_CLASS.set(conf, hiveToVertexClass);
  }

  /**
   * Whether to use vertex input.
   *
   * @return true if vertex input enabled (HiveToVertex is set).
   */
  public boolean hasVertexValueInput() {
    return hiveToVertexClass != null;
  }

  public Class<? extends HiveToEdge> getHiveToEdgeClass() {
    return hiveToEdgeClass;
  }

  /**
   * Whether to use edge input.
   *
   * @return true if edge input enabled (HiveToEdge is set).
   */
  public boolean hasEdgeInput() {
    return hiveToEdgeClass != null;
  }

  /**
   * Set HiveToEdge used with HiveEdgeInputFormat
   *
   * @param hiveToEdgeClass HiveToEdge
   */
  public void setHiveToEdgeClass(Class<? extends HiveToEdge> hiveToEdgeClass) {
    this.hiveToEdgeClass = hiveToEdgeClass;
    HIVE_TO_EDGE_CLASS.set(conf, hiveToEdgeClass);
  }

  public Class<? extends VertexToHive> getVertexToHiveClass() {
    return vertexToHiveClass;
  }

  /**
   * Whether we are writing vertices out.
   *
   * @return true if vertex output enabled
   */
  public boolean hasVertexOutput() {
    return !skipOutput && vertexToHiveClass != null;
  }

  /**
   * Set class used to write vertices to Hive.
   *
   * @param vertexToHiveClass class for writing vertices to Hive.
   */
  public void setVertexToHiveClass(
      Class<? extends VertexToHive> vertexToHiveClass) {
    this.vertexToHiveClass = vertexToHiveClass;
    conf.setClass(HiveVertexWriter.VERTEX_TO_HIVE_KEY, vertexToHiveClass,
        VertexToHive.class);
  }

  /**
   * main method
   * @param args system arguments
   * @throws Exception any errors from Hive Giraph Runner
   */
  public static void main(String[] args) throws Exception {
    HiveGiraphRunner runner = new HiveGiraphRunner();
    System.exit(ToolRunner.run(runner, args));
  }

  @Override
  public final int run(String[] args) throws Exception {
    // process args
    try {
      handleCommandLine(args);
    } catch (InterruptedException e) {
      return 0;
    } catch (IllegalArgumentException e) {
      System.err.println(e.getMessage());
      return -1;
    }

    // additional configuration for Hive
    adjustConfigurationForHive();

    // setup GiraphJob
    GiraphJob job = new GiraphJob(getConf(), getClass().getName());
    GiraphConfiguration giraphConf = job.getConfiguration();
    giraphConf.setVertexClass(vertexClass);

    setupHiveInputs(giraphConf);
    setupHiveOutput(giraphConf);

    giraphConf.setWorkerConfiguration(workers, workers, 100.0f);
    initGiraphJob(job);

    logOptions(giraphConf);

    return job.run(isVerbose) ? 0 : -1;
  }

  /**
   * Initialize hive input settings
   *
   * @param conf Configuration to write to
   * @throws TException thrift problem
   */
  private void setupHiveInputs(GiraphConfiguration conf) throws TException {
    if (hiveToVertexClass != null) {
      hiveVertexInputDescription.setNumSplits(HIVE_VERTEX_SPLITS.get(conf));
      HiveApiInputFormat.setProfileInputDesc(conf, hiveVertexInputDescription,
          VERTEX_INPUT_PROFILE_ID);
      conf.setVertexInputFormatClass(HiveVertexInputFormat.class);
      HiveTableSchemas.put(conf, VERTEX_INPUT_PROFILE_ID,
          hiveVertexInputDescription.hiveTableName());
    }

    if (hiveToEdgeClass != null) {
      hiveEdgeInputDescription.setNumSplits(HIVE_EDGE_SPLITS.get(conf));
      HiveApiInputFormat.setProfileInputDesc(conf, hiveEdgeInputDescription,
          EDGE_INPUT_PROFILE_ID);
      conf.setEdgeInputFormatClass(HiveEdgeInputFormat.class);
      HiveTableSchemas.put(conf, EDGE_INPUT_PROFILE_ID,
          hiveEdgeInputDescription.hiveTableName());
    }
  }

  /**
   * Initialize hive output settings
   *
   * @param conf Configuration to write to
   * @throws TException thrift problem
   */
  private void setupHiveOutput(GiraphConfiguration conf) throws TException {
    if (skipOutput) {
      LOG.warn("run: Warning - Output will be skipped!");
    } else if (vertexToHiveClass != null) {
      HiveApiOutputFormat.initProfile(conf, hiveOutputDescription,
          VERTEX_OUTPUT_PROFILE_ID);
      conf.setVertexOutputFormatClass(HiveVertexOutputFormat.class);
      HiveTableSchemas.put(conf, VERTEX_OUTPUT_PROFILE_ID,
          hiveOutputDescription.hiveTableName());
    } else {
      LOG.fatal("output requested but " + VertexToHive.class.getSimpleName() +
          " not set");
    }
  }

  /**
  * set hive configuration
  */
  private void adjustConfigurationForHive() {
    // when output partitions are used, workers register them to the
    // metastore at cleanup stage, and on HiveConf's initialization, it
    // looks for hive-site.xml.
    addToStringCollection(conf, "tmpfiles", conf.getClassLoader()
        .getResource("hive-site.xml").toString());

    // Or, more effectively, we can provide all the jars client needed to
    // the workers as well
    String[] hadoopJars = System.getenv("HADOOP_CLASSPATH").split(
        File.pathSeparator);
    List<String> hadoopJarURLs = Lists.newArrayList();
    for (String jarPath : hadoopJars) {
      File file = new File(jarPath);
      if (file.exists() && file.isFile()) {
        String jarURL = file.toURI().toString();
        hadoopJarURLs.add(jarURL);
      }
    }
    addToStringCollection(conf, "tmpjars", hadoopJarURLs);
  }

  /**
  * process arguments
  * @param args to process
  * @return CommandLine instance
  * @throws org.apache.commons.cli.ParseException error parsing arguments
  * @throws InterruptedException interrupted
  */
  private CommandLine handleCommandLine(String[] args) throws ParseException,
            InterruptedException {
    Options options = new Options();
    addOptions(options);
    addMoreOptions(options);

    CommandLineParser parser = new GnuParser();
    final CommandLine cmdln = parser.parse(options, args);
    if (args.length == 0 || cmdln.hasOption("help")) {
      new HelpFormatter().printHelp(getClass().getName(), options, true);
      throw new InterruptedException();
    }

    // Giraph classes
    String vertexClassStr = cmdln.getOptionValue("vertexClass");
    if (vertexClassStr != null) {
      vertexClass = findClass(vertexClassStr, Vertex.class);
    }
    if (vertexClass == null) {
      throw new IllegalArgumentException(
          "Need the Giraph " + Vertex.class.getSimpleName() +
              " class name (-vertexClass) to use");
    }

    String hiveToVertexClassStr = cmdln.getOptionValue("hiveToVertexClass");
    if (hiveToVertexClassStr != null) {
      if (hiveToVertexClassStr.equals("disable")) {
        hiveToVertexClass = null;
      } else {
        setHiveToVertexClass(
            findClass(hiveToVertexClassStr, HiveToVertex.class));
      }
    }

    String hiveToEdgeClassStr = cmdln.getOptionValue("hiveToEdgeClass");
    if (hiveToEdgeClassStr != null) {
      if (hiveToEdgeClassStr.equals("disable")) {
        hiveToEdgeClass = null;
      } else {
        setHiveToEdgeClass(
            findClass(hiveToEdgeClassStr, HiveToEdge.class));
      }
    }

    String vertexToHiveClassStr = cmdln.getOptionValue("vertexToHiveClass");
    if (vertexToHiveClassStr != null) {
      setVertexToHiveClass(findClass(vertexToHiveClassStr, VertexToHive.class));
    }

    if (cmdln.hasOption("skipOutput")) {
      skipOutput = true;
    }

    if (hiveToVertexClass == null && hiveToEdgeClass == null) {
      throw new IllegalArgumentException(
          "Need at least one of Giraph " +
          HiveToVertex.class.getSimpleName() +
          " class name (-hiveToVertexClass) and " +
          HiveToEdge.class.getSimpleName() +
          " class name (-hiveToEdgeClass)");
    }
    if (vertexToHiveClass == null && !skipOutput) {
      throw new IllegalArgumentException(
          "Need the Giraph " + VertexToHive.class.getSimpleName() +
          " class name (-vertexToHiveClass) to use");
    }
    String workersStr = cmdln.getOptionValue("workers");
    if (workersStr == null) {
      throw new IllegalArgumentException(
          "Need to choose the number of workers (-w)");
    }

    String vertexInputTableStr = cmdln.getOptionValue("vertexInputTable");
    if (vertexInputTableStr == null && hiveToVertexClass != null) {
      throw new IllegalArgumentException(
          "Need to set the vertex input table name (-vi)");
    }

    String edgeInputTableStr = cmdln.getOptionValue("edgeInputTable");
    if (edgeInputTableStr == null && hiveToEdgeClass != null) {
      throw new IllegalArgumentException(
          "Need to set the edge input table name (-ei)");
    }

    String outputTableStr = cmdln.getOptionValue("outputTable");
    if (outputTableStr == null) {
      throw new IllegalArgumentException(
          "Need to set the output table name (-o)");
    }

    String dbName = cmdln.getOptionValue("dbName", "default");
    hiveVertexInputDescription.setDbName(dbName);
    hiveEdgeInputDescription.setDbName(dbName);
    hiveOutputDescription.setDbName(dbName);

    hiveEdgeInputDescription.setPartitionFilter(
        cmdln.getOptionValue("edgeInputFilter"));
    hiveEdgeInputDescription.setTableName(edgeInputTableStr);

    hiveVertexInputDescription.setPartitionFilter(
        cmdln.getOptionValue("vertexInputFilter"));
    hiveVertexInputDescription.setTableName(vertexInputTableStr);

    hiveOutputDescription.setTableName(cmdln.getOptionValue("outputTable"));
    hiveOutputDescription.setPartitionValues(
        parsePartitionValues(cmdln.getOptionValue("outputPartition"))
    );

    workers = Integer.parseInt(workersStr);

    isVerbose = cmdln.hasOption("verbose");

    // pick up -hiveconf arguments
    processHiveConfOptions(cmdln);

    processMoreArguments(cmdln);

    return cmdln;
  }

  /**
   * Process -hiveconf options from command line
   *
   * @param cmdln Command line options
   */
  private void processHiveConfOptions(CommandLine cmdln) {
    for (String hiveconf : cmdln.getOptionValues("hiveconf")) {
      String[] keyval = hiveconf.split("=", 2);
      if (keyval.length == 2) {
        String name = keyval[0];
        String value = keyval[1];
        if (name.equals("tmpjars") || name.equals("tmpfiles")) {
          addToStringCollection(conf, name, value);
        } else {
          conf.set(name, value);
        }
      }
    }
  }

  /**
   * @param outputTablePartitionString table partition string
   * @return Map
   */
  public static Map<String, String> parsePartitionValues(
      String outputTablePartitionString) {
    if (outputTablePartitionString == null) {
      return null;
    }
    Splitter commaSplitter = Splitter.on(',').omitEmptyStrings().trimResults();
    Splitter equalSplitter = Splitter.on('=').omitEmptyStrings().trimResults();
    Map<String, String> partitionValues = Maps.newHashMap();
    for (String keyValStr : commaSplitter.split(outputTablePartitionString)) {
      List<String> keyVal = Lists.newArrayList(equalSplitter.split(keyValStr));
      if (keyVal.size() != 2) {
        throw new IllegalArgumentException(
            "Unrecognized partition value format: " +
            outputTablePartitionString);
      }
      partitionValues.put(keyVal.get(0), keyVal.get(1));
    }
    return partitionValues;
  }

  /**
   * Add hive-related options to command line parser options
   *
   * @param options Options to use
   */
  private void addOptions(Options options) {
    options.addOption("h", "help", false, "Help");
    options.addOption("v", "verbose", false, "Verbose");
    options.addOption("D", "hiveconf", true,
                "property=value for Hive/Hadoop configuration");
    options.addOption("w", "workers", true, "Number of workers");

    if (vertexClass == null) {
      options.addOption(null, "vertexClass", true,
          "Giraph Vertex class to use");
    }

    options.addOption("db", "dbName", true, "Hive database name");

    // Vertex input settings
    options.addOption(null, "hiveToVertexClass", true,
        "Giraph " + HiveToVertex.class.getSimpleName() +
            " class to use (default - " +
            (hiveToVertexClass == null ? "not used" :
                hiveToVertexClass.getSimpleName()) + "), " +
            "\"disable\" will unset this option");
    options.addOption("vi", "vertexInputTable", true,
        "Vertex input table name");
    options.addOption("VI", "vertexInputFilter", true,
        "Vertex input table filter expression (e.g., \"a<2 AND b='two'\"");

    // Edge input settings
    options.addOption(null, "hiveToEdgeClass", true,
        "Giraph " + HiveToEdge.class.getSimpleName() +
            " class to use (default - " +
            (hiveToEdgeClass == null ? "not used" :
                hiveToEdgeClass.getSimpleName()) + "), " +
            "\"disable\" will unset this option");
    options.addOption("ei", "edgeInputTable", true,
        "Edge input table name");
    options.addOption("EI", "edgeInputFilter", true,
        "Edge input table filter expression (e.g., \"a<2 AND b='two'\"");

    // Vertex output settings
    if (vertexToHiveClass == null) {
      options.addOption(null, "vertexToHiveClass", true,
          "Giraph " + VertexToHive.class.getSimpleName() + " class to use");
    }

    options.addOption("o", "outputTable", true, "Output table name");
    options.addOption("O", "outputPartition", true,
        "Output table partition values (e.g., \"a=1,b=two\")");
    options.addOption("s", "skipOutput", false, "Skip output?");
  }

  /**
  * add string to collection
  * @param conf Configuration
  * @param name name to add
  * @param values values for collection
  */
  private static void addToStringCollection(Configuration conf, String name,
                                              String... values) {
    addToStringCollection(conf, name, Arrays.asList(values));
  }

  /**
  * add string to collection
  * @param conf Configuration
  * @param name to add
  * @param values values for collection
  */
  private static void addToStringCollection(
          Configuration conf, String name, Collection
          <? extends String> values) {
    Collection<String> tmpfiles = conf.getStringCollection(name);
    tmpfiles.addAll(values);
    conf.setStrings(name, tmpfiles.toArray(new String[tmpfiles.size()]));
  }

  /**
  *
  * @param className to find
  * @param base  base class
  * @param <T> class type found
  * @return type found
  */
  private <T> Class<? extends T> findClass(String className, Class<T> base) {
    try {
      Class<?> cls = Class.forName(className);
      if (base.isAssignableFrom(cls)) {
        return cls.asSubclass(base);
      }
      return null;
    } catch (ClassNotFoundException e) {
      throw new IllegalArgumentException(className + ": Invalid class name");
    }
  }

  @Override
  public final Configuration getConf() {
    return conf;
  }

  /**
   * Replace the configuration used by this runner. Values that earlier
   * setter calls wrote into the previous configuration object are not
   * copied over by this method.
   *
   * @param conf Configuration to use from now on
   */
  @Override
  public final void setConf(Configuration conf) {
    this.conf = conf;
  }

  /**
   * Override this method to add more command-line options. You can process
   * them by also overriding {@link #processMoreArguments(CommandLine)}.
   * Called while the command line is being handled, right after the
   * built-in options have been registered. The default does nothing.
   *
   * @param options Options
   */
  protected void addMoreOptions(Options options) {
  }

  /**
   * Override this method to process additional command-line arguments. You
   * may want to declare additional options by also overriding
   * {@link #addMoreOptions(org.apache.commons.cli.Options)}.
   * Called as the last step of command-line handling, after the built-in
   * options and -hiveconf values have been applied. The default does
   * nothing.
   *
   * @param cmd Command
   */
  protected void processMoreArguments(CommandLine cmd) {
  }

  /**
   * Override this method to do additional setup with the GiraphJob that will
   * run. Called from run() after the vertex class, Hive inputs/outputs and
   * worker configuration have been set, and before the options are logged
   * and the job is started. The default does nothing.
   *
   * @param job GiraphJob that is going to run
   */
  protected void initGiraphJob(GiraphJob job) { }

  /**
   * Log the options set by user
   *
   * @param giraphConf GiraphConfiguration
   */
  private void logOptions(GiraphConfiguration giraphConf) {
    GiraphClasses classes = new GiraphClasses(giraphConf);

    LOG.info(getClass().getSimpleName() + " with");

    LOG.info(LOG_PREFIX + "-vertexClass=" + vertexClass.getCanonicalName());

    if (hiveToVertexClass != null) {
      LOG.info(LOG_PREFIX + "-hiveToVertexClass=" +
          hiveToVertexClass.getCanonicalName());
    }
    if (classes.getVertexInputFormatClass() != null) {
      LOG.info(LOG_PREFIX + "-vertexInputFormatClass=" +
          classes.getVertexInputFormatClass().getCanonicalName());
      logInputDesc(hiveVertexInputDescription, "vertex");
    }

    if (hiveToEdgeClass != null) {
      LOG.info(LOG_PREFIX + "-hiveToEdgeClass=" +
          hiveToEdgeClass.getCanonicalName());
    }
    if (classes.getEdgeInputFormatClass() != null) {
      LOG.info(LOG_PREFIX + "-edgeInputFormatClass=" +
        classes.getEdgeInputFormatClass().getCanonicalName());
      logInputDesc(hiveEdgeInputDescription, "edge");
    }

    LOG.info(LOG_PREFIX + "-outputTable=" +
        hiveOutputDescription.getTableName());
    if (hiveOutputDescription.hasPartitionValues()) {
      LOG.info(LOG_PREFIX + "-outputPartition=\"" +
          hiveOutputDescription.getPartitionValues() + "\"");
    }
    if (classes.getVertexOutputFormatClass() != null) {
      LOG.info(LOG_PREFIX + "-outputFormatClass=" +
          classes.getVertexOutputFormatClass().getCanonicalName());
    }

    LOG.info(LOG_PREFIX + "-workers=" + workers);
  }

  /**
   * Helper to log input description with a name
   *
   * @param inputDesc input description to log
   * @param name String prefix name
   */
  private void logInputDesc(HiveInputDescription inputDesc, String name) {
    if (inputDesc.hasTableName()) {
      LOG.info(
          LOG_PREFIX + "-" + name + "InputTable=" + inputDesc.getTableName());
    }
    if (inputDesc.hasPartitionFilter()) {
      LOG.info(LOG_PREFIX + "-" + name + "InputFilter=\"" +
          inputDesc.getPartitionFilter() + "\"");
    }
  }
}
TOP

Related Classes of org.apache.giraph.hive.HiveGiraphRunner

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.