Package org.apache.spark.api.java

Examples of org.apache.spark.api.java.JavaSparkContext

The excerpts below show how a JavaSparkContext is created and used across Spark's bundled Java examples (Spark SQL and MLlib) and test suites, plus one third-party use in the Oryx speed layer.


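This first excerpt is from the JavaSparkSQL example. It maps text lines onto a Person JavaBean that the excerpt omits; a minimal sketch of what that bean needs to look like (public getters and setters, so applySchema can infer the schema by reflection; the field names follow from the setters used below):

  public static class Person implements Serializable {
    private String name;
    private int age;

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }

    public int getAge() { return age; }
    public void setAge(int age) { this.age = age; }
  }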

  public static void main(String[] args) throws Exception {
    SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaSQLContext sqlCtx = new JavaSQLContext(ctx);

    System.out.println("=== Data source: RDD ===");
    // Load a text file and convert each line to a Java Bean.
    JavaRDD<Person> people = ctx.textFile("examples/src/main/resources/people.txt").map(
      new Function<String, Person>() {
        @Override
        public Person call(String line) {
          String[] parts = line.split(",");

          Person person = new Person();
          person.setName(parts[0]);
          person.setAge(Integer.parseInt(parts[1].trim()));

          return person;
        }
      });

    // Apply a schema to an RDD of Java Beans and register it as a table.
    JavaSchemaRDD schemaPeople = sqlCtx.applySchema(people, Person.class);
    schemaPeople.registerTempTable("people");

    // SQL can be run over RDDs that have been registered as tables.
    JavaSchemaRDD teenagers = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");

    // The results of SQL queries are JavaSchemaRDDs and support all the normal RDD operations.
    // The columns of a row in the result can be accessed by ordinal.
    List<String> teenagerNames = teenagers.map(new Function<Row, String>() {
      @Override
      public String call(Row row) {
        return "Name: " + row.getString(0);
      }
    }).collect();
    for (String name: teenagerNames) {
      System.out.println(name);
    }

    System.out.println("=== Data source: Parquet File ===");
    // JavaSchemaRDDs can be saved as parquet files, maintaining the schema information.
    schemaPeople.saveAsParquetFile("people.parquet");

    // Read in the parquet file created above.
    // Parquet files are self-describing so the schema is preserved.
    // The result of loading a parquet file is also a JavaSchemaRDD.
    JavaSchemaRDD parquetFile = sqlCtx.parquetFile("people.parquet");

    // Parquet files can also be registered as tables and then used in SQL statements.
    parquetFile.registerTempTable("parquetFile");
    JavaSchemaRDD teenagers2 =
      sqlCtx.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19");
    teenagerNames = teenagers2.map(new Function<Row, String>() {
      @Override
      public String call(Row row) {
          return "Name: " + row.getString(0);
      }
    }).collect();
    for (String name: teenagerNames) {
      System.out.println(name);
    }

    System.out.println("=== Data source: JSON Dataset ===");
    // A JSON dataset is indicated by its path, which can be either a single
    // text file or a directory of text files.
    String path = "examples/src/main/resources/people.json";
    // Create a JavaSchemaRDD from the file(s) at that path.
    JavaSchemaRDD peopleFromJsonFile = sqlCtx.jsonFile(path);

    // Because the schema of a JSON dataset is inferred automatically, it helps
    // to inspect the schema before writing queries.
    peopleFromJsonFile.printSchema();
    // The schema of people is ...
    // root
    //  |-- age: IntegerType
    //  |-- name: StringType

    // Register this JavaSchemaRDD as a table.
    peopleFromJsonFile.registerTempTable("people");

    // SQL statements can be run by using the sql methods provided by sqlCtx.
    JavaSchemaRDD teenagers3 = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");

    // The results of SQL queries are JavaSchemaRDDs and support all the normal RDD operations.
    // The columns of a row in the result can be accessed by ordinal.
    teenagerNames = teenagers3.map(new Function<Row, String>() {
      @Override
      public String call(Row row) { return "Name: " + row.getString(0); }
    }).collect();
    for (String name: teenagerNames) {
      System.out.println(name);
    }

    // Alternatively, a JavaSchemaRDD can be created for a JSON dataset represented by
    // an RDD[String] storing one JSON object per string.
    List<String> jsonData = Arrays.asList(
          "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
    JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
    JavaSchemaRDD peopleFromJsonRDD = sqlCtx.jsonRDD(anotherPeopleRDD);

    // Take a look at the schema of this new JavaSchemaRDD.
    peopleFromJsonRDD.printSchema();
    // The schema of anotherPeople is ...
    // root
    //  |-- address: StructType
    //  |    |-- city: StringType
    //  |    |-- state: StringType
    //  |-- name: StringType

    peopleFromJsonRDD.registerTempTable("people2");

    JavaSchemaRDD peopleWithCity = sqlCtx.sql("SELECT name, address.city FROM people2");
    List<String> nameAndCity = peopleWithCity.map(new Function<Row, String>() {
      @Override
      public String call(Row row) {
        return "Name: " + row.getString(0) + ", City: " + row.getString(1);
      }
    }).collect();
    for (String name: nameAndCity) {
      System.out.println(name);
    }

    ctx.stop();
  }
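Since Spark's Java Function types are single-method interfaces, Java 8 lambdas can stand in for the verbose anonymous classes above. For instance, the teenager-name extraction collapses to a one-liner (a sketch, assuming a Java 8 toolchain):

    List<String> teenagerNames = teenagers.map(row -> "Name: " + row.getString(0)).collect();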


public class JavaLogisticRegressionSuite implements Serializable {
  private transient JavaSparkContext sc;

  @Before
  public void setUp() {
    sc = new JavaSparkContext("local", "JavaLogisticRegressionSuite");
  }
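The test suites in this listing (this one, JavaNaiveBayesSuite, and JavaSVMSuite below) all follow the same pattern: create a local JavaSparkContext in @Before and release it after each test. The matching teardown is not shown in these excerpts; a minimal sketch of what it presumably looks like:

  @After
  public void tearDown() {
    sc.stop();
    sc = null;
  }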

    } else if (args.length > 1) {
      System.err.println("Usage: JavaDecisionTree <libsvm format data file>");
      System.exit(1);
    }
    SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTree");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD().cache();

    // Compute the number of classes from the data.
    Integer numClasses = data.map(new Function<LabeledPoint, Double>() {
      @Override public Double call(LabeledPoint p) {
        return p.label();
      }
    }).countByValue().size();

    // Set parameters.
    //  Empty categoricalFeaturesInfo indicates all features are continuous.
    HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
    String impurity = "gini";
    Integer maxDepth = 5;
    Integer maxBins = 32;

    // Train a DecisionTree model for classification.
    final DecisionTreeModel model = DecisionTree.trainClassifier(data, numClasses,
      categoricalFeaturesInfo, impurity, maxDepth, maxBins);

    // Evaluate model on training instances and compute training error
    JavaPairRDD<Double, Double> predictionAndLabel =
      data.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
        @Override public Tuple2<Double, Double> call(LabeledPoint p) {
          return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
        }
      });
    Double trainErr =
      1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
        @Override public Boolean call(Tuple2<Double, Double> pl) {
          return !pl._1().equals(pl._2());
        }
      }).count() / data.count();
    System.out.println("Training error: " + trainErr);
    System.out.println("Learned classification tree model:\n" + model);

    // Train a DecisionTree model for regression.
    impurity = "variance";
    final DecisionTreeModel regressionModel = DecisionTree.trainRegressor(data,
        categoricalFeaturesInfo, impurity, maxDepth, maxBins);

    // Evaluate model on training instances and compute training error
    JavaPairRDD<Double, Double> regressorPredictionAndLabel =
      data.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
        @Override public Tuple2<Double, Double> call(LabeledPoint p) {
          return new Tuple2<Double, Double>(regressionModel.predict(p.features()), p.label());
        }
      });
    Double trainMSE =
      regressorPredictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
        @Override public Double call(Tuple2<Double, Double> pl) {
          Double diff = pl._1() - pl._2();
          return diff * diff;
        }
      }).reduce(new Function2<Double, Double, Double>() {
        @Override public Double call(Double a, Double b) {
          return a + b;
        }
      }) / data.count();
    System.out.println("Training Mean Squared Error: " + trainMSE);
    System.out.println("Learned regression tree model:\n" + regressionModel);

    sc.stop();
  }
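Note that both evaluations above measure error on the training set itself. For a more honest estimate, the data can be split into training and held-out test sets first; a sketch using JavaRDD.randomSplit (the weights and seed here are arbitrary):

    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3}, 12345);
    JavaRDD<LabeledPoint> trainingData = splits[0];  // pass to trainClassifier/trainRegressor
    JavaRDD<LabeledPoint> testData = splits[1];      // use for the error computation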

    int blocks = -1;
    if (args.length == 5) {
      blocks = Integer.parseInt(args[4]);
    }

    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = sc.textFile(args[0]);

    JavaRDD<Rating> ratings = lines.map(new ParseRating());

    MatrixFactorizationModel model = ALS.train(ratings.rdd(), rank, iterations, 0.01, blocks);

    model.userFeatures().toJavaRDD().map(new FeaturesToString()).saveAsTextFile(
        outputDir + "/userFeatures");
    model.productFeatures().toJavaRDD().map(new FeaturesToString()).saveAsTextFile(
        outputDir + "/productFeatures");
    System.out.println("Final user/product features written to " + outputDir);

    sc.stop();
  }
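This ALS excerpt relies on two helper classes it does not show. A plausible minimal sketch of both, assuming comma-separated user,product,rating input lines (the exact input format is an assumption):

  static class ParseRating implements Function<String, Rating> {
    private static final Pattern COMMA = Pattern.compile(",");
    @Override
    public Rating call(String line) {
      String[] tok = COMMA.split(line);
      // user id, product id, rating
      return new Rating(Integer.parseInt(tok[0]), Integer.parseInt(tok[1]),
          Double.parseDouble(tok[2]));
    }
  }

  static class FeaturesToString implements Function<Tuple2<Object, double[]>, String> {
    @Override
    public String call(Tuple2<Object, double[]> element) {
      // id followed by its latent feature vector
      return element._1() + "," + Arrays.toString(element._2());
    }
  }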

    if (args.length != 3) {
      System.err.println("Usage: JavaLR <input_dir> <step_size> <niters>");
      System.exit(1);
    }
    SparkConf sparkConf = new SparkConf().setAppName("JavaLR");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = sc.textFile(args[0]);
    JavaRDD<LabeledPoint> points = lines.map(new ParsePoint()).cache();
    double stepSize = Double.parseDouble(args[1]);
    int iterations = Integer.parseInt(args[2]);

    // Another way to configure LogisticRegression
    //
    // LogisticRegressionWithSGD lr = new LogisticRegressionWithSGD();
    // lr.optimizer().setNumIterations(iterations)
    //               .setStepSize(stepSize)
    //               .setMiniBatchFraction(1.0);
    // lr.setIntercept(true);
    // LogisticRegressionModel model = lr.train(points.rdd());

    LogisticRegressionModel model = LogisticRegressionWithSGD.train(points.rdd(),
      iterations, stepSize);

    System.out.print("Final w: " + model.weights());

    sc.stop();
  }
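ParsePoint is not shown in this JavaLR excerpt; a minimal sketch, assuming each line holds a label, a comma, and space-separated feature values (e.g. "1.0,2.5 3.1 0.0") and that LabeledPoint and Vectors come from org.apache.spark.mllib:

  static class ParsePoint implements Function<String, LabeledPoint> {
    private static final Pattern COMMA = Pattern.compile(",");
    private static final Pattern SPACE = Pattern.compile(" ");
    @Override
    public LabeledPoint call(String line) {
      String[] parts = COMMA.split(line);
      double y = Double.parseDouble(parts[0]);
      String[] tok = SPACE.split(parts[1]);
      double[] x = new double[tok.length];
      for (int i = 0; i < tok.length; i++) {
        x[i] = Double.parseDouble(tok[i]);
      }
      return new LabeledPoint(y, Vectors.dense(x));
    }
  }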

    if (args.length >= 4) {
      runs = Integer.parseInt(args[3]);
    }
    SparkConf sparkConf = new SparkConf().setAppName("JavaKMeans");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = sc.textFile(inputFile);

    JavaRDD<Vector> points = lines.map(new ParsePoint());

    KMeansModel model = KMeans.train(points.rdd(), k, iterations, runs, KMeans.K_MEANS_PARALLEL());

    System.out.println("Cluster centers:");
    for (Vector center : model.clusterCenters()) {
      System.out.println(" " + center);
    }
    double cost = model.computeCost(points.rdd());
    System.out.println("Cost: " + cost);

    sc.stop();
  }
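Here ParsePoint must turn each line into an MLlib Vector rather than a LabeledPoint; a minimal sketch, assuming space-separated numeric coordinates:

  static class ParsePoint implements Function<String, Vector> {
    private static final Pattern SPACE = Pattern.compile(" ");
    @Override
    public Vector call(String line) {
      String[] tok = SPACE.split(line);
      double[] point = new double[tok.length];
      for (int i = 0; i < tok.length; i++) {
        point[i] = Double.parseDouble(tok[i]);
      }
      return Vectors.dense(point);
    }
  }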

    if (args.length > 2) {
      usage();
    }
    SparkConf sparkConf = new SparkConf().setAppName("JavaGradientBoostedTrees");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD().cache();

    // Set parameters.
    //  Note: All features are treated as continuous.
    BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams(algo);
    boostingStrategy.setNumIterations(10);
    boostingStrategy.weakLearnerParams().setMaxDepth(5);

    if (algo.equals("Classification")) {
      // Compute the number of classes from the data.
      Integer numClasses = data.map(new Function<LabeledPoint, Double>() {
        @Override public Double call(LabeledPoint p) {
          return p.label();
        }
      }).countByValue().size();
      boostingStrategy.setNumClassesForClassification(numClasses); // ignored for Regression

      // Train a GradientBoosting model for classification.
      final WeightedEnsembleModel model = GradientBoosting.trainClassifier(data, boostingStrategy);

      // Evaluate model on training instances and compute training error
      JavaPairRDD<Double, Double> predictionAndLabel =
          data.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
            @Override public Tuple2<Double, Double> call(LabeledPoint p) {
              return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
            }
          });
      Double trainErr =
          1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
            @Override public Boolean call(Tuple2<Double, Double> pl) {
              return !pl._1().equals(pl._2());
            }
          }).count() / data.count();
      System.out.println("Training error: " + trainErr);
      System.out.println("Learned classification tree model:\n" + model);
    } else if (algo.equals("Regression")) {
      // Train a GradientBoosting model for regression.
      final WeightedEnsembleModel model = GradientBoosting.trainRegressor(data, boostingStrategy);

      // Evaluate model on training instances and compute training error
      JavaPairRDD<Double, Double> predictionAndLabel =
          data.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
            @Override public Tuple2<Double, Double> call(LabeledPoint p) {
              return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
            }
          });
      Double trainMSE =
          predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
            @Override public Double call(Tuple2<Double, Double> pl) {
              Double diff = pl._1() - pl._2();
              return diff * diff;
            }
          }).reduce(new Function2<Double, Double, Double>() {
            @Override public Double call(Double a, Double b) {
              return a + b;
            }
          }) / data.count();
      System.out.println("Training Mean Squared Error: " + trainMSE);
      System.out.println("Learned regression tree model:\n" + model);
    } else {
      usage();
    }

    sc.stop();
  }
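Whichever branch runs, the trained WeightedEnsembleModel predicts for a single point the same way the mapToPair calls above do; for a new observation (the feature values here are purely hypothetical):

    double prediction = model.predict(Vectors.dense(0.5, 1.0, 0.0));  // hypothetical features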

public class JavaNaiveBayesSuite implements Serializable {
  private transient JavaSparkContext sc;

  @Before
  public void setUp() {
    sc = new JavaSparkContext("local", "JavaNaiveBayesSuite");
  }

public class JavaSVMSuite implements Serializable {
  private transient JavaSparkContext sc;

  @Before
  public void setUp() {
    sc = new JavaSparkContext("local", "JavaSVMSuite");
  }

    sparkConf.setIfMissing("spark.logConf", "true");
    sparkConf.setMaster(streamingMaster);
    sparkConf.setAppName("OryxSpeedLayer");
    final long batchDurationMS =
        TimeUnit.MILLISECONDS.convert(generationIntervalSec, TimeUnit.SECONDS);
    final JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);

    JavaStreamingContextFactory streamingContextFactory = new JavaStreamingContextFactory() {
      @Override
      public JavaStreamingContext create() {
        return new JavaStreamingContext(sparkContext, new Duration(batchDurationMS));
      }
    };

    if (checkpointDirString == null) {
      log.info("Not using a streaming checkpoint dir");
      streamingContext = streamingContextFactory.create();
    } else {
      log.info("Using streaming checkpoint dir {}", checkpointDirString);
      streamingContext = JavaStreamingContext.getOrCreate(
          checkpointDirString, sparkContext.hadoopConfiguration(), streamingContextFactory, false);
      streamingContext.checkpoint(checkpointDirString);
    }

    log.info("Creating message queue stream");
