Package com.cloudera.cdk.data

Examples of com.cloudera.cdk.data.DatasetRepository.load()


    if (!initialized) {
      // initialize here rather than in activateOptions to avoid initialization
      // cycle in Configuration and log4j
      try {
        DatasetRepository repo = DatasetRepositories.open(datasetRepositoryUri);
        Dataset dataset = repo.load(datasetName);
        if (dataset.getDescriptor().isPartitioned()) {
          partitionStrategy = dataset.getDescriptor().getPartitionStrategy();
        }
        URL schemaUrl = dataset.getDescriptor().getSchemaUrl();
        if (schemaUrl != null) {
View Full Code Here
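
The snippet above is cut off at the schema-URL check. Below is a minimal sketch of the complete lazy-initialization pattern it implements; ensureInitialized, setAvroSchemaUrl, and setAvroSchema are hypothetical names, and only the repository, descriptor, and partition-strategy calls are taken from the snippet itself.

    private void ensureInitialized() {
      if (initialized) {
        return;
      }
      DatasetRepository repo = DatasetRepositories.open(datasetRepositoryUri);
      Dataset<GenericRecord> dataset = repo.load(datasetName);
      DatasetDescriptor descriptor = dataset.getDescriptor();
      if (descriptor.isPartitioned()) {
        partitionStrategy = descriptor.getPartitionStrategy();
      }
      // Prefer a schema URL when the dataset stores one; otherwise fall back
      // to the inline schema from the descriptor.
      URL schemaUrl = descriptor.getSchemaUrl();
      if (schemaUrl != null) {
        setAvroSchemaUrl(schemaUrl.toExternalForm()); // hypothetical setter
      } else {
        setAvroSchema(descriptor.getSchema().toString()); // hypothetical setter
      }
      initialized = true;
    }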


          "cdk.avroSchemaReflectClass must be specified");
    }

    DatasetRepository repo = getDatasetRepository();

    DatasetDescriptor descriptor = repo.load(datasetName).getDescriptor();
    DatasetDescriptor.Builder descriptorBuilder =
        new DatasetDescriptor.Builder(descriptor);
    configureSchema(descriptorBuilder, avroSchemaFile, avroSchemaReflectClass);

    repo.update(datasetName, descriptorBuilder.build());
View Full Code Here
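
Pieced together, the update flow is: load the current descriptor, copy it into a builder, swap in the new schema, and write it back with update(). A condensed sketch, assuming an HDFS repository and an Avro schema file on disk; the URI, dataset name, and file name are illustrative, and the surrounding method is assumed to declare throws IOException for the schema parse:

    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");
    DatasetDescriptor current = repo.load("users").getDescriptor();
    Schema newSchema = new Schema.Parser().parse(new File("user-v2.avsc"));
    DatasetDescriptor updated = new DatasetDescriptor.Builder(current)
        .schema(newSchema) // replace the schema, keep all other settings
        .build();
    repo.update("users", updated);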

    int exitCode = tool.run(input, datasetUri, datasetName);

    Assert.assertEquals(0, exitCode);

    DatasetRepository repo = DatasetRepositories.open(datasetUri);
    Dataset<GenericRecord> dataset = repo.load(datasetName);
    DatasetReader<GenericRecord> reader = dataset.newReader();
    try {
      reader.open();
      Assert.assertTrue(reader.hasNext());
      GenericRecord first = reader.next();
View Full Code Here
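
The test is truncated before the reader is released. A complete version of this read-and-verify pattern closes the reader in a finally block; the field name in the last assertion is illustrative:

    DatasetRepository repo = DatasetRepositories.open(datasetUri);
    Dataset<GenericRecord> dataset = repo.load(datasetName);
    DatasetReader<GenericRecord> reader = dataset.newReader();
    try {
      reader.open();
      Assert.assertTrue("expected at least one record", reader.hasNext());
      GenericRecord first = reader.next();
      Assert.assertNotNull(first.get("username")); // illustrative field name
    } finally {
      reader.close();
    }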

    // Construct a filesystem dataset repository rooted at /tmp/data
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");

    // Load the users dataset
    Dataset<GenericRecord> users = repo.load("users");

    // Get a reader for the dataset and read all the users
    DatasetReader<GenericRecord> reader = users.newReader();
    try {
      reader.open();
View Full Code Here
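
DatasetReader is iterable in this API, so the usual completion of the open/read pattern is a for-each loop with the close in a finally block. A self-contained sketch of the whole example:

    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");
    Dataset<GenericRecord> users = repo.load("users");
    DatasetReader<GenericRecord> reader = users.newReader();
    try {
      reader.open();
      // Iterate over every record in the dataset and print it
      for (GenericRecord user : reader) {
        System.out.println(user);
      }
    } finally {
      reader.close();
    }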

    // Construct a filesystem dataset repository rooted at /tmp/data
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");

    // Load the users dataset
    Dataset<GenericRecord> users = repo.load("users");

    // Get the partition strategy and use it to construct a partition key for
    // hash(username)=0
    PartitionStrategy partitionStrategy = users.getDescriptor().getPartitionStrategy();
    PartitionKey partitionKey = partitionStrategy.partitionKey(0);
View Full Code Here
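
The key built above addresses the single bucket where hash(username) is 0. A sketch of the usual next step: fetch that partition with getPartition (autoCreate set to false, since we only want to read) and scan just its records:

    // Restrict the read to the hash(username)=0 partition
    Dataset<GenericRecord> partition = users.getPartition(partitionKey, false);
    DatasetReader<GenericRecord> reader = partition.newReader();
    try {
      reader.open();
      for (GenericRecord user : reader) {
        System.out.println(user);
      }
    } finally {
      reader.close();
    }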

    // Construct an HCatalog dataset repository using managed Hive tables
    DatasetRepository repo = DatasetRepositories.open("repo:hive");

    // Load the users dataset
    Dataset<GenericRecord> users = repo.load("users");

    // Get a reader for the dataset and read all the users
    DatasetReader<GenericRecord> reader = users.newReader();
    try {
      reader.open();
View Full Code Here
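
The only change from the filesystem examples is the repository URI passed to DatasetRepositories.open. A few of the documented URI forms for this API, shown for comparison; paths are illustrative, and the exact forms may vary between CDK versions:

    DatasetRepository managed = DatasetRepositories.open("repo:hive");             // managed Hive tables
    DatasetRepository external = DatasetRepositories.open("repo:hive:/tmp/data");  // external Hive tables
    DatasetRepository hdfs = DatasetRepositories.open("repo:hdfs:/tmp/data");      // HDFS
    DatasetRepository local = DatasetRepositories.open("repo:file:/tmp/data");     // local filesystem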

    // Construct a filesystem dataset repository rooted at /tmp/data
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");

    // Load the products dataset
    Dataset<Product> products = repo.load("products");

    // Get a reader for the dataset and read all the products
    DatasetReader<Product> reader = products.newReader();
    try {
      reader.open();
View Full Code Here
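
Unlike the GenericRecord examples, this one loads the dataset with a domain type: load() is generic, so declaring Dataset<Product> makes the reader hand back Product instances. That works when the dataset's schema was derived from the class. A sketch of how such a dataset is typically created, assuming Product is a plain Java class (or an Avro specific class) visible to the schema builder:

    // Derive the dataset's schema from the Product class so that load()
    // can later return Product instances
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(Product.class)
        .build();
    repo.create("products", descriptor);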

        .by(new GetSessionKey(), Avros.strings())
        .groupByKey()
        .parallelDo(new MakeSession(), Avros.specifics(Session.class));

    // Write the sessions to the "sessions" Dataset
    getPipeline().write(sessions, CrunchDatasets.asTarget(hcatRepo.load("sessions")),
        Target.WriteMode.APPEND);

    return run().succeeded() ? 0 : 1;
  }
View Full Code Here
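
Here a Crunch PCollection is written into a dataset by wrapping it as a Target. The read side is symmetric. A sketch, assuming this class extends Crunch's CrunchTool (which the getPipeline() and run() calls suggest) and that this CDK version's CrunchDatasets also exposes an asSource method taking the entity class; both assumptions should be checked against the version in use:

    // Read the "sessions" Dataset back as a PCollection of Session records
    PCollection<Session> sessions = read(
        CrunchDatasets.asSource(hcatRepo.load("sessions"), Session.class));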

  @Override
  public void init() throws ServletException {
    // Find the schema from the repository
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");
    this.schema = repo.load("events").getDescriptor().getSchema();
  }

  @Override
  protected void doGet(HttpServletRequest request, HttpServletResponse
      response) throws ServletException, IOException {
View Full Code Here
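
A sketch of how the cached schema is then used inside a request handler: build a GenericRecord from request parameters with Avro's GenericRecordBuilder. The field names are illustrative, and handOff() stands in for whatever writer or appender the servlet forwards events to:

    // Build an event record against the schema loaded in init()
    GenericRecord event = new GenericRecordBuilder(schema)
        .set("user_id", Long.parseLong(request.getParameter("user_id")))
        .set("message", request.getParameter("message"))
        .build();
    handOff(event); // hypothetical: forward to a DatasetWriter or log appender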

    final Calendar now = Calendar.getInstance();
    final long yesterdayTimestamp = now.getTimeInMillis() - DAY_IN_MILLIS;

    // the destination dataset
    final Dataset<GenericRecord> persistent = repo.load("logs");
    final DatasetWriter<GenericRecord> writer = persistent.newWriter();
    writer.open();

    // the source dataset: yesterday's partition in the staging area
    final Dataset<GenericRecord> staging = repo.load("logs-staging");
View Full Code Here
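
A sketch of how such a staging-to-persistent copy typically finishes: build a key for yesterday's partition, stream that partition's records into the already-opened writer, then close both ends. The year/month/day key fields are an assumption, since the snippet does not show the staging dataset's partition strategy:

    Calendar cal = Calendar.getInstance();
    cal.setTimeInMillis(yesterdayTimestamp);
    // Assumes partitioning by year/month/day; adjust to the real strategy
    PartitionKey yesterday = staging.getDescriptor().getPartitionStrategy()
        .partitionKey(cal.get(Calendar.YEAR), cal.get(Calendar.MONTH) + 1,
            cal.get(Calendar.DAY_OF_MONTH));
    DatasetReader<GenericRecord> reader =
        staging.getPartition(yesterday, false).newReader();
    try {
      reader.open();
      for (GenericRecord record : reader) {
        writer.write(record);
      }
      writer.flush();
    } finally {
      reader.close();
      writer.close();
    }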
