Package com.cloudera.cdk.data

Examples of com.cloudera.cdk.data.DatasetRepository
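DatasetRepository is the entry point of the CDK data module: a repository is opened from a URI with DatasetRepositories.open, and the datasets in it are managed by name with create, load, and delete, as the examples below show. The URI scheme selects the storage backend:

  // Repository URIs used in the examples on this page
  DatasetRepository local = DatasetRepositories.open("repo:file:/tmp/data"); // local filesystem
  DatasetRepository hdfs = DatasetRepositories.open("repo:hdfs:/tmp/data");  // HDFS
  DatasetRepository hive = DatasetRepositories.open("repo:hive");            // managed Hive tables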


  @Override
  public int run(String[] args) throws Exception {

    // Construct a filesystem dataset repository rooted at /tmp/data
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");

    // Create a dataset of users with the Avro schema, and Parquet format in the
    // repository
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaUri("resource:user.avsc")
        .format(Formats.PARQUET)
        .build();
    Dataset<GenericRecord> users = repo.create("users", descriptor);

    // Get a writer for the dataset and write some users to it
    DatasetWriter<GenericRecord> writer = users.newWriter();
    try {
      writer.open();
      // ... write user records to the dataset (see the sketch below) ...
    } finally {
      writer.close();
    }

    return 0;
  }
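The excerpt ends before any records are written. A minimal sketch of a write loop, assuming user.avsc defines username and creationDate fields (those field names are assumptions, not taken from the original):

  // Hypothetical write loop; "username" and "creationDate" are assumed fields
  GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
  for (int i = 0; i < 100; i++) {
    GenericRecord record = builder
        .set("username", "user-" + i)
        .set("creationDate", System.currentTimeMillis())
        .build();
    writer.write(record);
  }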


  @Override
  public int run(String[] args) throws Exception {

    // Construct a filesystem dataset repository rooted at /tmp/data
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");

    // Delete the products dataset
    boolean success = repo.delete("products");

    return success ? 0 : 1;
  }

  @Override
  public int run(String[] args) throws Exception {

    // Construct a filesystem dataset repository rooted at /tmp/data
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");

    // Load the products dataset
    Dataset<Product> products = repo.load("products");

    // Get a reader for the dataset and read all the products
    DatasetReader<Product> reader = products.newReader();
    try {
      reader.open();
      for (Product product : reader) {
        System.out.println(product);
      }
    } finally {
      reader.close();
    }

    return 0;
  }

  @Override
  public int run(String[] args) throws Exception {

    // Construct a local filesystem dataset repository rooted at /tmp/hello-cdk
    DatasetRepository repo = DatasetRepositories.open("repo:file:/tmp/hello-cdk");

    // Create a dataset of Hellos
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(Hello.class).build();
    Dataset<Hello> hellos = repo.create("hellos", descriptor);

    // Write some Hellos in to the dataset
    DatasetWriter<Hello> writer = hellos.newWriter();
    try {
      writer.open();
     
      Hello cdk = new Hello("CDK");
      writer.write(cdk);
    } finally {
      writer.close();
    }
   
    // Read the Hellos from the dataset
    DatasetReader<Hello> reader = hellos.newReader();
    try {
      reader.open();
      for (Hello hello : reader) {
        hello.sayHello();
      }
    } finally {
      reader.close();
    }
   
    // Delete the dataset now that we are done with it
    repo.delete("hellos");

    return 0;
  }
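The Hello class itself is not part of the excerpt. Because the descriptor is built with .schema(Hello.class), CDK derives the Avro schema from the class by reflection. A plausible sketch, inferred from new Hello("CDK") and hello.sayHello() above (the field name is an assumption):

  // Sketch of the Hello bean; Avro reflection needs a no-arg constructor
  public class Hello {
    private String name; // assumed field name

    public Hello() {
      // required by Avro reflection
    }

    public Hello(String name) {
      this.name = name;
    }

    public void sayHello() {
      System.out.println("Hello " + name + "!");
    }
  }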

  @Override
  public int run(String[] args) throws Exception {

    // Construct a filesystem dataset repository rooted at /tmp/data
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");

    // Delete the users dataset
    boolean success = repo.delete("users");

    return success ? 0 : 1;
  }

  @Override
  public int run(String[] args) throws Exception {

    // Construct an HCatalog dataset repository using managed Hive tables
    DatasetRepository repo = DatasetRepositories.open("repo:hive");

    // Delete the users dataset
    boolean success = repo.delete("users");

    return success ? 0 : 1;
  }

  @Override
  public int run(String[] args) throws Exception {

    // Construct a filesystem dataset repository rooted at /tmp/data
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");

    // Create a dataset of products with the Avro schema in the repository
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(Product.class)
        .build();
    Dataset<Product> products = repo.create("products", descriptor);

    // Get a writer for the dataset and write some products to it
    DatasetWriter<Product> writer = products.newWriter();
    try {
      writer.open();
      // ... write Product instances to the dataset ...
    } finally {
      writer.close();
    }

    return 0;
  }
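As with Hello above, .schema(Product.class) derives the Avro schema from a plain Java bean by reflection. A hypothetical Product class for this example (the fields are assumptions):

  // Hypothetical Product bean; Avro reflection needs a no-arg constructor
  public class Product {
    private String name; // assumed field
    private long id;     // assumed field

    public Product() {
    }

    public Product(String name, long id) {
      this.name = name;
      this.id = id;
    }

    @Override
    public String toString() {
      return "Product{name=" + name + ", id=" + id + "}";
    }
  }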

  @Override
  public int run(String[] args) throws Exception {

    // Construct a filesystem dataset repository rooted at /tmp/data
    DatasetRepository fsRepo = DatasetRepositories.open("repo:hdfs:/tmp/data");

    // Construct an HCatalog dataset repository using external Hive tables
    DatasetRepository hcatRepo = DatasetRepositories.open("repo:hive:/tmp/data");

    // Turn debug on while in development.
    getPipeline().enableDebug();
    getPipeline().getConfiguration().set("crunch.log.job.progress", "true");

    // Load the events dataset and get the correct partition to sessionize
    Dataset<StandardEvent> eventsDataset = fsRepo.load("events");
    Dataset<StandardEvent> partition;
    if (args.length == 0 || (args.length == 1 && args[0].equals("LATEST"))) {
      partition = getLatestPartition(eventsDataset);
    } else {
      partition = getPartitionForURI(eventsDataset, args[0]);
    }

    // Create a parallel collection from the working partition
    PCollection<StandardEvent> events = read(
        CrunchDatasets.asSource(partition, StandardEvent.class));

    // Group events by user and cookie id, then create a session for each group
    PCollection<Session> sessions = events
        .by(new GetSessionKey(), Avros.strings())
        .groupByKey()
        .parallelDo(new MakeSession(), Avros.specifics(Session.class));

    // Write the sessions to the "sessions" Dataset
    getPipeline().write(sessions, CrunchDatasets.asTarget(hcatRepo.load("sessions")),
        Target.WriteMode.APPEND);

    return run().succeeded() ? 0 : 1;
  }
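GetSessionKey and MakeSession are not shown in the excerpt. From their use above, events.by(...) takes a Crunch MapFn that extracts the grouping key, and the parallelDo after groupByKey takes a DoFn over each key group. A hedged sketch, in which the StandardEvent accessors and the Session builder fields are assumptions:

  import org.apache.crunch.DoFn;
  import org.apache.crunch.Emitter;
  import org.apache.crunch.MapFn;
  import org.apache.crunch.Pair;

  // Assumed: the session key combines the user id and the cookie/session id
  static class GetSessionKey extends MapFn<StandardEvent, String> {
    @Override
    public String map(StandardEvent event) {
      return event.getUserId() + ":" + event.getSessionId(); // assumed accessors
    }
  }

  // Emits one Session per key group; the Session fields are assumptions
  static class MakeSession extends DoFn<Pair<String, Iterable<StandardEvent>>, Session> {
    @Override
    public void process(Pair<String, Iterable<StandardEvent>> group, Emitter<Session> emitter) {
      long start = Long.MAX_VALUE;
      long end = Long.MIN_VALUE;
      int count = 0;
      for (StandardEvent event : group.second()) {
        start = Math.min(start, event.getTimestamp()); // assumed accessor
        end = Math.max(end, event.getTimestamp());
        count++;
      }
      emitter.emit(Session.newBuilder()          // assumed Avro-generated builder
          .setSessionId(group.first())
          .setStartTimestamp(start)
          .setDuration(end - start)
          .setSessionEventCount(count)
          .build());
    }
  }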

  private Schema schema;

  @Override
  public void init() throws ServletException {
    // Find the schema from the repository
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");
    this.schema = repo.load("events").getDescriptor().getSchema();
  }
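With the schema loaded in init(), the servlet can decode incoming JSON against it. A minimal sketch of a doPost using standard Avro APIs (the servlet's real request handling is not shown in the excerpt):

  import java.io.IOException;
  import javax.servlet.http.HttpServletRequest;
  import javax.servlet.http.HttpServletResponse;
  import org.apache.avro.generic.GenericDatumReader;
  import org.apache.avro.generic.GenericRecord;
  import org.apache.avro.io.Decoder;
  import org.apache.avro.io.DecoderFactory;

  @Override
  protected void doPost(HttpServletRequest req, HttpServletResponse resp)
      throws IOException {
    // Decode the JSON request body against the events schema
    GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);
    Decoder decoder = DecoderFactory.get().jsonDecoder(schema, req.getInputStream());
    GenericRecord event = datumReader.read(null, decoder);
    // ... hand the record off to a writer ...
    resp.setStatus(HttpServletResponse.SC_OK);
  }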


  @Override
  public int run(String[] args) throws Exception {
    // open the repository
    final DatasetRepository repo = DatasetRepositories.open("repo:file:/tmp/data");

    final Calendar now = Calendar.getInstance();
    final long yesterdayTimestamp = now.getTimeInMillis() - DAY_IN_MILLIS;

    // the destination dataset
    final Dataset<GenericRecord> persistent = repo.load("logs");
    final DatasetWriter<GenericRecord> writer = persistent.newWriter();
    writer.open();

    // the source dataset: yesterday's partition in the staging area
    final Dataset<GenericRecord> staging = repo.load("logs-staging");
    final PartitionKey yesterday = getPartitionKey(staging, yesterdayTimestamp);
    final DatasetReader<GenericRecord> reader = staging
        .getPartition(yesterday, false).newReader();

    try {
      reader.open();
      // Copy yesterday's staged records into the persistent dataset
      for (GenericRecord record : reader) {
        writer.write(record);
      }
    } finally {
      reader.close();
      writer.close();
    }

    return 0;
  }
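getPartitionKey is not shown above. One hedged way to implement it is to build a stand-in record carrying the timestamp and ask the partition strategy for its key; the field names and the partitionKeyForEntity call are assumptions, not taken from the original:

  // Hypothetical helper: derive the daily partition key for a timestamp
  private PartitionKey getPartitionKey(Dataset<GenericRecord> dataset, long timestamp) {
    GenericRecord proxy = new GenericRecordBuilder(dataset.getDescriptor().getSchema())
        .set("timestamp", timestamp)   // assumed field
        .set("level", "INFO")          // assumed field
        .set("message", "")            // assumed field
        .build();
    return dataset.getDescriptor().getPartitionStrategy().partitionKeyForEntity(proxy);
  }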
