Package: com.cloudera.cdk.data

Usage examples of the com.cloudera.cdk.data.DatasetRepository class


    if (avroSchemaFile == null && avroSchemaReflectClass == null) {
      throw new IllegalArgumentException("One of cdk.avroSchemaFile or " +
          "cdk.avroSchemaReflectClass must be specified");
    }

    DatasetRepository repo = getDatasetRepository();

    DatasetDescriptor.Builder descriptorBuilder = new DatasetDescriptor.Builder();
    configureSchema(descriptorBuilder, avroSchemaFile, avroSchemaReflectClass);

    if (format.equals(Formats.AVRO.getName())) {
      descriptorBuilder.format(Formats.AVRO);
    } else if (format.equals(Formats.PARQUET.getName())) {
      descriptorBuilder.format(Formats.PARQUET);
    } else {
      throw new MojoExecutionException("Unrecognized format: " + format);
    }

    if (partitionExpression != null) {
      descriptorBuilder.partitionStrategy(Accessor.getDefault().fromExpression(partitionExpression));
    }

    repo.create(datasetName, descriptorBuilder.build());
  }
View Full Code Here


  private String datasetName;

  @Override
  public void execute() throws MojoExecutionException, MojoFailureException {
    logger.warn("CDK drop-dataset is deprecated -- please use delete-dataset");
    DatasetRepository repo = getDatasetRepository();
    repo.delete(datasetName);
  }
View Full Code Here

    }
    return conf;
  }

  DatasetRepository getDatasetRepository() {
    DatasetRepository repo;
    if (repositoryUri != null) {
      return DatasetRepositories.open(repositoryUri);
    }
    if (!hcatalog && rootDirectory == null) {
      throw new IllegalArgumentException("Root directory must be specified if not " +
View Full Code Here

    int exitCode = tool.run(input, datasetUri, datasetName);

    Assert.assertEquals(0, exitCode);

    DatasetRepository repo = DatasetRepositories.open(datasetUri);
    Dataset<GenericRecord> dataset = repo.load(datasetName);
    DatasetReader<GenericRecord> reader = dataset.newReader();
    try {
      reader.open();
      Assert.assertTrue(reader.hasNext());
      GenericRecord first = reader.next();
View Full Code Here

  @Override
  public int run(String[] args) throws Exception {

    // Construct an HCatalog dataset repository using managed Hive tables
    DatasetRepository repo = DatasetRepositories.open("repo:hive");

    // Create a dataset of users with the Avro schema in the repository
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaUri("resource:user.avsc")
        .build();
    Dataset<GenericRecord> users = repo.create("users", descriptor);

    // Get a writer for the dataset and write some users to it
    DatasetWriter<GenericRecord> writer = users.newWriter();
    try {
      writer.open();
View Full Code Here

  @Override
  public int run(String[] args) throws Exception {

    // Construct a filesystem dataset repository rooted at /tmp/data
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");

    // Load the users dataset
    Dataset<GenericRecord> users = repo.load("users");

    // Get a reader for the dataset and read all the users
    DatasetReader<GenericRecord> reader = users.newReader();
    try {
      reader.open();
View Full Code Here

  @Override
  public int run(String[] args) throws Exception {

    // Construct a filesystem dataset repository rooted at /tmp/data
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");

    // Create a dataset of users with the Avro schema in the repository
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaUri("resource:user.avsc")
        .build();
    Dataset<GenericRecord> users = repo.create("users", descriptor);

    // Get a writer for the dataset and write some users to it
    DatasetWriter<GenericRecord> writer = users.newWriter();
    try {
      writer.open();
View Full Code Here

  @Override
  public int run(String[] args) throws Exception {

    // Construct a filesystem dataset repository rooted at /tmp/data
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");

    // Load the users dataset
    Dataset<GenericRecord> users = repo.load("users");

    // Get the partition strategy and use it to construct a partition key for
    // hash(username)=0
    PartitionStrategy partitionStrategy = users.getDescriptor().getPartitionStrategy();
    PartitionKey partitionKey = partitionStrategy.partitionKey(0);
View Full Code Here

  @Override
  public int run(String[] args) throws Exception {

    // Construct a filesystem dataset repository rooted at /tmp/data
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");

    // Create a partition strategy that hash partitions on username with 10 buckets
    PartitionStrategy partitionStrategy =
        new PartitionStrategy.Builder().hash("username", 10).build();

    // Create a dataset of users with the Avro schema in the repository
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaUri("resource:user.avsc")
        .partitionStrategy(partitionStrategy)
        .build();
    Dataset<GenericRecord> users = repo.create("users", descriptor);

    // Get a writer for the dataset and write some users to it
    DatasetWriter<GenericRecord> writer = users.newWriter();
    try {
      writer.open();
View Full Code Here

  @Override
  public int run(String[] args) throws Exception {

    // Construct an HCatalog dataset repository using managed Hive tables
    DatasetRepository repo = DatasetRepositories.open("repo:hive");

    // Load the users dataset
    Dataset<GenericRecord> users = repo.load("users");

    // Get a reader for the dataset and read all the users
    DatasetReader<GenericRecord> reader = users.newReader();
    try {
      reader.open();
View Full Code Here

TOP

Related Classes of com.cloudera.cdk.data.DatasetRepository

Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.