Examples of com.cloudera.cdk.data.DatasetDescriptor

com.cloudera.cdk.data.DatasetDescriptor

The structural definition of a {@link Dataset}.

Each {@code Dataset} has an associated {@link Schema} and optional {@link PartitionStrategy} defined at the time of creation. Instances of this class are used to hold this information. Users are strongly encouraged to use the inner {@link Builder} to create new instances.

    Assert.assertFalse(reader.hasNext());
  }


  @Test
  public void testTSV() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .property("cdk.csv.delimiter", "\t")
        .property("cdk.csv.lines-to-skip", "1")
        .schema(STRINGS)
        .build();
    final CSVFileReader<GenericData.Record> reader =

View Full Code Here

    Assert.assertFalse(reader.hasNext());
  }


  @Test
  public void testNormalSchema() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .schema(SCHEMA)
        .build();
    final CSVFileReader<GenericData.Record> reader =
        new CSVFileReader<GenericData.Record>(localfs, csvFile, desc);

View Full Code Here

    Assert.assertFalse(reader.hasNext());
  }


  @Test
  public void testReflectedRecords() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .schema(BEAN_SCHEMA)
        .build();
    final CSVFileReader<TestBean> reader =
        new CSVFileReader<TestBean>(localfs, csvFile, desc);

View Full Code Here

        "Descriptor can not be null");
    Preconditions.checkArgument(descriptor.getLocation() == null,
        "Descriptor location cannot be set; " +
        "it is assigned by the MetadataProvider");


    final DatasetDescriptor newDescriptor = metadataProvider
        .create(name, descriptor);


    final URI location = newDescriptor.getLocation();
    if (location == null) {
      throw new DatasetRepositoryException(
          "[BUG] MetadataProvider did not assign a location to dataset:" +
          name);
    }


    ensureExists(newDescriptor, conf);


    logger.debug("Created dataset:{} schema:{} datasetPath:{}", new Object[] {
        name, newDescriptor.getSchema(), location.toString() });


    return new FileSystemDataset.Builder()
        .name(name)
        .configuration(conf)
        .descriptor(newDescriptor)
        .partitionKey(newDescriptor.isPartitioned() ?
            com.cloudera.cdk.data.impl.Accessor.getDefault().newPartitionKey() :
            null)
        .build();
  }

View Full Code Here

  public <E> Dataset<E> update(String name, DatasetDescriptor descriptor) {
    Preconditions.checkArgument(name != null, "Dataset name cannot be null");
    Preconditions.checkArgument(descriptor != null,
        "DatasetDescriptro cannot be null");


    DatasetDescriptor oldDescriptor = metadataProvider.load(name);


    // oldDescriptor is valid if load didn't throw NoSuchDatasetException


    if (!oldDescriptor.getFormat().equals(descriptor.getFormat())) {
      throw new DatasetRepositoryException("Cannot change dataset format from " +
          oldDescriptor.getFormat() + " to " + descriptor.getFormat());
    }


    final URI oldLocation = oldDescriptor.getLocation();
    if ((oldLocation != null) && !(oldLocation.equals(descriptor.getLocation()))) {
      throw new DatasetRepositoryException(
          "Cannot change the dataset's location");
    }


    if (oldDescriptor.isPartitioned() != descriptor.isPartitioned()) {
      throw new DatasetRepositoryException("Cannot change an unpartitioned dataset to " +
          " partitioned or vice versa.");
    } else if (oldDescriptor.isPartitioned() && descriptor.isPartitioned() &&
        !oldDescriptor.getPartitionStrategy().equals(descriptor.getPartitionStrategy())) {
      throw new DatasetRepositoryException("Cannot change partition strategy from " +
          oldDescriptor.getPartitionStrategy() + " to " + descriptor.getPartitionStrategy());
    }


    // check can read records written with old schema using new schema
    final Schema oldSchema = oldDescriptor.getSchema();
    final Schema newSchema = descriptor.getSchema();
    if (!SchemaValidationUtil.canRead(oldSchema, newSchema)) {
      throw new IncompatibleSchemaException("New schema cannot read data " +
          "written using " +
          "old schema. New schema: " + newSchema.toString(true) + "\nOld schema: " +
          oldSchema.toString(true));
    }


    final DatasetDescriptor updatedDescriptor = metadataProvider
        .update(name, descriptor);


    logger.debug("Updated dataset:{} schema:{} datasetPath:{}", new Object[] {
        name, updatedDescriptor.getSchema(),
        updatedDescriptor.getLocation().toString() });


    return new FileSystemDataset.Builder()
        .name(name)
        .configuration(conf)
        .descriptor(updatedDescriptor)
        .partitionKey(updatedDescriptor.isPartitioned() ?
            com.cloudera.cdk.data.impl.Accessor.getDefault().newPartitionKey() :
            null)
        .build();
  }

View Full Code Here

    HBaseDatasetRepository repo = new HBaseDatasetRepository.Builder()
        .configuration(conf).build();


    // TODO: change to use namespace (CDK-140)


    DatasetDescriptor userProfileDatasetDescriptor =
        new DatasetDescriptor.Builder().schema(UserProfileModel2.SCHEMA$).build();
    userProfileDataset = repo.create("cdk_example_user_profiles.UserProfileModel2",
        userProfileDatasetDescriptor);


    DatasetDescriptor userActionsDatasetDescriptor =
        new DatasetDescriptor.Builder().schema(UserActionsModel2.SCHEMA$).build();
    userActionsDataset = repo.create("cdk_example_user_profiles.UserActionsModel2",
        userActionsDatasetDescriptor);


    DatasetDescriptor userProfileActionsDatasetDescriptor =
        new DatasetDescriptor.Builder().schema(UserProfileActionsModel2.SCHEMA$).build();
    userProfileActionsDataset = repo.create("cdk_example_user_profiles.UserProfileActionsProtocol2",
        userProfileActionsDatasetDescriptor);


  }

View Full Code Here

  public <E> Dataset<E> load(String name) {
    Preconditions.checkArgument(name != null, "Name can not be null");


    logger.debug("Loading dataset:{}", name);


    DatasetDescriptor descriptor = metadataProvider.load(name);


    FileSystemDataset<E> ds = new FileSystemDataset.Builder()
        .name(name)
        .configuration(conf)
        .descriptor(descriptor)
        .partitionKey(descriptor.isPartitioned() ?
            com.cloudera.cdk.data.impl.Accessor.getDefault().newPartitionKey() :
            null)
        .build();


    logger.debug("Loaded dataset:{}", ds);

View Full Code Here

  public boolean delete(String name) {
    Preconditions.checkArgument(name != null, "Name can not be null");


    logger.debug("Deleting dataset:{}", name);


    final DatasetDescriptor descriptor;
    try {
      descriptor = metadataProvider.load(name);
    } catch (com.cloudera.cdk.data.NoSuchDatasetException ex) {
      return false;
    }


    boolean changed;
    try {
      // don't care about the return value here -- if it already doesn't exist
      // we still need to delete the data directory
      changed = metadataProvider.delete(name);
    } catch (MetadataProviderException ex) {
      throw new DatasetRepositoryException(
          "Failed to delete descriptor for name:" + name, ex);
    }


    final Path dataLocation = new Path(descriptor.getLocation());
    final FileSystem fs = fsForPath(dataLocation, conf);


    try {
      if (fs.exists(dataLocation)) {
        if (fs.delete(dataLocation, true)) {

View Full Code Here

  // This class is Immutable and must be thread-safe
  protected final ThreadLocal<StorageKey> keys;


  protected AbstractRangeView(Dataset<E> dataset) {
    this.dataset = dataset;
    final DatasetDescriptor descriptor = dataset.getDescriptor();
    if (descriptor.isPartitioned()) {
      this.range = new MarkerRange(new MarkerComparator(
          descriptor.getPartitionStrategy()));
      this.keys = new ThreadLocal<StorageKey>() {
        @Override
        protected StorageKey initialValue() {
          return new StorageKey(descriptor.getPartitionStrategy());
        }
      };
    } else {
      // use UNDEFINED, which handles inappropriate calls to range methods
      this.range = MarkerRange.UNDEFINED;

View Full Code Here

    }


    final Path metadataLocation = pathForMetadata(name);


    // get a DatasetDescriptor with the location set
    DatasetDescriptor newDescriptor = new DatasetDescriptor.Builder(descriptor)
        .location(dataLocation)
        .build();


    try {
      if (rootFileSystem.exists(metadataLocation)) {

View Full Code Here

0 1 2 3 4

TOP

Related Classes of com.cloudera.cdk.data.DatasetDescriptor

com.cloudera.cdk.data.filesystem.FileSystemDatasetRepository

com.cloudera.cdk.data.filesystem.FileSystemMetadataProvider

com.cloudera.cdk.data.filesystem.PartitionedDatasetWriter

com.cloudera.cdk.data.filesystem.TestCSVFileReader

com.cloudera.cdk.data.filesystem.TestFileSystemDatasetRepository

com.cloudera.cdk.data.filesystem.TestFileSystemMetadataProvider

com.cloudera.cdk.data.filesystem.TestMultiFileDatasetReader

com.cloudera.cdk.data.hbase.avro.example.UserProfileDatasetExample

com.cloudera.cdk.data.hbase.avro.HBaseDatasetReaderTest

com.cloudera.cdk.data.hbase.DaoViewTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.