Package com.cloudera.cdk.data

Examples of com.cloudera.cdk.data.DatasetDescriptor


    Assert.assertFalse(reader.hasNext());
  }

  @Test
  public void testTSV() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .property("cdk.csv.delimiter", "\t")
        .property("cdk.csv.lines-to-skip", "1")
        .schema(STRINGS)
        .build();
    final CSVFileReader<GenericData.Record> reader =
View Full Code Here


    Assert.assertFalse(reader.hasNext());
  }

  @Test
  public void testNormalSchema() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .schema(SCHEMA)
        .build();
    final CSVFileReader<GenericData.Record> reader =
        new CSVFileReader<GenericData.Record>(localfs, csvFile, desc);
View Full Code Here

    Assert.assertFalse(reader.hasNext());
  }

  @Test
  public void testReflectedRecords() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .schema(BEAN_SCHEMA)
        .build();
    final CSVFileReader<TestBean> reader =
        new CSVFileReader<TestBean>(localfs, csvFile, desc);
View Full Code Here

        "Descriptor can not be null");
    Preconditions.checkArgument(descriptor.getLocation() == null,
        "Descriptor location cannot be set; " +
        "it is assigned by the MetadataProvider");

    final DatasetDescriptor newDescriptor = metadataProvider
        .create(name, descriptor);

    final URI location = newDescriptor.getLocation();
    if (location == null) {
      throw new DatasetRepositoryException(
          "[BUG] MetadataProvider did not assign a location to dataset:" +
          name);
    }

    ensureExists(newDescriptor, conf);

    logger.debug("Created dataset:{} schema:{} datasetPath:{}", new Object[] {
        name, newDescriptor.getSchema(), location.toString() });

    return new FileSystemDataset.Builder()
        .name(name)
        .configuration(conf)
        .descriptor(newDescriptor)
        .partitionKey(newDescriptor.isPartitioned() ?
            com.cloudera.cdk.data.impl.Accessor.getDefault().newPartitionKey() :
            null)
        .build();
  }
View Full Code Here

  public <E> Dataset<E> update(String name, DatasetDescriptor descriptor) {
    Preconditions.checkArgument(name != null, "Dataset name cannot be null");
    Preconditions.checkArgument(descriptor != null,
        "DatasetDescriptro cannot be null");

    DatasetDescriptor oldDescriptor = metadataProvider.load(name);

    // oldDescriptor is valid if load didn't throw NoSuchDatasetException

    if (!oldDescriptor.getFormat().equals(descriptor.getFormat())) {
      throw new DatasetRepositoryException("Cannot change dataset format from " +
          oldDescriptor.getFormat() + " to " + descriptor.getFormat());
    }

    final URI oldLocation = oldDescriptor.getLocation();
    if ((oldLocation != null) && !(oldLocation.equals(descriptor.getLocation()))) {
      throw new DatasetRepositoryException(
          "Cannot change the dataset's location");
    }

    if (oldDescriptor.isPartitioned() != descriptor.isPartitioned()) {
      throw new DatasetRepositoryException("Cannot change an unpartitioned dataset to " +
          " partitioned or vice versa.");
    } else if (oldDescriptor.isPartitioned() && descriptor.isPartitioned() &&
        !oldDescriptor.getPartitionStrategy().equals(descriptor.getPartitionStrategy())) {
      throw new DatasetRepositoryException("Cannot change partition strategy from " +
          oldDescriptor.getPartitionStrategy() + " to " + descriptor.getPartitionStrategy());
    }

    // check can read records written with old schema using new schema
    final Schema oldSchema = oldDescriptor.getSchema();
    final Schema newSchema = descriptor.getSchema();
    if (!SchemaValidationUtil.canRead(oldSchema, newSchema)) {
      throw new IncompatibleSchemaException("New schema cannot read data " +
          "written using " +
          "old schema. New schema: " + newSchema.toString(true) + "\nOld schema: " +
          oldSchema.toString(true));
    }

    final DatasetDescriptor updatedDescriptor = metadataProvider
        .update(name, descriptor);

    logger.debug("Updated dataset:{} schema:{} datasetPath:{}", new Object[] {
        name, updatedDescriptor.getSchema(),
        updatedDescriptor.getLocation().toString() });

    return new FileSystemDataset.Builder()
        .name(name)
        .configuration(conf)
        .descriptor(updatedDescriptor)
        .partitionKey(updatedDescriptor.isPartitioned() ?
            com.cloudera.cdk.data.impl.Accessor.getDefault().newPartitionKey() :
            null)
        .build();
  }
View Full Code Here

    HBaseDatasetRepository repo = new HBaseDatasetRepository.Builder()
        .configuration(conf).build();

    // TODO: change to use namespace (CDK-140)

    DatasetDescriptor userProfileDatasetDescriptor =
        new DatasetDescriptor.Builder().schema(UserProfileModel2.SCHEMA$).build();
    userProfileDataset = repo.create("cdk_example_user_profiles.UserProfileModel2",
        userProfileDatasetDescriptor);

    DatasetDescriptor userActionsDatasetDescriptor =
        new DatasetDescriptor.Builder().schema(UserActionsModel2.SCHEMA$).build();
    userActionsDataset = repo.create("cdk_example_user_profiles.UserActionsModel2",
        userActionsDatasetDescriptor);

    DatasetDescriptor userProfileActionsDatasetDescriptor =
        new DatasetDescriptor.Builder().schema(UserProfileActionsModel2.SCHEMA$).build();
    userProfileActionsDataset = repo.create("cdk_example_user_profiles.UserProfileActionsProtocol2",
        userProfileActionsDatasetDescriptor);

  }
View Full Code Here

  public <E> Dataset<E> load(String name) {
    Preconditions.checkArgument(name != null, "Name can not be null");

    logger.debug("Loading dataset:{}", name);

    DatasetDescriptor descriptor = metadataProvider.load(name);

    FileSystemDataset<E> ds = new FileSystemDataset.Builder()
        .name(name)
        .configuration(conf)
        .descriptor(descriptor)
        .partitionKey(descriptor.isPartitioned() ?
            com.cloudera.cdk.data.impl.Accessor.getDefault().newPartitionKey() :
            null)
        .build();

    logger.debug("Loaded dataset:{}", ds);
View Full Code Here

  public boolean delete(String name) {
    Preconditions.checkArgument(name != null, "Name can not be null");

    logger.debug("Deleting dataset:{}", name);

    final DatasetDescriptor descriptor;
    try {
      descriptor = metadataProvider.load(name);
    } catch (com.cloudera.cdk.data.NoSuchDatasetException ex) {
      return false;
    }

    boolean changed;
    try {
      // don't care about the return value here -- if it already doesn't exist
      // we still need to delete the data directory
      changed = metadataProvider.delete(name);
    } catch (MetadataProviderException ex) {
      throw new DatasetRepositoryException(
          "Failed to delete descriptor for name:" + name, ex);
    }

    final Path dataLocation = new Path(descriptor.getLocation());
    final FileSystem fs = fsForPath(dataLocation, conf);

    try {
      if (fs.exists(dataLocation)) {
        if (fs.delete(dataLocation, true)) {
View Full Code Here

  // This class is Immutable and must be thread-safe
  protected final ThreadLocal<StorageKey> keys;

  protected AbstractRangeView(Dataset<E> dataset) {
    this.dataset = dataset;
    final DatasetDescriptor descriptor = dataset.getDescriptor();
    if (descriptor.isPartitioned()) {
      this.range = new MarkerRange(new MarkerComparator(
          descriptor.getPartitionStrategy()));
      this.keys = new ThreadLocal<StorageKey>() {
        @Override
        protected StorageKey initialValue() {
          return new StorageKey(descriptor.getPartitionStrategy());
        }
      };
    } else {
      // use UNDEFINED, which handles inappropriate calls to range methods
      this.range = MarkerRange.UNDEFINED;
View Full Code Here

    }

    final Path metadataLocation = pathForMetadata(name);

    // get a DatasetDescriptor with the location set
    DatasetDescriptor newDescriptor = new DatasetDescriptor.Builder(descriptor)
        .location(dataLocation)
        .build();

    try {
      if (rootFileSystem.exists(metadataLocation)) {
View Full Code Here

TOP

Related Classes of com.cloudera.cdk.data.DatasetDescriptor

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.