Package com.cloudera.cdk.data.filesystem

Source Code of com.cloudera.cdk.data.filesystem.TestFileSystemDatasetRepository

/**
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.data.filesystem;

import com.cloudera.cdk.data.TestDatasetRepositories;
import com.cloudera.cdk.data.Dataset;
import com.cloudera.cdk.data.DatasetDescriptor;
import com.cloudera.cdk.data.DatasetRepository;
import com.cloudera.cdk.data.DatasetRepositoryException;
import com.cloudera.cdk.data.Formats;
import com.cloudera.cdk.data.MetadataProvider;
import com.cloudera.cdk.data.PartitionStrategy;
import java.io.IOException;
import java.net.URI;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.hadoop.fs.Path;
import org.junit.Assert;
import org.junit.Test;

import static com.cloudera.cdk.data.filesystem.DatasetTestUtilities.checkTestUsers;
import static com.cloudera.cdk.data.filesystem.DatasetTestUtilities.writeTestUsers;
import org.apache.avro.generic.GenericData.Record;

public class TestFileSystemDatasetRepository extends TestDatasetRepositories {

  public TestFileSystemDatasetRepository(boolean distributed) {
    super(distributed);
  }

  @Override
  public DatasetRepository newRepo(MetadataProvider provider) {
    // this purposely does not set the Configuration to test that the code
    // relies on filesystem URIs set in the DatasetDescriptor.
    return new FileSystemDatasetRepository.Builder()
        .metadataProvider(provider)
        .build();
  }

  @Test
  public void testCreatePath() throws IOException {
    Dataset<Record> created = repo.create(NAME, testDescriptor);

    URI location = created.getDescriptor().getLocation();
    Assert.assertNotNull(
        "FileSystemDatasetRepository should return descriptor locations",
        location);
    Assert.assertTrue("Dataset data directory:" + location + " should exist",
        fileSystem.exists(new Path(location)));
  }

  @Test
  public void testLoadNewHasZeroSize() {
    ensureCreated();

    Dataset<Record> dataset = repo.load(NAME);

    /*
     * We perform a read test to make sure only data files are encountered
     * during a read.
     */
    Assert.assertEquals(0, DatasetTestUtilities.datasetSize(dataset));
  }

  @Test
  public void testUpdateFailsWithFormatChange() {
    Dataset<Record> dataset = repo.create(NAME,
        new DatasetDescriptor.Builder(testDescriptor)
            .format(Formats.AVRO)
            .build());

    DatasetDescriptor changed =
        new DatasetDescriptor.Builder(dataset.getDescriptor())
        .format(Formats.PARQUET)
        .build();

    try {
      repo.update(NAME, changed);
      Assert.fail("Should fail due to format change");
    } catch (DatasetRepositoryException e) {
      // expected
    }

    Assert.assertEquals(
        Formats.AVRO, repo.load(NAME).getDescriptor().getFormat());
  }

  @Test
  public void testUpdateFailsWithPartitionStrategyChange() {
    PartitionStrategy ps1 = new PartitionStrategy.Builder()
        .hash("username", 2)
        .build();
    PartitionStrategy ps2 = new PartitionStrategy.Builder()
        .hash("username", 2)
        .hash("email", 3)
        .build();

    Dataset<Record> dataset = repo.create(NAME,
        new DatasetDescriptor.Builder(testDescriptor)
            .partitionStrategy(ps1)
            .build());

    DatasetDescriptor changed =
        new DatasetDescriptor.Builder(dataset.getDescriptor())
            .partitionStrategy(ps2)
            .build();

    try {
      repo.update(NAME, changed);
      Assert.fail("Should fail due to partition strategy change");
    } catch (DatasetRepositoryException e) {
      // expected
    }

    Assert.assertEquals(
        ps1, repo.load(NAME).getDescriptor().getPartitionStrategy());
  }

  @Test
  public void testUpdateFailsWithLocationChange() {
    ensureCreated();
    Dataset<Record> dataset = repo.load(NAME);
    URI location = dataset.getDescriptor().getLocation();

    DatasetDescriptor changed =
        new DatasetDescriptor.Builder(dataset.getDescriptor())
            .location(new Path(testDirectory, "newDataLocation").toUri())
            .build();

    try {
      repo.update(NAME, changed);
      Assert.fail("Should fail due to data location change");
    } catch (DatasetRepositoryException ex) {
      // expected
    }

    Assert.assertEquals(
        location, repo.load(NAME).getDescriptor().getLocation());
  }

  @Test
  public void testUpdateFailsWithIncompatibleSchemaChange() {
    Dataset<Record> dataset = repo.create(NAME, new DatasetDescriptor.Builder()
        .schema(testSchema).build());

    Assert.assertEquals("Dataset name is propagated", NAME,
        dataset.getName());
    Assert.assertEquals("Dataset schema is propagated", testSchema, dataset
        .getDescriptor().getSchema());

    Schema testSchemaV2 = SchemaBuilder.record("user").fields()
        .requiredString("username")
        .requiredString("email")
        .requiredString("favoriteColor") // incompatible - no default
        .endRecord();

    try {
      repo.update(NAME, new DatasetDescriptor.Builder().schema(testSchemaV2).build());
      Assert.fail("Should fail due to incompatible update");
    } catch (DatasetRepositoryException e) {
      // expected
    }
    dataset = repo.load(NAME);
    Assert.assertEquals("Dataset schema is unchanged", testSchema, dataset
        .getDescriptor().getSchema());
  }

  @Test
  public void testUpdateSuccessfulWithCompatibleSchemaChangeFieldAdded() {
    Dataset<Record> dataset = repo.create(NAME, new DatasetDescriptor.Builder()
        .schema(testSchema).build());

    writeTestUsers(dataset, 5, 0, "email");
    checkTestUsers(dataset, 5, "email");

    Schema testSchemaV2 = SchemaBuilder.record("user").fields()
        .requiredString("username")
        .requiredString("email")
        .nullableString("favoriteColor", "orange")
        .endRecord();

    Dataset<Record> datasetV2 = repo.update(NAME,
        new DatasetDescriptor.Builder(dataset.getDescriptor())
            .schema(testSchemaV2)
            .build());

    Assert.assertEquals("Dataset schema is updated", testSchemaV2, datasetV2
        .getDescriptor().getSchema());

    // test that the old records can be read back with the new schema
    checkTestUsers(datasetV2, 5, "email", "favoriteColor");

    // write more users and test that the mixed set can be read back with the new schema
    writeTestUsers(datasetV2, 5, 5, "email", "favoriteColor");
    checkTestUsers(datasetV2, 10, "email", "favoriteColor");
  }

  @Test
  public void testUpdateSuccessfulWithCompatibleSchemaChangeFieldRemoved() {
    Dataset<Record> dataset = repo.create(NAME, new DatasetDescriptor.Builder()
        .schema(testSchema).build());

    writeTestUsers(dataset, 5, 0, "email");
    checkTestUsers(dataset, 5, "email");

    Schema testSchemaV2 = SchemaBuilder.record("user").fields()
        .requiredString("username")
        .endRecord();

    Dataset<Record> datasetV2 = repo.update(NAME,
        new DatasetDescriptor.Builder(dataset.getDescriptor())
            .schema(testSchemaV2)
            .build());

    Assert.assertEquals("Dataset schema is updated", testSchemaV2, datasetV2
        .getDescriptor().getSchema());

    // test that the old records can be read back with the new schema
    checkTestUsers(datasetV2, 5, new String[0]);

    // write more users and test that the mixed set can be read back with the new schema
    writeTestUsers(datasetV2, 5, 5, new String[0]);
    checkTestUsers(datasetV2, 10, new String[0]);
  }

  @Test
  public void testDeleteRemovesDatasetPath() throws IOException {
    ensureCreated();

    Dataset<Record> dataset = repo.load(NAME);
    Path dataPath = new Path(dataset.getDescriptor().getLocation());
    Assert.assertTrue(fileSystem.exists(dataPath));

    repo.delete(NAME);

    Assert.assertFalse(fileSystem.exists(dataPath));
  }
}
TOP

Related Classes of com.cloudera.cdk.data.filesystem.TestFileSystemDatasetRepository

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.