Package com.cloudera.cdk.data.filesystem

Source Code of com.cloudera.cdk.data.filesystem.TestMultiFileDatasetReader

/**
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.data.filesystem;

import com.cloudera.cdk.data.TestDatasetReaders;
import com.cloudera.cdk.data.DatasetDescriptor;
import com.cloudera.cdk.data.DatasetReader;
import com.cloudera.cdk.data.DatasetReaderException;
import com.cloudera.cdk.data.UnknownFormatException;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;
import java.io.IOException;
import org.apache.avro.generic.GenericData.Record;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import static com.cloudera.cdk.data.filesystem.DatasetTestUtilities.*;
import com.cloudera.cdk.data.impl.Accessor;
import org.apache.avro.generic.GenericData;

public class TestMultiFileDatasetReader extends TestDatasetReaders {

  public static final Path TEST_FILE = new Path(
      Resources.getResource("data/strings-100.avro").getFile());
  public static final RecordValidator<Record> VALIDATOR =
      new RecordValidator<Record>() {
      @Override
      public void validate(Record record, int recordNum) {
        Assert.assertNotNull(record);
        Assert.assertEquals(String.valueOf(recordNum % 100), record.get("text"));
      }
    };
  public static final DatasetDescriptor DESCRIPTOR = new DatasetDescriptor
      .Builder().schema(STRING_SCHEMA).build();

  @Override
  public DatasetReader newReader() throws IOException {
    return new MultiFileDatasetReader<GenericData.Record>(
        FileSystem.get(new Configuration()),
        Lists.newArrayList(TEST_FILE, TEST_FILE),
        DESCRIPTOR);
  }

  @Override
  public int getTotalRecords() {
    return 200;
  }

  @Override
  public DatasetTestUtilities.RecordValidator getValidator() {
    return VALIDATOR;
  }

  private FileSystem fileSystem;
  @Before
  public void setUp() throws IOException {
    this.fileSystem = FileSystem.get(new Configuration());
  }

  @Test
  public void testEmptyPathList() throws IOException {
    MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
        fileSystem, Lists.<Path>newArrayList(), DESCRIPTOR);

    checkReaderBehavior(reader, 0, VALIDATOR);
  }

  @Test
  public void testSingleFile() throws IOException {
    MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
        fileSystem, Lists.newArrayList(TEST_FILE), DESCRIPTOR);

    checkReaderBehavior(reader, 100, VALIDATOR);
  }

  @Test(expected = IllegalArgumentException.class)
  public void testRequriesFileSystem() throws IOException {
    MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
        null, Lists.newArrayList(TEST_FILE, TEST_FILE), DESCRIPTOR);
  }

  @Test(expected = IllegalArgumentException.class)
  public void testRequriesFiles() throws IOException {
    MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
        fileSystem, null, DESCRIPTOR);
  }

  @Test(expected = IllegalArgumentException.class)
  public void testRequriesDescriptor() throws IOException {
    MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
        fileSystem, Lists.newArrayList(TEST_FILE, TEST_FILE), null);
  }

  @Test(expected = IllegalArgumentException.class)
  public void testRejectsNullPaths() throws IOException {
    MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
        fileSystem, Lists.newArrayList(null, TEST_FILE), DESCRIPTOR);
    reader.open();
    reader.hasNext();
  }

  @Test(expected = UnknownFormatException.class)
  public void testUnknownFormat() throws IOException {
    final DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(STRING_SCHEMA)
        .format(Accessor.getDefault().newFormat("explode!"))
        .build();

    MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
        fileSystem, Lists.newArrayList(TEST_FILE), descriptor);

    try {
      reader.open();
    } finally {
      reader.close();
    }
  }

  @Test(expected = DatasetReaderException.class)
  public void testMissingPath() throws IOException {
    Path missingFile = new Path("data/no-such-file.avro");

    /*
     * IMPORTANT: The DatasetReaderException should be thrown while iterating,
     * even though the first reader is the problem. This is because open()
     * should consistently validate the incoming files -- either fail when any
     * file is invalid or not check the validity of any files. Because we don't
     * want it to instantiate all FileSystemDatasetReaders in open(), this
     * verifies that the behavior is the latter case.
     */

    MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
        fileSystem, Lists.newArrayList(missingFile, TEST_FILE), DESCRIPTOR);

    try {
      try {
        reader.open();
      } catch (Throwable t) {
        Assert.fail("Reader failed in open: " + t.getClass().getName());
      }

      Assert.assertTrue("Reader is not open after open()", reader.isOpen());

      checkReaderIteration(reader, 200, VALIDATOR);

    } finally {
      reader.close();
    }
  }

  @Test(expected = DatasetReaderException.class)
  public void testEmptyFile() throws IOException {
    final Path emptyFile = new Path("/tmp/empty-file.avro");

    // outside the try block; if this fails then it isn't correct to remove it
    Assert.assertTrue("Failed to create a new empty file",
        fileSystem.createNewFile(emptyFile));

    /*
     * IMPORTANT: The DatasetReaderException should be thrown while iterating,
     * even though the first reader is the problem. This is because open()
     * should consistently validate the incoming files -- either fail when any
     * file is invalid or not check the validity of any files. Because we don't
     * want it to instantiate all FileSystemDatasetReaders in open(), this
     * verifies that the behavior is the latter case.
     */

    try {
      MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
          fileSystem, Lists.newArrayList(emptyFile, TEST_FILE), DESCRIPTOR);

      try {
        try {
          reader.open();
        } catch (Throwable t) {
          Assert.fail("Reader failed in open: " + t.getClass().getName());
        }

        Assert.assertTrue("Reader is not open after open()", reader.isOpen());

        // should fail in iteration
        checkReaderIteration(reader, 200, VALIDATOR);

      } finally {
        reader.close();
      }

    } finally {
      Assert.assertTrue("Failed to clean up empty file",
          fileSystem.delete(emptyFile, true));
    }
  }
}
TOP

Related Classes of com.cloudera.cdk.data.filesystem.TestMultiFileDatasetReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.