/**
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.data.filesystem;
import com.cloudera.cdk.data.TestDatasetReaders;
import com.cloudera.cdk.data.DatasetDescriptor;
import com.cloudera.cdk.data.DatasetReader;
import com.cloudera.cdk.data.DatasetReaderException;
import com.cloudera.cdk.data.UnknownFormatException;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;
import java.io.IOException;
import org.apache.avro.generic.GenericData.Record;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import static com.cloudera.cdk.data.filesystem.DatasetTestUtilities.*;
import com.cloudera.cdk.data.impl.Accessor;
import org.apache.avro.generic.GenericData;
/**
 * Tests for {@link MultiFileDatasetReader}.
 *
 * <p>The shared reader checks inherited from {@link TestDatasetReaders} are run
 * against a reader over two copies of {@link #TEST_FILE}; the tests declared
 * here cover constructor argument validation, empty and single-file path
 * lists, unknown formats, and files that cannot be read.
 */
public class TestMultiFileDatasetReader extends TestDatasetReaders {

  /** Avro data file loaded from the test classpath. */
  public static final Path TEST_FILE = new Path(
      Resources.getResource("data/strings-100.avro").getFile());

  /**
   * Asserts that a record is non-null and that its "text" field equals the
   * record number modulo 100. The modulo lets the same validator be used when
   * {@link #TEST_FILE} appears more than once in the path list.
   */
  public static final RecordValidator<Record> VALIDATOR =
      new RecordValidator<Record>() {
        @Override
        public void validate(Record record, int recordNum) {
          Assert.assertNotNull(record);
          Assert.assertEquals(
              String.valueOf(recordNum % 100), record.get("text"));
        }
      };

  /** Descriptor built from {@code STRING_SCHEMA}, used by most tests. */
  public static final DatasetDescriptor DESCRIPTOR = new DatasetDescriptor
      .Builder().schema(STRING_SCHEMA).build();

  // Assigned in setUp() before each test method of this class.
  private FileSystem fileSystem;

  /**
   * Creates the reader exercised by the inherited tests: two copies of
   * {@link #TEST_FILE} read back to back.
   *
   * <p>This deliberately obtains its own {@link FileSystem} rather than using
   * the {@code fileSystem} field, so it does not depend on {@link #setUp()}
   * having run first.
   */
  @Override
  public DatasetReader newReader() throws IOException {
    return new MultiFileDatasetReader<GenericData.Record>(
        FileSystem.get(new Configuration()),
        Lists.newArrayList(TEST_FILE, TEST_FILE),
        DESCRIPTOR);
  }

  /** Record count expected from the reader built by {@link #newReader()}. */
  @Override
  public int getTotalRecords() {
    return 200;
  }

  /** Validator applied to records from the reader built by {@link #newReader()}. */
  @Override
  public DatasetTestUtilities.RecordValidator getValidator() {
    return VALIDATOR;
  }

  /** Obtains the file system for a default {@link Configuration}. */
  @Before
  public void setUp() throws IOException {
    this.fileSystem = FileSystem.get(new Configuration());
  }

  /** A reader over an empty path list is checked against zero records. */
  @Test
  public void testEmptyPathList() throws IOException {
    MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
        fileSystem, Lists.<Path>newArrayList(), DESCRIPTOR);

    checkReaderBehavior(reader, 0, VALIDATOR);
  }

  /** A reader over a single copy of the test file is checked against 100 records. */
  @Test
  public void testSingleFile() throws IOException {
    MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
        fileSystem, Lists.newArrayList(TEST_FILE), DESCRIPTOR);

    checkReaderBehavior(reader, 100, VALIDATOR);
  }

  /** Constructing a reader with a null file system must be rejected. */
  @Test(expected = IllegalArgumentException.class)
  public void testRequriesFileSystem() throws IOException {
    // Only the constructor is under test, so the instance is not kept.
    new MultiFileDatasetReader<Record>(
        null, Lists.newArrayList(TEST_FILE, TEST_FILE), DESCRIPTOR);
  }

  /** Constructing a reader with a null path list must be rejected. */
  @Test(expected = IllegalArgumentException.class)
  public void testRequriesFiles() throws IOException {
    new MultiFileDatasetReader<Record>(fileSystem, null, DESCRIPTOR);
  }

  /** Constructing a reader with a null descriptor must be rejected. */
  @Test(expected = IllegalArgumentException.class)
  public void testRequriesDescriptor() throws IOException {
    new MultiFileDatasetReader<Record>(
        fileSystem, Lists.newArrayList(TEST_FILE, TEST_FILE), null);
  }

  /**
   * A null entry in the path list must produce an
   * {@link IllegalArgumentException} no later than the first
   * {@code hasNext()} call.
   */
  @Test(expected = IllegalArgumentException.class)
  public void testRejectsNullPaths() throws IOException {
    MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
        fileSystem, Lists.newArrayList(null, TEST_FILE), DESCRIPTOR);

    try {
      reader.open();
      reader.hasNext();
    } finally {
      // Release the reader even though the test expects a failure above,
      // matching the cleanup done by the other tests in this class.
      reader.close();
    }
  }

  /** Opening a reader whose descriptor names an unknown format must fail. */
  @Test(expected = UnknownFormatException.class)
  public void testUnknownFormat() throws IOException {
    final DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(STRING_SCHEMA)
        .format(Accessor.getDefault().newFormat("explode!"))
        .build();

    MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
        fileSystem, Lists.newArrayList(TEST_FILE), descriptor);

    try {
      reader.open();
    } finally {
      reader.close();
    }
  }

  /**
   * A path that does not exist must surface as a
   * {@link DatasetReaderException} during iteration, not during open().
   */
  @Test(expected = DatasetReaderException.class)
  public void testMissingPath() throws IOException {
    Path missingFile = new Path("data/no-such-file.avro");

    MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
        fileSystem, Lists.newArrayList(missingFile, TEST_FILE), DESCRIPTOR);

    openThenIterate(reader, 200);
  }

  /**
   * An empty (zero-length) file must surface as a
   * {@link DatasetReaderException} during iteration, not during open().
   */
  @Test(expected = DatasetReaderException.class)
  public void testEmptyFile() throws IOException {
    // A per-run unique name keeps a leftover file from an aborted run (or a
    // concurrent run) from making createNewFile() fail.
    final Path emptyFile = new Path(
        "/tmp/empty-file-" + System.nanoTime() + ".avro");

    // outside the try block; if this fails then it isn't correct to remove it
    Assert.assertTrue("Failed to create a new empty file",
        fileSystem.createNewFile(emptyFile));

    try {
      MultiFileDatasetReader<Record> reader = new MultiFileDatasetReader<Record>(
          fileSystem, Lists.newArrayList(emptyFile, TEST_FILE), DESCRIPTOR);

      // should fail in iteration
      openThenIterate(reader, 200);
    } finally {
      Assert.assertTrue("Failed to clean up empty file",
          fileSystem.delete(emptyFile, true));
    }
  }

  /**
   * Opens the reader, requires that open() itself succeeds, then iterates and
   * always closes the reader.
   *
   * <p>IMPORTANT: for a list whose first file is bad, the
   * DatasetReaderException should be thrown while iterating, even though the
   * first reader is the problem. This is because open() should consistently
   * validate the incoming files -- either fail when any file is invalid or
   * not check the validity of any files. Because we don't want it to
   * instantiate all FileSystemDatasetReaders in open(), this verifies that
   * the behavior is the latter case.
   *
   * @param reader the unopened reader to exercise
   * @param expectedRecords the record count passed to the iteration check
   * @throws IOException declared to match the calling test methods
   */
  private void openThenIterate(MultiFileDatasetReader<Record> reader,
      int expectedRecords) throws IOException {
    try {
      try {
        reader.open();
      } catch (Throwable t) {
        // Any failure in open(), including the exception type the calling
        // test expects, must fail the test. Keep the original throwable as
        // the cause so its stack trace is not lost.
        AssertionError failure = new AssertionError(
            "Reader failed in open: " + t.getClass().getName());
        failure.initCause(t);
        throw failure;
      }

      Assert.assertTrue("Reader is not open after open()", reader.isOpen());
      checkReaderIteration(reader, expectedRecords, VALIDATOR);
    } finally {
      reader.close();
    }
  }
}