/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.data.filesystem;
import com.cloudera.cdk.data.DatasetDescriptor;
import com.cloudera.cdk.data.DatasetReader;
import com.cloudera.cdk.data.TestDatasetReaders;
import com.cloudera.cdk.data.TestHelpers;
import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.IOException;
public class TestCSVFileReader extends TestDatasetReaders<GenericData.Record> {
/*
* OpenCSV notes:
* - An empty unquoted field is passed as an empty string
*/
public static final String CSV_CONTENT = (
"str,34,2.11,false\r\n" +
"\"str,2\",,4,true\n" +
"str3,\"\",null");
public static final String VALIDATOR_CSV_CONTENT =
"id,string,even\n" +
"0,a,true\n" +
"1,b\n" +
"2,c,true\n";
public static final String TSV_CONTENT = (
"string\tinteger\tfloat\tbool\r" +
"str\t34\t2.11\tfalse\r\n" +
"\"str\t2\"\t\t4\ttrue\n" +
"str3\t\"\"\tnull");
public static FileSystem localfs = null;
public static Path csvFile = null;
public static Path validatorFile = null;
public static Path tsvFile = null;
public static Schema STRINGS = SchemaBuilder.record("Strings")
.fields()
.name("string1").type().stringType().noDefault()
.name("string2").type().stringType().noDefault()
.name("string3").type().stringType().noDefault()
.name("string4").type().stringType().stringDefault("missing value")
.endRecord();
public static final Schema VALIDATOR_SCHEMA = SchemaBuilder.record("Validator")
.fields()
.name("id").type().intType().noDefault()
.name("string").type().stringType().noDefault()
.name("even").type().booleanType().booleanDefault(false)
.endRecord();
public static Schema BEAN_SCHEMA = SchemaBuilder.record(TestBean.class.getName())
.fields()
.name("myString").type().stringType().noDefault()
.name("myInt").type().intType().intDefault(0)
.name("myFloat").type().floatType().noDefault()
.name("myBool").type().booleanType().booleanDefault(false)
.endRecord();
public static Schema SCHEMA = SchemaBuilder.record("Normal")
.fields()
.name("myString").type().stringType().noDefault()
.name("myInt").type().intType().intDefault(0)
.name("myFloat").type().floatType().noDefault()
.name("myBool").type().booleanType().booleanDefault(false)
.endRecord();
@BeforeClass
public static void createCSVFiles() throws IOException {
localfs = FileSystem.getLocal(new Configuration());
csvFile = new Path("target/temp.csv");
tsvFile = new Path("target/temp.tsv");
validatorFile = new Path("target/validator.csv");
FSDataOutputStream out = localfs.create(csvFile, true);
out.writeBytes(CSV_CONTENT);
out.close();
out = localfs.create(validatorFile, true);
out.writeBytes(VALIDATOR_CSV_CONTENT);
out.close();
out = localfs.create(tsvFile, true);
out.writeBytes(TSV_CONTENT);
out.close();
}
@Override
public DatasetReader<GenericData.Record> newReader() throws IOException {
final DatasetDescriptor desc = new DatasetDescriptor.Builder()
.property("cdk.csv.lines-to-skip", "1")
.schema(VALIDATOR_SCHEMA)
.build();
return new CSVFileReader<GenericData.Record>(localfs, validatorFile, desc);
}
@Override
public int getTotalRecords() {
return 3;
}
@Override
public DatasetTestUtilities.RecordValidator<GenericData.Record> getValidator() {
return new DatasetTestUtilities.RecordValidator<GenericData.Record>() {
private static final String chars = "abcdef";
@Override
public void validate(GenericData.Record record, int recordNum) {
Assert.assertEquals(recordNum, record.get("id"));
Assert.assertEquals(Character.toString(chars.charAt(recordNum)), record.get("string"));
Assert.assertEquals((recordNum % 2) == 0, record.get("even"));
}
};
}
@Test(expected = IllegalArgumentException.class)
public void testRejectsNonRecordSchemas() {
final DatasetDescriptor desc = new DatasetDescriptor.Builder()
.schema(SchemaBuilder.array().items().stringType())
.build();
new CSVFileReader(localfs, csvFile, desc);
}
@Test
public void testStringSchema() {
final DatasetDescriptor desc = new DatasetDescriptor.Builder()
.schema(STRINGS)
.build();
final CSVFileReader<GenericData.Record> reader =
new CSVFileReader<GenericData.Record>(localfs, csvFile, desc);
reader.open();
Assert.assertTrue(reader.hasNext());
GenericData.Record rec = reader.next();
Assert.assertEquals("str", rec.get(0));
Assert.assertEquals("34", rec.get(1));
Assert.assertEquals("2.11", rec.get(2));
Assert.assertEquals("false", rec.get(3));
Assert.assertTrue(reader.hasNext());
rec = reader.next();
Assert.assertEquals("str,2", rec.get(0));
Assert.assertEquals("", rec.get(1));
Assert.assertEquals("4", rec.get(2));
Assert.assertEquals("true", rec.get(3));
Assert.assertTrue(reader.hasNext());
rec = reader.next();
Assert.assertEquals("str3", rec.get(0));
Assert.assertEquals("", rec.get(1));
Assert.assertEquals("null", rec.get(2));
Assert.assertEquals("missing value", rec.get(3));
Assert.assertFalse(reader.hasNext());
}
@Test
public void testTSV() {
final DatasetDescriptor desc = new DatasetDescriptor.Builder()
.property("cdk.csv.delimiter", "\t")
.property("cdk.csv.lines-to-skip", "1")
.schema(STRINGS)
.build();
final CSVFileReader<GenericData.Record> reader =
new CSVFileReader<GenericData.Record>(localfs, tsvFile, desc);
reader.open();
Assert.assertTrue(reader.hasNext());
GenericData.Record rec = reader.next();
Assert.assertEquals("str", rec.get(0));
Assert.assertEquals("34", rec.get(1));
Assert.assertEquals("2.11", rec.get(2));
Assert.assertEquals("false", rec.get(3));
Assert.assertTrue(reader.hasNext());
rec = reader.next();
Assert.assertEquals("str\t2", rec.get(0));
Assert.assertEquals("", rec.get(1));
Assert.assertEquals("4", rec.get(2));
Assert.assertEquals("true", rec.get(3));
Assert.assertTrue(reader.hasNext());
rec = reader.next();
Assert.assertEquals("str3", rec.get(0));
Assert.assertEquals("", rec.get(1));
Assert.assertEquals("null", rec.get(2));
Assert.assertEquals("missing value", rec.get(3));
Assert.assertFalse(reader.hasNext());
}
@Test
public void testNormalSchema() {
final DatasetDescriptor desc = new DatasetDescriptor.Builder()
.schema(SCHEMA)
.build();
final CSVFileReader<GenericData.Record> reader =
new CSVFileReader<GenericData.Record>(localfs, csvFile, desc);
reader.open();
Assert.assertTrue(reader.hasNext());
GenericData.Record rec = reader.next();
Assert.assertEquals("str", rec.get(0));
Assert.assertEquals(34, rec.get(1));
Assert.assertEquals(2.11f, rec.get(2));
Assert.assertEquals(false, rec.get(3));
Assert.assertTrue(reader.hasNext());
rec = reader.next();
Assert.assertEquals("str,2", rec.get(0));
Assert.assertEquals(0, rec.get(1));
Assert.assertEquals(4.0f, rec.get(2));
Assert.assertEquals(true, rec.get(3));
Assert.assertTrue(reader.hasNext());
TestHelpers.assertThrows("Should complain about missing default",
AvroRuntimeException.class, new Runnable() {
@Override
public void run() {
reader.next();
}
});
Assert.assertFalse(reader.hasNext());
}
@Test
public void testReflectedRecords() {
final DatasetDescriptor desc = new DatasetDescriptor.Builder()
.schema(BEAN_SCHEMA)
.build();
final CSVFileReader<TestBean> reader =
new CSVFileReader<TestBean>(localfs, csvFile, desc);
reader.open();
Assert.assertTrue(reader.hasNext());
TestBean bean = reader.next();
Assert.assertEquals("str", bean.myStr);
Assert.assertEquals((Integer) 34, bean.myInt);
Assert.assertEquals((Float) 2.11f, bean.myFloat);
Assert.assertEquals(false, bean.myBool);
Assert.assertTrue(reader.hasNext());
bean = reader.next();
Assert.assertEquals("str,2", bean.myStr);
Assert.assertEquals((Integer) 0, bean.myInt);
Assert.assertEquals((Float) 4.0f, bean.myFloat);
Assert.assertEquals(true, bean.myBool);
Assert.assertTrue(reader.hasNext());
TestHelpers.assertThrows("Should complain about missing default",
AvroRuntimeException.class, new Runnable() {
@Override
public void run() {
reader.next();
}
});
Assert.assertFalse(reader.hasNext());
}
}