Package com.cloudera.cdk.data.filesystem

Source Code of com.cloudera.cdk.data.filesystem.TestCSVFileReader

/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.cloudera.cdk.data.filesystem;

import com.cloudera.cdk.data.DatasetDescriptor;
import com.cloudera.cdk.data.DatasetReader;
import com.cloudera.cdk.data.TestDatasetReaders;
import com.cloudera.cdk.data.TestHelpers;
import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;

import java.io.IOException;

public class TestCSVFileReader extends TestDatasetReaders<GenericData.Record> {
  /*
   * OpenCSV notes:
   * - An empty unquoted field is passed as an empty string
   */

  public static final String CSV_CONTENT = (
      "str,34,2.11,false\r\n" +
      "\"str,2\",,4,true\n" +
      "str3,\"\",null");

  public static final String VALIDATOR_CSV_CONTENT =
      "id,string,even\n" +
          "0,a,true\n" +
          "1,b\n" +
          "2,c,true\n";

  public static final String TSV_CONTENT = (
      "string\tinteger\tfloat\tbool\r" +
      "str\t34\t2.11\tfalse\r\n" +
      "\"str\t2\"\t\t4\ttrue\n" +
      "str3\t\"\"\tnull");

  public static FileSystem localfs = null;
  public static Path csvFile = null;
  public static Path validatorFile = null;
  public static Path tsvFile = null;

  public static Schema STRINGS = SchemaBuilder.record("Strings")
      .fields()
      .name("string1").type().stringType().noDefault()
      .name("string2").type().stringType().noDefault()
      .name("string3").type().stringType().noDefault()
      .name("string4").type().stringType().stringDefault("missing value")
      .endRecord();

  public static final Schema VALIDATOR_SCHEMA = SchemaBuilder.record("Validator")
      .fields()
      .name("id").type().intType().noDefault()
      .name("string").type().stringType().noDefault()
      .name("even").type().booleanType().booleanDefault(false)
      .endRecord();

  public static Schema BEAN_SCHEMA = SchemaBuilder.record(TestBean.class.getName())
      .fields()
      .name("myString").type().stringType().noDefault()
      .name("myInt").type().intType().intDefault(0)
      .name("myFloat").type().floatType().noDefault()
      .name("myBool").type().booleanType().booleanDefault(false)
      .endRecord();

  public static Schema SCHEMA = SchemaBuilder.record("Normal")
      .fields()
      .name("myString").type().stringType().noDefault()
      .name("myInt").type().intType().intDefault(0)
      .name("myFloat").type().floatType().noDefault()
      .name("myBool").type().booleanType().booleanDefault(false)
      .endRecord();

  @BeforeClass
  public static void createCSVFiles() throws IOException {
    localfs = FileSystem.getLocal(new Configuration());
    csvFile = new Path("target/temp.csv");
    tsvFile = new Path("target/temp.tsv");
    validatorFile = new Path("target/validator.csv");

    FSDataOutputStream out = localfs.create(csvFile, true);
    out.writeBytes(CSV_CONTENT);
    out.close();

    out = localfs.create(validatorFile, true);
    out.writeBytes(VALIDATOR_CSV_CONTENT);
    out.close();

    out = localfs.create(tsvFile, true);
    out.writeBytes(TSV_CONTENT);
    out.close();
  }

  @Override
  public DatasetReader<GenericData.Record> newReader() throws IOException {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .property("cdk.csv.lines-to-skip", "1")
        .schema(VALIDATOR_SCHEMA)
        .build();
    return new CSVFileReader<GenericData.Record>(localfs, validatorFile, desc);
  }

  @Override
  public int getTotalRecords() {
    return 3;
  }

  @Override
  public DatasetTestUtilities.RecordValidator<GenericData.Record> getValidator() {
    return new DatasetTestUtilities.RecordValidator<GenericData.Record>() {
      private static final String chars = "abcdef";
      @Override
      public void validate(GenericData.Record record, int recordNum) {
        Assert.assertEquals(recordNum, record.get("id"));
        Assert.assertEquals(Character.toString(chars.charAt(recordNum)), record.get("string"));
        Assert.assertEquals((recordNum % 2) == 0, record.get("even"));
      }
    };
  }

  @Test(expected = IllegalArgumentException.class)
  public void testRejectsNonRecordSchemas() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .schema(SchemaBuilder.array().items().stringType())
        .build();
    new CSVFileReader(localfs, csvFile, desc);
  }

  @Test
  public void testStringSchema() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .schema(STRINGS)
        .build();
    final CSVFileReader<GenericData.Record> reader =
        new CSVFileReader<GenericData.Record>(localfs, csvFile, desc);

    reader.open();
    Assert.assertTrue(reader.hasNext());
    GenericData.Record rec = reader.next();
    Assert.assertEquals("str", rec.get(0));
    Assert.assertEquals("34", rec.get(1));
    Assert.assertEquals("2.11", rec.get(2));
    Assert.assertEquals("false", rec.get(3));

    Assert.assertTrue(reader.hasNext());
    rec = reader.next();
    Assert.assertEquals("str,2", rec.get(0));
    Assert.assertEquals("", rec.get(1));
    Assert.assertEquals("4", rec.get(2));
    Assert.assertEquals("true", rec.get(3));

    Assert.assertTrue(reader.hasNext());
    rec = reader.next();
    Assert.assertEquals("str3", rec.get(0));
    Assert.assertEquals("", rec.get(1));
    Assert.assertEquals("null", rec.get(2));
    Assert.assertEquals("missing value", rec.get(3));

    Assert.assertFalse(reader.hasNext());
  }

  @Test
  public void testTSV() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .property("cdk.csv.delimiter", "\t")
        .property("cdk.csv.lines-to-skip", "1")
        .schema(STRINGS)
        .build();
    final CSVFileReader<GenericData.Record> reader =
        new CSVFileReader<GenericData.Record>(localfs, tsvFile, desc);

    reader.open();
    Assert.assertTrue(reader.hasNext());
    GenericData.Record rec = reader.next();
    Assert.assertEquals("str", rec.get(0));
    Assert.assertEquals("34", rec.get(1));
    Assert.assertEquals("2.11", rec.get(2));
    Assert.assertEquals("false", rec.get(3));

    Assert.assertTrue(reader.hasNext());
    rec = reader.next();
    Assert.assertEquals("str\t2", rec.get(0));
    Assert.assertEquals("", rec.get(1));
    Assert.assertEquals("4", rec.get(2));
    Assert.assertEquals("true", rec.get(3));

    Assert.assertTrue(reader.hasNext());
    rec = reader.next();
    Assert.assertEquals("str3", rec.get(0));
    Assert.assertEquals("", rec.get(1));
    Assert.assertEquals("null", rec.get(2));
    Assert.assertEquals("missing value", rec.get(3));

    Assert.assertFalse(reader.hasNext());
  }

  @Test
  public void testNormalSchema() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .schema(SCHEMA)
        .build();
    final CSVFileReader<GenericData.Record> reader =
        new CSVFileReader<GenericData.Record>(localfs, csvFile, desc);

    reader.open();
    Assert.assertTrue(reader.hasNext());
    GenericData.Record rec = reader.next();
    Assert.assertEquals("str", rec.get(0));
    Assert.assertEquals(34, rec.get(1));
    Assert.assertEquals(2.11f, rec.get(2));
    Assert.assertEquals(false, rec.get(3));

    Assert.assertTrue(reader.hasNext());
    rec = reader.next();
    Assert.assertEquals("str,2", rec.get(0));
    Assert.assertEquals(0, rec.get(1));
    Assert.assertEquals(4.0f, rec.get(2));
    Assert.assertEquals(true, rec.get(3));

    Assert.assertTrue(reader.hasNext());
    TestHelpers.assertThrows("Should complain about missing default",
        AvroRuntimeException.class, new Runnable() {
      @Override
      public void run() {
        reader.next();
      }
    });

    Assert.assertFalse(reader.hasNext());
  }

  @Test
  public void testReflectedRecords() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .schema(BEAN_SCHEMA)
        .build();
    final CSVFileReader<TestBean> reader =
        new CSVFileReader<TestBean>(localfs, csvFile, desc);

    reader.open();
    Assert.assertTrue(reader.hasNext());
    TestBean bean = reader.next();
    Assert.assertEquals("str", bean.myStr);
    Assert.assertEquals((Integer) 34, bean.myInt);
    Assert.assertEquals((Float) 2.11f, bean.myFloat);
    Assert.assertEquals(false, bean.myBool);

    Assert.assertTrue(reader.hasNext());
    bean = reader.next();
    Assert.assertEquals("str,2", bean.myStr);
    Assert.assertEquals((Integer) 0, bean.myInt);
    Assert.assertEquals((Float) 4.0f, bean.myFloat);
    Assert.assertEquals(true, bean.myBool);

    Assert.assertTrue(reader.hasNext());
    TestHelpers.assertThrows("Should complain about missing default",
        AvroRuntimeException.class, new Runnable() {
      @Override
      public void run() {
        reader.next();
      }
    });

    Assert.assertFalse(reader.hasNext());
  }
}
TOP

Related Classes of com.cloudera.cdk.data.filesystem.TestCSVFileReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.