Package com.cloudera.cdk.data.filesystem

Source Code of com.cloudera.cdk.data.filesystem.CSVFileReader

/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.cloudera.cdk.data.filesystem;

import au.com.bytecode.opencsv.CSVReader;
import com.cloudera.cdk.data.DatasetDescriptor;
import com.cloudera.cdk.data.DatasetReaderException;
import com.cloudera.cdk.data.spi.AbstractDatasetReader;
import com.cloudera.cdk.data.spi.ReaderWriterState;
import com.google.common.base.Preconditions;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.reflect.ReflectData;
import org.apache.avro.specific.SpecificData;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.beans.IntrospectionException;
import java.beans.PropertyDescriptor;
import java.io.IOException;
import java.io.InputStreamReader;
import java.lang.reflect.InvocationTargetException;
import java.nio.charset.Charset;
import java.util.Locale;
import java.util.NoSuchElementException;

class CSVFileReader<E> extends AbstractDatasetReader<E> {

  private static final Logger logger = LoggerFactory
      .getLogger(CSVFileReader.class);

  public static final String CHARSET_PROPERTY = "cdk.csv.charset";
  public static final String DELIMITER_PROPERTY = "cdk.csv.delimiter";
  public static final String QUOTE_CHAR_PROPERTY = "cdk.csv.quote-char";
  public static final String ESCAPE_CHAR_PROPERTY = "cdk.csv.escape-char";
  public static final String LINES_TO_SKIP_PROPERTY = "cdk.csv.lines-to-skip";

  public static final String DEFAULT_CHARSET = "utf8";
  public static final String DEFAULT_DELIMITER = ",";
  public static final String DEFAULT_QUOTE = "\"";
  public static final String DEFAULT_ESCAPE = "\\";
  public static final int DEFAULT_LINES_TO_SKIP = 0;

  private final FileSystem fs;
  private final Path path;
  private final Schema schema;

  // configuration
  private String charset = DEFAULT_CHARSET;
  private String delimiter = DEFAULT_DELIMITER;
  private String quote = DEFAULT_QUOTE;
  private String escape = DEFAULT_ESCAPE;
  private int linesToSkip = DEFAULT_LINES_TO_SKIP;
  private Class<E> recordClass = null;

  private CSVReader reader = null;

  // state
  private ReaderWriterState state = ReaderWriterState.NEW;
  private boolean hasNext = false;
  private String[] next = null;

  public CSVFileReader(FileSystem fileSystem, Path path, DatasetDescriptor descriptor) {
    this.fs = fileSystem;
    this.path = path;
    this.schema = descriptor.getSchema();
    this.state = ReaderWriterState.NEW;

    Schema schema = descriptor.getSchema();
    Preconditions.checkArgument(Schema.Type.RECORD.equals(schema.getType()),
        "Schemas for CSV files must be records of primitive types");

    String charset = descriptor.getProperty(CHARSET_PROPERTY);
    if (charset != null) {
      this.charset = charset;
    }
    String delimiter = descriptor.getProperty(DELIMITER_PROPERTY);
    if (delimiter != null) {
      this.delimiter = delimiter;
    }
    String quote = descriptor.getProperty(QUOTE_CHAR_PROPERTY);
    if (quote != null) {
      this.quote = quote;
    }
    String escape = descriptor.getProperty(ESCAPE_CHAR_PROPERTY);
    if (escape != null) {
      this.escape = escape;
    }
    String linesToSkip = descriptor.getProperty(LINES_TO_SKIP_PROPERTY);
    if (linesToSkip != null) {
      try {
        this.linesToSkip = Integer.valueOf(linesToSkip);
      } catch (NumberFormatException ex) {
        logger.debug("Defaulting lines to skip, failed to parse: {}",
            linesToSkip);
        // linesToSkip remains set to the default
      }
    }
  }

  @Override
  @SuppressWarnings("unchecked")
  public void open() {
    Preconditions.checkState(state.equals(ReaderWriterState.NEW),
        "A reader may not be opened more than once - current state:%s", state);

    // may be null if not using specific records
    this.recordClass = SpecificData.get().getClass(schema);

    FSDataInputStream incoming;
    try {
      incoming =  fs.open(path);
    } catch (IOException ex) {
      throw new DatasetReaderException("Cannot open path: " + path, ex);
    }

    this.reader = new CSVReader(
        new InputStreamReader(incoming, Charset.forName(charset)),
        delimiter.charAt(0), quote.charAt(0), escape.charAt(0), linesToSkip,
        false /* strict quotes off: don't ignore unquoted strings */,
        true /* ignore leading white-space */ );

    // initialize by reading the first record
    this.hasNext = advance();

    this.state = ReaderWriterState.OPEN;
  }

  @Override
  public boolean hasNext() {
    Preconditions.checkState(state.equals(ReaderWriterState.OPEN),
        "Attempt to read from a file in state:%s", state);
    return hasNext;
  }

  @Override
  public E next() {
    Preconditions.checkState(state.equals(ReaderWriterState.OPEN),
        "Attempt to read from a file in state:%s", state);

    if (!hasNext) {
      throw new NoSuchElementException();
    }

    try {
      E record = makeRecord();
      return record;
    } finally {
      this.hasNext = advance();
    }
  }

  private boolean advance() {
    try {
      next = reader.readNext();
    } catch (IOException ex) {
      throw new DatasetReaderException("Could not read record", ex);
    }
    return (next != null);
  }

  @Override
  public void close() {
    if (!state.equals(ReaderWriterState.OPEN)) {
      return;
    }

    logger.debug("Closing reader on path:{}", path);

    try {
      reader.close();
    } catch (IOException e) {
      throw new DatasetReaderException("Unable to close reader path:" + path, e);
    }

    state = ReaderWriterState.CLOSED;
  }

  @Override
  public boolean isOpen() {
    return (this.state == ReaderWriterState.OPEN);
  }

  private E makeRecord() {
    if (recordClass != null) {
      E record = makeReflectRecord();
      if (record != null) {
        return record;
      }
    }
    return makeGenericRecord();
  }

  @SuppressWarnings("unchecked")
  private E makeGenericRecord() {
    GenericRecord record = new GenericData.Record(schema);
    fillIndexed(record, next);
    return (E) record;
  }

  @SuppressWarnings("unchecked")
  private E makeReflectRecord() {
    E record = (E) ReflectData.get().newInstance(recordClass, schema);
    if (record instanceof IndexedRecord) {
      fillIndexed((IndexedRecord) record, next);
    } else {
      fillReflect(record, next, schema);
    }
    return record;
  }

  private static void fillIndexed(IndexedRecord record, String[] data) {
    Schema schema = record.getSchema();
    for (int i = 0, n = schema.getFields().size(); i < n; i += 1) {
      final Schema.Field field = schema.getFields().get(i);
      if (i < data.length) {
        record.put(i, makeValue(data[i], field));
      } else {
        record.put(i, makeValue(null, field));
      }
    }
  }

  private static void fillReflect(Object record, String[] data, Schema schema) {
    for (int i = 0, n = schema.getFields().size(); i < n; i += 1) {
      final Schema.Field field = schema.getFields().get(i);
      final Object value = makeValue(i < data.length ? data[i] : null, field);
      try {
        final PropertyDescriptor propertyDescriptor = new PropertyDescriptor(
                field.name(), record.getClass(), null, setter(field.name()));
        propertyDescriptor.getWriteMethod().invoke(record, value);
      } catch (IntrospectionException ex) {
        throw new IllegalStateException("Cannot set property " + field.name() +
            " on " + record.getClass().getName(), ex);
      } catch (InvocationTargetException ex) {
        throw new IllegalStateException("Cannot set property " + field.name() +
            " on " + record.getClass().getName(), ex);
      } catch (IllegalAccessException ex) {
        throw new IllegalStateException("Cannot set property " + field.name() +
            " on " + record.getClass().getName(), ex);
      }
    }
  }

  private static String setter(String name) {
    return "set" +
        name.substring(0, 1).toUpperCase(Locale.ENGLISH) +
        name.substring(1);
  }

  private static Object makeValue(String string, Schema.Field field) {
    Object value = makeValue(string, field.schema());
    if (value != null || nullOk(field.schema())) {
      return value;
    } else {
      // this will fail if there is no default value
      return ReflectData.get().getDefaultValue(field);
    }
  }

  /**
   * Returns a the value as the first matching schema type or null.
   *
   * Note that if the value may be null even if the schema does not allow the
   * value to be null.
   *
   * @param string a String representation of the value
   * @param schema a Schema
   * @return the string coerced to the correct type from the schema or null
   */
  private static Object makeValue(String string, Schema schema) {
    if (string == null) {
      return null;
    }

    try {
      switch (schema.getType()) {
        case BOOLEAN:
          return Boolean.valueOf(string);
        case STRING:
          return string;
        case FLOAT:
          return Float.valueOf(string);
        case DOUBLE:
          return Double.valueOf(string);
        case INT:
          return Integer.valueOf(string);
        case LONG:
          return Long.valueOf(string);
        case ENUM:
          // TODO: translate to enum class
          if (schema.hasEnumSymbol(string)) {
            return string;
          } else {
            try {
              return schema.getEnumSymbols().get(Integer.valueOf(string));
            } catch (IndexOutOfBoundsException ex) {
              return null;
            }
          }
        case UNION:
          Object value = null;
          for (Schema possible : schema.getTypes()) {
            value = makeValue(string, possible);
            if (value != null) {
              return value;
            }
          }
          return null;
        default:
          // FIXED, BYTES, MAP, ARRAY, RECORD are not supported
          throw new DatasetReaderException(
              "Unsupported field type:" + schema.getType());
      }
    } catch (NumberFormatException ex) {
      return null;
    }
  }

  /**
   * Returns whether null is allowed by the schema.
   *
   * @param schema a Schema
   * @return true if schema allows the value to be null
   */
  private static boolean nullOk(Schema schema) {
    if (Schema.Type.NULL == schema.getType()) {
      return true;
    } else if (Schema.Type.UNION == schema.getType()) {
      for (Schema possible : schema.getTypes()) {
        if (nullOk(possible)) {
          return true;
        }
      }
    }
    return false;
  }
}
TOP

Related Classes of com.cloudera.cdk.data.filesystem.CSVFileReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.