Package com.adgear.anoa.source.schemaless

Source Code of com.adgear.anoa.source.schemaless.CsvSource

package com.adgear.anoa.source.schemaless;

import com.adgear.anoa.provider.base.CounterlessProviderBase;
import com.adgear.anoa.source.Source;

import org.apache.avro.Schema;
import org.codehaus.jackson.node.JsonNodeFactory;
import org.supercsv.io.CsvListReader;
import org.supercsv.prefs.CsvPreference;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
* Iterates over comma-separated values, exposed as String lists.
*
* @see com.adgear.anoa.source.schemaless.CsvWithHeaderSource
*/
public class CsvSource
    extends CounterlessProviderBase<List<String>> implements Source<List<String>> {

  final protected CsvListReader csvListReader;
  final protected Schema avroSchema;
  protected List<String> buffer = null;

  public CsvSource(Reader in) {
    this(in, false);
  }

  protected CsvSource(Reader in, boolean hasHeader) {
    this(in, CsvPreference.STANDARD_PREFERENCE, hasHeader);
  }

  protected CsvSource(Reader in, CsvPreference preference, boolean withHeader) {
    csvListReader = new CsvListReader(in, preference);
    String[] header = null;
    if (withHeader) {
      try {
        header = csvListReader.getHeader(true);
      } catch (IOException e) {
        logger.error(e.getMessage());
      }
    }
    avroSchema = (withHeader && header != null) ? inferSchema(header) : null;
  }


  @Override
  protected List<String> getNext() throws IOException {
    List<String> rValue = (buffer == null) ? csvListReader.read() : buffer;
    buffer = null;
    return rValue;
  }

  @Override
  public boolean hasNext() {
    if (buffer != null) {
      return true;
    }
    try {
      buffer = csvListReader.read();
    } catch (IOException e) {
      logger.error(e.getMessage());
    }
    return (buffer != null);
  }

  @Override
  public void close() throws IOException {
    csvListReader.close();
  }

  private Schema inferSchema(String[] header) {
    ArrayList<Schema.Field> fields = new ArrayList<Schema.Field>();
    int i = 0;
    StringBuilder sb = new StringBuilder();
    for (String name : header) {
      Schema[] union = new Schema[2];
      union[0] = Schema.create(Schema.Type.NULL);
      union[1] = Schema.create(Schema.Type.STRING);
      Schema.Field field = new Schema.Field(name,
                                            Schema.createUnion(Arrays.asList(union)),
                                            "induced from column " + (++i),
                                            JsonNodeFactory.instance.nullNode());
      fields.add(field);
      sb.append(field.toString());
    }
    Schema schema = Utils.createSchema("csv", sb.toString());
    schema.setFields(fields);
    return schema;
  }
}
TOP

Related Classes of com.adgear.anoa.source.schemaless.CsvSource

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.