Source Code of org.kitesdk.data.spi.filesystem.CSVUtil

/*
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.kitesdk.data.spi.filesystem;


import au.com.bytecode.opencsv.CSVReader;
import au.com.bytecode.opencsv.CSVWriter;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.avro.Schema;
import org.kitesdk.data.DatasetException;
import org.kitesdk.data.spi.Compatibility;


public class CSVUtil {


  public static CSVReader newReader(InputStream incoming, CSVProperties props) {
    return new CSVReader(
        new InputStreamReader(incoming, Charset.forName(props.charset)),
        props.delimiter.charAt(0), props.quote.charAt(0),
        props.escape.charAt(0), props.linesToSkip,
        false /* strict quotes off: don't ignore unquoted strings */,
        true /* ignore leading white-space */ );
  }


  public static CSVWriter newWriter(OutputStream outgoing, CSVProperties props) {
    return new CSVWriter(new OutputStreamWriter(
        outgoing, Charset.forName(props.charset)),
        props.delimiter.charAt(0), props.quote.charAt(0),
        props.escape.charAt(0));
  }


  private static final Pattern LONG = Pattern.compile("\\d+");
  private static final Pattern DOUBLE = Pattern.compile("\\d*\\.\\d*[dD]?");
  private static final Pattern FLOAT = Pattern.compile("\\d*\\.\\d*[fF]?");
  private static final int DEFAULT_INFER_LINES = 25;


  public static Schema inferNullableSchema(String name, InputStream incoming,
                                   CSVProperties props)
      throws IOException {
    return inferSchemaInternal(name, incoming, props, true);
  }


  public static Schema inferSchema(String name, InputStream incoming,
                                           CSVProperties props)
      throws IOException {
    return inferSchemaInternal(name, incoming, props, false);
  }


  private static Schema inferSchemaInternal(String name, InputStream incoming,
                                   CSVProperties props, boolean makeNullable)
      throws IOException {
    CSVReader reader = newReader(incoming, props);


    String[] header;
    String[] line;
    if (props.useHeader) {
      // read the header and then the first line
      header = reader.readNext();
      checkHeader(header);
      line = reader.readNext();
      Preconditions.checkNotNull(line, "No content to infer schema");


    } else {
      // use the first line to create a header
      line = reader.readNext();
      Preconditions.checkNotNull(line, "No content to infer schema");
      header = new String[line.length];
      for (int i = 0; i < line.length; i += 1) {
        header[i] = "field_" + String.valueOf(i);
      }
    }


    Schema.Type[] types = new Schema.Type[header.length];
    String[] values = new String[header.length];
    boolean[] nullable = new boolean[header.length];


    for (int processed = 0; processed < DEFAULT_INFER_LINES; processed += 1) {
      for (int i = 0; i < header.length; i += 1) {
        if (types[i] == null) {
          types[i] = inferFieldType(line[i]);
          if (types[i] == null) {
            nullable[i] = true;
          } else {
            // keep track of the value used
            values[i] = line[i];
          }
        } else if (line[i] == null || line[i].isEmpty()) {
          nullable[i] = true;
        }
      }
      line = reader.readNext();
      if (line == null) {
        break;
      }
    }


    // types may be missing, but fieldSchema will return a nullable string
    List<Schema.Field> fields = Lists.newArrayList();
    for (int i = 0; i < header.length; i += 1) {
      if (header[i] == null) {
        throw new DatasetException("Bad header for field " + i + ": null");
      } else if (header[i].trim().isEmpty()) {
        throw new DatasetException(
            "Bad header for field " + i + ": \"" + header[i] + "\"");
      } else if(!Compatibility.isAvroCompatibleName(header[i].trim())) {
        throw new DatasetException(
              "Bad header for field, should start with a character " +
              "or _ and can contain only alphanumerics and _ " +
              i + ": \"" + header[i] + "\"");
      }
      fields.add(new Schema.Field(
          header[i].trim(), schema(types[i], makeNullable || nullable[i]),
          "Type inferred from '" + String.valueOf(values[i]) + "'", null));
    }


    Schema record = Schema.createRecord(
        name, "Schema generated by Kite", null, false);
    record.setFields(fields);
    return record;
  }


  /**
   * Create a {@link Schema} for the given type. If the type is null,
   * the schema will be a nullable String. If isNullable is true, the returned
   * schema will be nullable.
   *
   * @param type a {@link Schema.Type} compatible with {@code Schema.create}
   * @param makeNullable If {@code true}, the return type will be nullable
   * @return a {@code Schema} for the given {@code Schema.Type}
   * @see Schema#create(org.apache.avro.Schema.Type)
   */
  private static Schema schema(Schema.Type type, boolean makeNullable) {
    Schema schema = Schema.create(type == null ? Schema.Type.STRING : type);
    if (makeNullable || type == null) {
      schema = Schema.createUnion(Lists.newArrayList(
          Schema.create(Schema.Type.NULL), schema));
    }
    return schema;
  }


  private static Schema.Type inferFieldType(String example) {
    if (example == null || example.isEmpty()) {
      return null; // not enough information
    } else if (LONG.matcher(example).matches()) {
      return Schema.Type.LONG;
    } else if (DOUBLE.matcher(example).matches()) {
      return Schema.Type.DOUBLE;
    } else if (FLOAT.matcher(example).matches()) {
      return Schema.Type.FLOAT;
    }
    return Schema.Type.STRING;
  }


  private static void checkHeader(String[] header) {
    Preconditions.checkNotNull(header, "No header content");
    for (int i = 0; i < header.length; i += 1) {
      if (header[i] == null) {
        throw new RuntimeException("Bad header for field " + i + ": null");
      } else if (header[i].trim().isEmpty()) {
        throw new RuntimeException(
            "Bad header for field " + i + ": \"" + header[i] + "\"");
      }
    }
  }
}
Source Code of org.kitesdk.data.spi.filesystem.CSVUtil

Related Classes of org.kitesdk.data.spi.filesystem.CSVUtil