/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.data.spi.filesystem;

import au.com.bytecode.opencsv.CSVWriter;
import com.google.common.base.Preconditions;
import java.io.IOException;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.reflect.ReflectData;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.kitesdk.compat.Hadoop;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetReaderException;
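
/**
 * A {@link FileSystemWriter.FileAppender} implementation that writes entities
 * to a file as CSV rows, one line per appended entity. Only flat record
 * schemas are supported; nested types (fixed, bytes, map, array, record) are
 * rejected when a value is serialized.
 *
 * A minimal usage sketch (the FileSystem, Path, and DatasetDescriptor setup
 * is assumed to happen elsewhere):
 * <pre>
 *   CSVAppender&lt;GenericRecord&gt; appender =
 *       new CSVAppender&lt;GenericRecord&gt;(fs, path, descriptor);
 *   appender.open();
 *   appender.append(record);
 *   appender.close();
 * </pre>
 */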
class CSVAppender<E> implements FileSystemWriter.FileAppender<E> {
private final CSVProperties props;
private final FileSystem fs;
private final Path path;
private final Schema schema;
  // the underlying output stream and the CSV writer that wraps it
private FSDataOutputStream outgoing;
private CSVWriter writer;
public CSVAppender(FileSystem fs, Path path, DatasetDescriptor descriptor) {
this.fs = fs;
this.path = path;
this.schema = descriptor.getSchema();
    Preconditions.checkState(schema.getType() == Schema.Type.RECORD,
        "Unsupported schema (not a record): %s", schema);
this.props = CSVProperties.fromDescriptor(descriptor);
}
@Override
public void open() throws IOException {
this.outgoing = fs.create(path, true /* overwrite */);
this.writer = CSVUtil.newWriter(outgoing, props);
}
@Override
public void append(E entity) throws IOException {
writer.writeNext(shred(entity));
}
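  /**
   * Closes the CSV writer, flushing any buffered rows, and then closes the
   * underlying output stream.
   */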
@Override
public void close() throws IOException {
writer.close();
outgoing.close();
}
@Override
public void cleanup() throws IOException {
// No cleanup tasks needed
}
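  /**
   * Flushes buffered rows to the output stream and calls hflush, which makes
   * the data visible to new readers but does not guarantee it has been
   * persisted to disk (see {@link #sync()}).
   */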
@Override
public void flush() throws IOException {
writer.flush();
Hadoop.FSDataOutputStream.hflush.invoke(outgoing);
}
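  /**
   * Flushes buffered rows, then calls hsync to ask the file system to persist
   * the data to durable storage, a stronger guarantee than {@link #flush()}.
   */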
@Override
public void sync() throws IOException {
flush();
Hadoop.FSDataOutputStream.hsync.invoke(outgoing);
}
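  // Converts an entity into a CSV row: Avro IndexedRecords are read by field
  // position, any other object through Avro reflection.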
private String[] shred(E entity) {
if (entity instanceof IndexedRecord) {
return shredIndexed((IndexedRecord) entity, schema);
} else {
return shredReflect(entity, schema);
}
}
private static String[] shredIndexed(IndexedRecord entity, Schema schema) {
List<Schema.Field> fields = schema.getFields();
String[] values = new String[fields.size()];
for (int i = 0; i < values.length; i += 1) {
values[i] = valueString(entity.get(i), fields.get(i).schema());
}
return values;
}
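  // Uses Avro ReflectData to read each field from the object by name and
  // serializes it according to the field's schema.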
private static <E> String[] shredReflect(E entity, Schema schema) {
ReflectData reflect = ReflectData.get();
List<Schema.Field> fields = schema.getFields();
String[] values = new String[fields.size()];
for (int i = 0; i < values.length; i += 1) {
Schema.Field field = fields.get(i);
values[i] = valueString(
reflect.getField(entity, field.name(), i), field.schema());
}
return values;
}
  /**
   * Returns the value serialized as a String for the first matching schema
   * type, or null.
   *
   * Note that the returned value may be null even if the schema does not
   * allow the value to be null, because nulls are passed through without
   * validation.
   *
   * @param value a value
   * @param schema a Schema
   * @return a String representation of the value according to the Schema type
   */
private static String valueString(Object value, Schema schema) {
if (value == null || schema.getType() == Schema.Type.NULL) {
return null;
}
switch (schema.getType()) {
case BOOLEAN:
case FLOAT:
case DOUBLE:
case INT:
case LONG:
case STRING:
return value.toString();
case ENUM:
// serialize as the ordinal from the schema
return String.valueOf(schema.getEnumOrdinal(value.toString()));
case UNION:
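        // resolve which union branch matches the runtime value, then
        // serialize with that branch's schema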
int index = ReflectData.get().resolveUnion(schema, value);
return valueString(value, schema.getTypes().get(index));
default:
// FIXED, BYTES, MAP, ARRAY, RECORD are not supported
        throw new DatasetReaderException(
            "Unsupported field type: " + schema.getType());
}
}
}