Package org.apache.trevni.avro

Source Code of org.apache.trevni.avro.AvroColumnReader$Params

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.trevni.avro;

import java.io.IOException;
import java.io.Closeable;
import java.io.File;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.Map;
import java.util.HashMap;
import java.util.List;

import org.apache.trevni.ColumnMetaData;
import org.apache.trevni.ColumnFileReader;
import org.apache.trevni.ColumnValues;
import org.apache.trevni.Input;
import org.apache.trevni.InputFile;
import org.apache.trevni.TrevniRuntimeException;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.generic.GenericData;

import static org.apache.trevni.avro.AvroColumnator.isSimple;

/** Read files written with {@link AvroColumnWriter}.  A subset of the schema
* used for writing may be specified when reading.  In this case only columns
* of the subset schema are read. */
public class AvroColumnReader<D>
  implements Iterator<D>, Iterable<D>, Closeable {

  private ColumnFileReader reader;
  private GenericData model;
  private Schema fileSchema;
  private Schema readSchema;
 
  private ColumnValues[] values;
  private int[] arrayWidths;
  private int column;                          // current index in values

  private Map<String,Map<String,Object>> defaults =
    new HashMap<String,Map<String,Object>>();

  /** Parameters for reading an Avro column file. */
  public static class Params {
    Input input;
    Schema schema;
    GenericData model = GenericData.get();

    /** Construct reading from a file. */
    public Params(File file) throws IOException {
      this(new InputFile(file));
    }

    /** Construct reading from input. */
    public Params(Input input) { this.input = input; }

    /** Set subset schema to project data down to. */
    public Params setSchema(Schema schema) {
      this.schema = schema;
      return this;
    }

    /** Set data representation. */
    public Params setModel(GenericData model) {
      this.model = model;
      return this;
    }
  }

  /** Construct a reader for a file. */
  public AvroColumnReader(Params params)
    throws IOException {
    this.reader = new ColumnFileReader(params.input);
    this.model = params.model;
    this.fileSchema =
      Schema.parse(reader.getMetaData().getString(AvroColumnWriter.SCHEMA_KEY));
    this.readSchema = params.schema == null ? fileSchema : params.schema;
    initialize();
  }

  /** Return the schema for data in this file. */
  public Schema getFileSchema() { return fileSchema; }

  void initialize() throws IOException {
    // compute a mapping from column name to number for file
    Map<String,Integer> fileColumnNumbers = new HashMap<String,Integer>();
    int i = 0;
    for (ColumnMetaData c : new AvroColumnator(fileSchema).getColumns())
      fileColumnNumbers.put(c.getName(), i++);

    // create iterator for each column in readSchema
    AvroColumnator readColumnator = new AvroColumnator(readSchema);
    this.arrayWidths = readColumnator.getArrayWidths();
    ColumnMetaData[] readColumns = readColumnator.getColumns();
    this.values = new ColumnValues[readColumns.length];
    int j = 0;
    for (ColumnMetaData c : readColumns) {
      Integer n = fileColumnNumbers.get(c.getName());
      if (n != null)
        values[j++] = reader.getValues(n);
    }
    findDefaults(readSchema, fileSchema);
  }

  // get defaults for fields in read that are not in write
  private void findDefaults(Schema read, Schema write) {
    switch (read.getType()) {
    case NULL: case BOOLEAN:
    case INT: case LONG:
    case FLOAT: case DOUBLE:
    case BYTES: case STRING:
    case ENUM: case FIXED:
      if (read.getType() != write.getType())
        throw new TrevniRuntimeException("Type mismatch: "+read+" & "+write);
      break;
    case MAP:
      findDefaults(read.getValueType(), write.getValueType());
      break;
    case ARRAY:
      findDefaults(read.getElementType(), write.getElementType());
      break;
    case UNION:
      for (Schema s : read.getTypes()) {
        Integer index = write.getIndexNamed(s.getFullName());
        if (index == null)
          throw new TrevniRuntimeException("No matching branch: "+s);
        findDefaults(s, write.getTypes().get(index));
      }
      break;
    case RECORD:
      for (Field f : read.getFields()) {
        Field g = write.getField(f.name());
        if (g == null)
          setDefault(read, f);
        else
          findDefaults(f.schema(), g.schema());
      }
      break;
    default:
      throw new TrevniRuntimeException("Unknown schema: "+read);
    }
  }

  private void setDefault(Schema record, Field f) {
    String recordName = record.getFullName();
    Map<String,Object> recordDefaults = defaults.get(recordName);
    if (recordDefaults == null) {
      recordDefaults = new HashMap<String,Object>();
      defaults.put(recordName, recordDefaults);
    }
    recordDefaults.put(f.name(), model.getDefaultValue(f));
  }

  @Override
  public Iterator<D> iterator() { return this; }

  @Override
  public boolean hasNext() {
    return values[0].hasNext();
  }

  /** Return the number of rows in this file. */
  public long getRowCount() { return reader.getRowCount(); }

  @Override
  public D next() {
    try {
      for (int i = 0; i < values.length; i++)
        if (values[i] != null)
          values[i].startRow();
      this.column = 0;
      return (D)read(readSchema);
    } catch (IOException e) {
      throw new TrevniRuntimeException(e);
    }
  }

  private Object read(Schema s) throws IOException {
    if (isSimple(s))
      return nextValue(s, column++);

    final int startColumn = column;

    switch (s.getType()) {
    case MAP:
      int size = values[column].nextLength();
      Map map = (Map)new HashMap(size);
      for (int i = 0; i < size; i++) {
        this.column = startColumn;
        values[column++].nextValue();                      // null in parent
        String key = (String)values[column++].nextValue(); // key
        map.put(key, read(s.getValueType()));              // value
      }
      column = startColumn + arrayWidths[startColumn];
      return map;
    case RECORD:
      Object record = model.newRecord(null, s);
      Map<String,Object> rDefaults = defaults.get(s.getFullName());
      for (Field f : s.getFields()) {
        Object value = ((rDefaults != null) && rDefaults.containsKey(f.name()))
          ? model.deepCopy(f.schema(), rDefaults.get(f.name()))
          : read(f.schema());
        model.setField(record, f.name(), f.pos(), value);
      }
      return record;
    case ARRAY:
      int length = values[column].nextLength();
      List elements = (List)new GenericData.Array(length, s);
      for (int i = 0; i < length; i++) {
        this.column = startColumn;
        Object value = nextValue(s, column++);
        if (!isSimple(s.getElementType()))
          value = read(s.getElementType());
        elements.add(value);
      }
      column = startColumn + arrayWidths[startColumn];
      return elements;
    case UNION:
      Object value = null;
      for (Schema branch : s.getTypes()) {
        if (branch.getType() == Schema.Type.NULL) continue;
        if (values[column].nextLength() == 1) {
          value = nextValue(branch, column);
          column++;
          if (!isSimple(branch))
            value = read(branch);
        } else {
          column += arrayWidths[column];
        }
      }
      return value;
    default:
      throw new TrevniRuntimeException("Unknown schema: "+s);
    }
  }

  private Object nextValue(Schema s, int column) throws IOException {
    Object v = values[column].nextValue();
   
    switch (s.getType()) {
    case ENUM:
      return model.createEnum(s.getEnumSymbols().get((Integer)v), s);
    case FIXED:
      return model.createFixed(null, ((ByteBuffer)v).array(), s);
    }

    return v;
  }

  @Override
  public void remove() { throw new UnsupportedOperationException(); }

  @Override
  public void close() throws IOException {
    reader.close();
  }

}
TOP

Related Classes of org.apache.trevni.avro.AvroColumnReader$Params

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.