Package eu.stratosphere.api.java.record.io.avro

Source Code of eu.stratosphere.api.java.record.io.avro.AvroRecordInputFormat$IntMapValue

/***********************************************************************************************************************
*
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*
**********************************************************************************************************************/

package eu.stratosphere.api.java.record.io.avro;

import java.io.IOException;
import java.util.List;
import java.util.Map;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.FileReader;
import org.apache.avro.file.SeekableInput;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import eu.stratosphere.api.avro.FSDataInputStreamWrapper;
import eu.stratosphere.api.java.record.io.FileInputFormat;
import eu.stratosphere.core.fs.FileInputSplit;
import eu.stratosphere.core.fs.FileStatus;
import eu.stratosphere.core.fs.FileSystem;
import eu.stratosphere.core.fs.Path;
import eu.stratosphere.types.BooleanValue;
import eu.stratosphere.types.DoubleValue;
import eu.stratosphere.types.FloatValue;
import eu.stratosphere.types.IntValue;
import eu.stratosphere.types.ListValue;
import eu.stratosphere.types.LongValue;
import eu.stratosphere.types.MapValue;
import eu.stratosphere.types.NullValue;
import eu.stratosphere.types.Record;
import eu.stratosphere.types.StringValue;
import eu.stratosphere.types.Value;

/**
* Input format to read Avro files.
*
* The input format currently supports only flat Avro schemas, so there is no
* support for nested records. Besides primitive fields it handles nullable
* primitive fields, e.g. ["null", "string"], as well as enums and arrays and
* maps whose elements are primitives (see
* http://avro.apache.org/docs/current/spec.html#schema_complex).
*
*/
public class AvroRecordInputFormat extends FileInputFormat {
  // Serialization version identifier of this input format.
  private static final long serialVersionUID = 1L;

  // Logger used by this input format (currently only when a split is opened).
  private static final Log LOG = LogFactory.getLog(AvroRecordInputFormat.class);

  // Avro reader over the file of the current split; assigned in open().
  private FileReader<GenericRecord> dataFileReader;
  // Avro record instance that is handed back to the reader on every nextRecord() call.
  private GenericRecord reuseAvroRecord = null;

  /**
   * Opens the given split and prepares an Avro reader for it.
   * <p>
   * After the superclass has opened the split, the {@code stream} field (not declared in this
   * class) is wrapped as an Avro {@link SeekableInput}, a generic-record reader is opened on top
   * of it, and the reader is synced relative to the start position of the split.
   *
   * @param split the file split to read
   * @throws IOException if opening the split or the Avro reader fails
   */
  @Override
  public void open(FileInputSplit split) throws IOException {
    // Must run first: the stream wrapped below is only used after the superclass has opened the split.
    super.open(split);
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
    // NOTE(review): the split length is narrowed to int here; presumably getLength() returns a
    // long, in which case splits of 2 GiB or more would overflow -- confirm against the wrapper.
    SeekableInput in = new FSDataInputStreamWrapper(stream, (int) split.getLength());
    LOG.info("Opening split " + split);
    dataFileReader = DataFileReader.openReader(in, datumReader);
    // Position the reader using the split's start offset.
    dataFileReader.sync(split.getStart());
  }

  @Override
  public boolean reachedEnd() throws IOException {
    return !dataFileReader.hasNext();
  }

  @Override
  public Record nextRecord(Record record) throws IOException {
    if (!dataFileReader.hasNext()) {
      return null;
    }
    if (record == null) {
      throw new IllegalArgumentException("Empty PactRecord given");
    }
    reuseAvroRecord = dataFileReader.next(reuseAvroRecord);
    final List<Field> fields = reuseAvroRecord.getSchema().getFields();
    for (Field field : fields) {
      final Value value = convertAvroToPactValue(field, reuseAvroRecord.get(field.pos()));
      record.setField(field.pos(), value);
      record.updateBinaryRepresenation();
    }

    return record;
  }


  @SuppressWarnings("unchecked")
  private final Value convertAvroToPactValue(final Field field, final Object avroRecord) {
    if (avroRecord == null) {
      return null;
    }
    final Type type = checkTypeConstraintsAndGetType(field.schema());

    // check for complex types
    // (complex type FIXED is not yet supported)
    switch (type) {
      case ARRAY:
        final Type elementType = field.schema().getElementType().getType();
        final List<?> avroList = (List<?>) avroRecord;
        return convertAvroArrayToListValue(elementType, avroList);
      case ENUM:
        final List<String> symbols = field.schema().getEnumSymbols();
        final String avroRecordString = avroRecord.toString();
        if (!symbols.contains(avroRecordString)) {
          throw new RuntimeException("The given Avro file contains field with a invalid enum symbol");
        }
        sString.setValue(avroRecordString);
        return sString;
      case MAP:
        final Type valueType = field.schema().getValueType().getType();
        final Map<CharSequence, ?> avroMap = (Map<CharSequence, ?>) avroRecord;
        return convertAvroMapToMapValue(valueType, avroMap);
 
      // primitive type
      default:
        return convertAvroPrimitiveToValue(type, avroRecord);

    }
  }

  private final ListValue<?> convertAvroArrayToListValue(Type elementType, List<?> avroList) {
    switch (elementType) {
    case STRING:
      StringListValue sl = new StringListValue();
      for (Object item : avroList) {
        sl.add(new StringValue((CharSequence) item));
      }
      return sl;
    case INT:
      IntListValue il = new IntListValue();
      for (Object item : avroList) {
        il.add(new IntValue((Integer) item));
      }
      return il;
    case BOOLEAN:
      BooleanListValue bl = new BooleanListValue();
      for (Object item : avroList) {
        bl.add(new BooleanValue((Boolean) item));
      }
      return bl;
    case DOUBLE:
      DoubleListValue dl = new DoubleListValue();
      for (Object item : avroList) {
        dl.add(new DoubleValue((Double) item));
      }
      return dl;
    case FLOAT:
      FloatListValue fl = new FloatListValue();
      for (Object item : avroList) {
        fl.add(new FloatValue((Float) item));
      }
      return fl;
    case LONG:
      LongListValue ll = new LongListValue();
      for (Object item : avroList) {
        ll.add(new LongValue((Long) item));
      }
      return ll;
    default:
      throw new RuntimeException("Elements of type " + elementType + " are not supported for Avro arrays.");
    }
  }

  private final MapValue<StringValue, ?> convertAvroMapToMapValue(Type mapValueType, Map<CharSequence, ?> avroMap) {
    switch (mapValueType) {
    case STRING:
      StringMapValue sm = new StringMapValue();
      for (Map.Entry<CharSequence, ?> entry : avroMap.entrySet()) {
        sm.put(new StringValue((CharSequence) entry.getKey()), new StringValue((String) entry.getValue()));
      }
      return sm;
    case INT:
      IntMapValue im = new IntMapValue();
      for (Map.Entry<CharSequence, ?> entry : avroMap.entrySet()) {
        im.put(new StringValue((CharSequence) entry.getKey()), new IntValue((Integer) entry.getValue()));
      }
      return im;
    case BOOLEAN:
      BooleanMapValue bm = new BooleanMapValue();
      for (Map.Entry<CharSequence, ?> entry : avroMap.entrySet()) {
        bm.put(new StringValue((CharSequence) entry.getKey()), new BooleanValue((Boolean) entry.getValue()));
      }
      return bm;
    case DOUBLE:
      DoubleMapValue dm = new DoubleMapValue();
      for (Map.Entry<CharSequence, ?> entry : avroMap.entrySet()) {
        dm.put(new StringValue((CharSequence) entry.getKey()), new DoubleValue((Double) entry.getValue()));
      }
      return dm;
    case FLOAT:
      FloatMapValue fm = new FloatMapValue();
      for (Map.Entry<CharSequence, ?> entry : avroMap.entrySet()) {
        fm.put(new StringValue((CharSequence) entry.getKey()), new FloatValue((Float) entry.getValue()));
      }
      return fm;
    case LONG:
      LongMapValue lm = new LongMapValue();
      for (Map.Entry<CharSequence, ?> entry : avroMap.entrySet()) {
        lm.put(new StringValue((CharSequence) entry.getKey()), new LongValue((Long) entry.getValue()));
      }
      return lm;

    default:
      throw new RuntimeException("Map values of type " + mapValueType + " are not supported for Avro map.");
    }
  }

  // Reusable value holders, one per supported primitive type. The conversion code overwrites and
  // returns these same instances on every call instead of allocating a new value object.
  private StringValue sString = new StringValue();
  private IntValue sInt = new IntValue();
  private BooleanValue sBool = new BooleanValue();
  private DoubleValue sDouble = new DoubleValue();
  private FloatValue sFloat = new FloatValue();
  private LongValue sLong = new LongValue();
 
  private final Value convertAvroPrimitiveToValue(Type type, Object avroRecord) {
    switch (type) {
    case STRING:
      sString.setValue((CharSequence) avroRecord);
      return sString;
    case INT:
      sInt.setValue((Integer) avroRecord);
      return sInt;
    case BOOLEAN:
      sBool.setValue((Boolean) avroRecord);
      return sBool;
    case DOUBLE:
      sDouble.setValue((Double) avroRecord);
      return sDouble;
    case FLOAT:
      sFloat.setValue((Float) avroRecord);
      return sFloat;
    case LONG:
      sLong.setValue((Long) avroRecord);
      return sLong;
    case NULL:
      return NullValue.getInstance();
    default:
      throw new RuntimeException(
          "Type "
              + type
              + " for AvroInputFormat is not implemented. Open an issue on GitHub.");
    }
  }

  private final Type checkTypeConstraintsAndGetType(final Schema schema) {
    final Type type = schema.getType();
    if (type == Type.RECORD) {
      throw new RuntimeException("The given Avro file contains complex data types which are not supported right now");
    }

    if (type == Type.UNION) {
      List<Schema> types = schema.getTypes();
      if (types.size() > 2) {
        throw new RuntimeException("The given Avro file contains a union that has more than two elements");
      }
      if (types.size() == 1 && types.get(0).getType() != Type.UNION) {
        return types.get(0).getType();
      }
      if (types.get(0).getType() == Type.UNION || types.get(1).getType() == Type.UNION) {
        throw new RuntimeException("The given Avro file contains a nested union");
      }
      if (types.get(0).getType() == Type.NULL) {
        return types.get(1).getType();
      } else {
        if (types.get(1).getType() != Type.NULL) {
          throw new RuntimeException("The given Avro file is contains a union with two non-null types.");
        }
        return types.get(0).getType();
      }
    }
    return type;
  }

  /**
   * Set minNumSplits to number of files.
   */
  @Override
  public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    int numAvroFiles = 0;
    final Path path = this.filePath;
    // get all the files that are involved in the splits
    final FileSystem fs = path.getFileSystem();
    final FileStatus pathFile = fs.getFileStatus(path);

    if (!acceptFile(pathFile)) {
      throw new IOException("The given file does not pass the file-filter");
    }
    if (pathFile.isDir()) {
      // input is directory. list all contained files
      final FileStatus[] dir = fs.listStatus(path);
      for (int i = 0; i < dir.length; i++) {
        if (!dir[i].isDir() && acceptFile(dir[i])) {
          numAvroFiles++;
        }
      }
    } else {
      numAvroFiles = 1;
    }
    return super.createInputSplits(numAvroFiles);
  }

  // --------------------------------------------------------------------------------------------
  // Concrete subclasses of ListValue and MapValue for all possible primitive types
  // --------------------------------------------------------------------------------------------

  /** Concrete {@link ListValue} whose elements are {@link StringValue}s. */
  public static class StringListValue extends ListValue<StringValue> {
    private static final long serialVersionUID = 1L;
  }

  /** Concrete {@link ListValue} whose elements are {@link IntValue}s. */
  public static class IntListValue extends ListValue<IntValue> {
    private static final long serialVersionUID = 1L;
  }

  /** Concrete {@link ListValue} whose elements are {@link BooleanValue}s. */
  public static class BooleanListValue extends ListValue<BooleanValue> {
    private static final long serialVersionUID = 1L;
  }

  /** Concrete {@link ListValue} whose elements are {@link DoubleValue}s. */
  public static class DoubleListValue extends ListValue<DoubleValue> {
    private static final long serialVersionUID = 1L;
  }

  /** Concrete {@link ListValue} whose elements are {@link FloatValue}s. */
  public static class FloatListValue extends ListValue<FloatValue> {
    private static final long serialVersionUID = 1L;
  }

  /** Concrete {@link ListValue} whose elements are {@link LongValue}s. */
  public static class LongListValue extends ListValue<LongValue> {
    private static final long serialVersionUID = 1L;
  }

  /** Concrete {@link MapValue} from {@link StringValue} keys to {@link StringValue} values. */
  public static class StringMapValue extends MapValue<StringValue, StringValue> {
    private static final long serialVersionUID = 1L;
  }

  /** Concrete {@link MapValue} from {@link StringValue} keys to {@link IntValue} values. */
  public static class IntMapValue extends MapValue<StringValue, IntValue> {
    private static final long serialVersionUID = 1L;
  }

  /** Concrete {@link MapValue} from {@link StringValue} keys to {@link BooleanValue} values. */
  public static class BooleanMapValue extends MapValue<StringValue, BooleanValue> {
    private static final long serialVersionUID = 1L;
  }

  /** Concrete {@link MapValue} from {@link StringValue} keys to {@link DoubleValue} values. */
  public static class DoubleMapValue extends MapValue<StringValue, DoubleValue> {
    private static final long serialVersionUID = 1L;
  }

  /** Concrete {@link MapValue} from {@link StringValue} keys to {@link FloatValue} values. */
  public static class FloatMapValue extends MapValue<StringValue, FloatValue> {
    private static final long serialVersionUID = 1L;
  }

  /** Concrete {@link MapValue} from {@link StringValue} keys to {@link LongValue} values. */
  public static class LongMapValue extends MapValue<StringValue, LongValue> {
    private static final long serialVersionUID = 1L;
  }

}
TOP

Related Classes of eu.stratosphere.api.java.record.io.avro.AvroRecordInputFormat$IntMapValue

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.