Package com.cloudera.cdk.morphline.avro

Source Code of com.cloudera.cdk.morphline.avro.ToAvroBuilder

/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.morphline.avro;

import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;

import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Parser;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.reflect.ReflectData;

import com.cloudera.cdk.morphline.api.Command;
import com.cloudera.cdk.morphline.api.CommandBuilder;
import com.cloudera.cdk.morphline.api.MorphlineCompilationException;
import com.cloudera.cdk.morphline.api.MorphlineContext;
import com.cloudera.cdk.morphline.api.MorphlineRuntimeException;
import com.cloudera.cdk.morphline.api.Record;
import com.cloudera.cdk.morphline.base.AbstractCommand;
import com.cloudera.cdk.morphline.base.Configs;
import com.cloudera.cdk.morphline.base.Fields;
import com.cloudera.cdk.morphline.stdio.AbstractParser;
import com.google.common.base.Preconditions;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;


/**
* Command that converts a morphline record to an Avro record.
*
* @since 0.9.0
*/
public final class ToAvroBuilder implements CommandBuilder {
 
  @Override
  public Collection<String> getNames() {
    return Collections.singletonList("toAvro");
  }
 
  @Override
  public Command build(Config config, Command parent, Command child, MorphlineContext context) {
    return new ToAvro(this, config, parent, child, context);
  }
 
 
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  private static final class ToAvro extends AbstractCommand {
   
    private final Map<String, String> mappings = new HashMap();
    private final Schema fixedSchema;
    private final String schemaField;
   
    // more efficient than raising & catching exceptions
    private static final Object ERROR = new Object();
   
    public ToAvro(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
      super(builder, config, parent, child, context);
     
      String schemaFile = getConfigs().getString(config, "schemaFile", null);
      String schemaString = getConfigs().getString(config, "schemaString", null);
      this.schemaField = getConfigs().getString(config, "schemaField", null);
     
      int numDefinitions = 0;
      if (schemaFile != null) {
        numDefinitions++;
      }
      if (schemaString != null) {
        numDefinitions++;
      }
      if (schemaField != null) {
        numDefinitions++;
      }
      if (numDefinitions == 0) {
        throw new MorphlineCompilationException(
          "Either schemaFile or schemaString or schemaField must be defined", config);
      }
      if (numDefinitions > 1) {
        throw new MorphlineCompilationException(
          "Must define only one of schemaFile or schemaString or schemaField at the same time", config);
      }

      if (schemaString != null) {
        this.fixedSchema = new Parser().parse(schemaString);
      } else if (schemaFile != null) {
        try {
          this.fixedSchema = new Parser().parse(new File(schemaFile));
        } catch (IOException e) {
          throw new MorphlineCompilationException(
            "Cannot parse external Avro schema file: " + schemaFile, config, e);
        }
      } else {
        this.fixedSchema = null;
      }
     
      Config mappingsConfig = getConfigs().getConfig(config, "mappings", ConfigFactory.empty());
      for (Map.Entry<String, Object> entry : new Configs().getEntrySet(mappingsConfig)) {
        mappings.put(entry.getKey(), entry.getValue().toString());
      }
      validateArguments();
    }
   
    @Override
    protected boolean doProcess(Record inputRecord) {
      Schema schema;
      if (schemaField != null) {
        schema = (Schema) inputRecord.getFirstValue(schemaField);
        Preconditions.checkNotNull(schema);
      } else {
        schema = fixedSchema;
      }
     
      Record outputRecord = inputRecord.copy();
      AbstractParser.removeAttachments(outputRecord);
      IndexedRecord avroRecord = new GenericData.Record(schema);
     
      for (Field field : schema.getFields()) {
        String morphlineFieldName = mappings.get(field.name());
        if (morphlineFieldName == null) {
          morphlineFieldName = field.name();
        }
        List list = inputRecord.get(morphlineFieldName);
       
        Object avroResult = ERROR;
        if (field.schema().getType() == Schema.Type.ARRAY) {
          avroResult = toAvro(list, field);
        } else if (list.size() == 0) {
          try { // this will fail if there is no default value
            avroResult = ReflectData.get().getDefaultValue(field);
          } catch (AvroRuntimeException e) {
            avroResult = ERROR;
          }
        } else if (list.size() == 1) {
          avroResult = toAvro(list.get(0), field);
        }
       
        if (avroResult == ERROR) {
          LOG.debug("Cannot convert item: {} to schema: {}", list, schema);
          return false;         
        }
        avroRecord.put(field.pos(), avroResult);
      }

      outputRecord.put(Fields.ATTACHMENT_BODY, avroRecord);
       
      // pass record to next command in chain:
      return super.doProcess(outputRecord);
    }
 
    /* returns true if schema allows the value to be null, false otherwise */
    private static boolean nullOk(Schema schema) {
      if (Schema.Type.NULL == schema.getType()) {
        return true;
      } else if (Schema.Type.UNION == schema.getType()) {
        for (Schema candidate : schema.getTypes()) {
          if (nullOk(candidate)) {
            return true;
          }
        }
      }
      return false;
    }
   
    private Object toAvro(Object item, Field field) {
      if (item == null && !nullOk(field.schema())) {
        try { // this will fail if there is no default value
          return ReflectData.get().getDefaultValue(field);
        } catch (AvroRuntimeException e) {
          return ERROR;
        }
      }
      Object result = toAvro(item, field.schema());
      return result;
    }
   
    private Object toAvro(Object item, Schema schema) {
      // RECORD, ENUM, ARRAY, MAP, UNION, FIXED, STRING, BYTES, INT, LONG, FLOAT,
      // DOUBLE, BOOLEAN, NULL
      switch (schema.getType()) {
        case RECORD:
          if (item instanceof Map) {
            Map<String,Object> map = (Map) item;
            IndexedRecord record = new GenericData.Record(schema);
            for (Field field : schema.getFields()) {
              Object value = map.get(field.name());
              Object result = toAvro(value, field);
              if (result == ERROR) {
                return ERROR;
              }
              record.put(field.pos(), result);
            }
            return record;
          }
          return ERROR;
        case ENUM:
          if (schema.hasEnumSymbol(item.toString())) {
            return item.toString();
          }
          return ERROR;
        case ARRAY:
          if (item instanceof List) {
            ListIterator iter = ((List)item).listIterator();
            while (iter.hasNext()) {
              Object result = toAvro(iter.next(), schema.getElementType());
              if (result == ERROR) {
                return ERROR;
              }
              iter.set(result);
            }
            return item;
          }
          return ERROR;
        case MAP:
          if (item instanceof Map) {
            Map<String,Object> map = (Map) item;
            for (Map.Entry entry : map.entrySet()) {
              if (!(entry.getKey() instanceof CharSequence)) {
                return ERROR; // Avro requires that map keys are CharSequences
              }
              Object result = toAvro(entry.getValue(), schema.getValueType());
              if (result == ERROR) {
                return ERROR;
              }
              entry.setValue(result);
            }
            return item;
          }
          return ERROR;
        case UNION:
          return toAvroUnion(item, schema);
        case FIXED:
          if (item instanceof byte[]) {
            return new GenericData.Fixed(schema, (byte[])item);
          }         
          return ERROR;
        case STRING:
          assert item != null;
          return item.toString();
        case BYTES:
          if (item instanceof ByteBuffer) {
            return item;
          }
          if (item instanceof byte[]) {
            return ByteBuffer.wrap((byte[])item);
         
          return ERROR;
        case INT:
          if (item instanceof Integer) {
            return item;
          }
          if (item instanceof Number) {
            return ((Number) item).intValue();
          }
          try {
            return Integer.valueOf(item.toString());
          } catch (NumberFormatException e) {
            return ERROR;
          }
        case LONG:
          if (item instanceof Long) {
            return item;
          }
          if (item instanceof Number) {
            return ((Number) item).longValue();
          }
          try {
            return Long.valueOf(item.toString());
          } catch (NumberFormatException e) {
            return ERROR;
          }
        case FLOAT:
          if (item instanceof Float) {
            return item;
          }
          if (item instanceof Number) {
            return ((Number) item).floatValue();
          }
          try {
            return Float.valueOf(item.toString());
          } catch (NumberFormatException e) {
            return ERROR;
          }
        case DOUBLE:
          if (item instanceof Double) {
            return item;
          }
          if (item instanceof Number) {
            return ((Number) item).doubleValue();
          }
          try {
            return Double.valueOf(item.toString());
          } catch (NumberFormatException e) {
            return ERROR;
          }
        case BOOLEAN:
          if (item instanceof Boolean) {
            return item;
          }
          assert item != null;
          String str = item.toString();
          if ("true".equals(str)) {
            return Boolean.TRUE;
          }
          if ("false".equals(str)) {
            return Boolean.FALSE;
          }
          return ERROR;
        case NULL:
          if (item == null) {
            return null;
          }
          return ERROR;
        default:
          throw new MorphlineRuntimeException("Unknown Avro schema type: " + schema.getType());
      }
    }

    private Object toAvroUnion(Object item, Schema schema) {
      assert schema.getType() == Schema.Type.UNION;
      List<Schema> types = schema.getTypes();
      int index = -1;
      if (item instanceof Map) {
        // a map can be converted both into an avro record or an avro map.
        // so there's some ambiguity - we choose which one applies based on specified order.
        for (int j = 0; j < types.size(); j++) {
          Schema.Type t = types.get(j).getType();
          if (t == Schema.Type.RECORD || t == Schema.Type.MAP) {
            index = j;
            break;
          }
        }
      } else {
        try {
          // check if there's a perfect fit for a mapping
          index = GenericData.get().resolveUnion(schema, item); // TODO: optimize
        } catch (AvroRuntimeException e) {
          ; // proceed to find first fit based on specified order (see below)
          // LOG.trace("Cannot find perfect fit for item: {} to union schema: {}", item, schema);
        }
      }
     
      if (index >= 0) { // found perfect fit
        Schema candidate = types.get(index);
        Object result = toAvro(item, candidate);
        return result;
      } else { // find first fit based on specified order
        for (Schema candidate : types) {           
          Object result = toAvro(item, candidate);
          if (result != ERROR) {
            return result;
          }
        }
        return ERROR;
      }
    }
   
  }
   
}
TOP

Related Classes of com.cloudera.cdk.morphline.avro.ToAvroBuilder

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.