// Source code of org.kiji.schema.impl.AvroCellEncoder$FinalSchemaEncoder

/**
 * (c) Copyright 2012 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.kiji.schema.impl;


import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;


import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.Collections2;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericContainer;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.Encoder;
import org.apache.avro.io.EncoderFactory;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import org.kiji.annotations.ApiAudience;
import org.kiji.schema.DecodedCell;
import org.kiji.schema.InternalKijiError;
import org.kiji.schema.Kiji;
import org.kiji.schema.KijiCellEncoder;
import org.kiji.schema.KijiColumnName;
import org.kiji.schema.KijiEncodingException;
import org.kiji.schema.KijiIOException;
import org.kiji.schema.KijiURI;
import org.kiji.schema.avro.AvroSchema;
import org.kiji.schema.avro.AvroValidationPolicy;
import org.kiji.schema.avro.SchemaStorage;
import org.kiji.schema.avro.TableLayoutDesc;
import org.kiji.schema.impl.hbase.HBaseKiji;
import org.kiji.schema.impl.hbase.HBaseTableLayoutUpdater;
import org.kiji.schema.layout.AvroSchemaResolver;
import org.kiji.schema.layout.CellSpec;
import org.kiji.schema.layout.InvalidLayoutException;
import org.kiji.schema.layout.KijiTableLayout;
import org.kiji.schema.layout.SchemaTableAvroResolver;
import org.kiji.schema.layout.TableLayoutBuilder;
import org.kiji.schema.util.ByteStreamArray;
import org.kiji.schema.util.BytesKey;


/**
 * Serializes Avro cells to bytes for persistence in HBase.
 *
 * <p>
 *   An Avro cell encoder is specific to one column in a KijiTable.
 *   Depending on the column specification, Avro cells embed the writer schema or not.
 *   When embedded, the Avro schema ID/hash is prepended to the encoded value.
 * </p>
 */
@ApiAudience.Private
public final class AvroCellEncoder implements KijiCellEncoder {
  /**
   * Name of the system property to control schema validation.
   *
   * <p>
   *   When this property is set, its value is parsed with
   *   <code>AvroValidationPolicy.valueOf()</code> and takes precedence over the validation
   *   policy declared in the cell schema (see <code>getAvroValidationPolicy()</code>).
   * </p>
   */
  public static final String SCHEMA_VALIDATION_POLICY =
      "org.kiji.schema.impl.AvroCellEncoder.SCHEMA_VALIDATION_POLICY";


  /** Logger for this class. */
  private static final Logger LOG = LoggerFactory.getLogger(AvroCellEncoder.class);


  /** Mapping from class names of Avro primitives to their corresponding Avro schemas. */
  public static final Map<String, Schema> PRIMITIVE_SCHEMAS;
  static {
    final Schema booleanSchema = Schema.create(Schema.Type.BOOLEAN);
    final Schema intSchema = Schema.create(Schema.Type.INT);
    final Schema longSchema = Schema.create(Schema.Type.LONG);
    final Schema floatSchema = Schema.create(Schema.Type.FLOAT);
    final Schema doubleSchema = Schema.create(Schema.Type.DOUBLE);
    final Schema stringSchema = Schema.create(Schema.Type.STRING);


    // Initialize primitive schema mapping.
    PRIMITIVE_SCHEMAS = ImmutableMap
        .<String, Schema>builder()
        .put(boolean.class.getCanonicalName(), booleanSchema)
        .put(Boolean.class.getCanonicalName(), booleanSchema)
        .put(int.class.getCanonicalName(), intSchema)
        .put(Integer.class.getCanonicalName(), intSchema)
        .put(long.class.getCanonicalName(), longSchema)
        .put(Long.class.getCanonicalName(), longSchema)
        .put(float.class.getCanonicalName(), floatSchema)
        .put(Float.class.getCanonicalName(), floatSchema)
        .put(double.class.getCanonicalName(), doubleSchema)
        .put(Double.class.getCanonicalName(), doubleSchema)
        .put(String.class.getCanonicalName(), stringSchema)
        .put(org.apache.avro.util.Utf8.class.getCanonicalName(), stringSchema)
        .put(java.nio.ByteBuffer.class.getCanonicalName(), Schema.create(Schema.Type.BYTES))
        .build();
  }


  private static final Schema NULL_SCHEMA = Schema.create(Schema.Type.NULL);


  /**
   * Reports the Avro schema validation policy.
   *
   * @param cellSpec to get the Avro schema validation policy from.
   * @return the schema validation policy.
   */
  private static AvroValidationPolicy getAvroValidationPolicy(final CellSpec cellSpec) {
    final String validationPolicy = System.getProperty(SCHEMA_VALIDATION_POLICY);
    if (validationPolicy != null) {
      try {
        return AvroValidationPolicy.valueOf(validationPolicy);
      } catch (IllegalArgumentException iae) {
        throw new KijiEncodingException(
            String.format("Unrecognized validation policy: %s", validationPolicy), iae);
      }
    } else {
      return cellSpec.getCellSchema().getAvroValidationPolicy();
    }
  }


  /** Specification of the column to encode. */
  private final CellSpec mCellSpec;


  /** Schema encoder, selected in the constructor according to the column's schema storage. */
  private final SchemaEncoder mSchemaEncoder;


  /**
   * Cache of Avro DatumWriter, keyed by writer schema and populated lazily by getDatumWriter().
   *
   * <p>
   *   Avro datum writers aren't thread-safe, but if we ensure the schema of a datum writer is not
   *   modified, the datum writer becomes thread-safe.
   * </p>
   *
   * <p>
   *   This cache is not globally shared at present.
   *   To share this map globally (ie. static) requires using a WeakIdentityHashMap:
   *   a weak map is required to garbage collect unused schemas;
   *   an identity map is also required as Schema.hashCode/equals are imperfect.
   * </p>
   */
  private final Map<Schema, DatumWriter<Object>> mCachedDatumWriters = Maps.newHashMap();


  /**
   * A byte stream for when encoding to a byte array.
   *
   * The same instance is reset and reused for every encoding, so it must never be written to
   * concurrently. encode(T) is declared synchronized, which serializes access to this stream.
   */
  private final ByteArrayOutputStream mByteArrayOutputStream = new ByteArrayOutputStream();


  /** An encoder that writes to the above byte stream. */
  private final Encoder mByteArrayEncoder =
      EncoderFactory.get().directBinaryEncoder(mByteArrayOutputStream, null);


  /**
   * Configured reader schema for the column to encode.
   *
   * This may currently be null if we only know the fully-qualified name of the record.
   * Eventually, this will always be populated so we can validate records being written against
   * the present reader schema.
   */
  private final Schema mReaderSchema;


  /**
   * Writer schemas registered for the column that this cell encoder will encode cells for.
   *
   * Built from the writers listed in the cell schema, with each union expanded so that its
   * branches are registered as well (see flattenAvroUnions()).
   *
   * Note: This will be null if schema validation is disabled
   * (ie. the cell schema lists no writers).
   */
  private final Set<Schema> mRegisteredWriters;


  /**
   * Avro validation policy to use when encoding cells. This may be overridden using the
   * "org.kiji.schema.impl.AvroCellEncoder.SCHEMA_VALIDATION_POLICY" system property.
   */
  private final AvroValidationPolicy mValidationPolicy;


  // -----------------------------------------------------------------------------------------------


  /**
   * Encodes the writer schema.
   *
   * <p>
   *   The cell encoder invokes this before writing the cell value itself, so whatever an
   *   implementation writes ends up in front of the encoded value.
   * </p>
   */
  private interface SchemaEncoder {
    /**
     * Encodes the writer schema in the cell.
     *
     * @param writerSchema Avro schema of the data being encoded.
     * @throws IOException on I/O error.
     */
    void encode(Schema writerSchema) throws IOException;
  }


  // -----------------------------------------------------------------------------------------------


  /** Schema encoders that uses a hash of the schema. */
  private class SchemaHashEncoder implements SchemaEncoder {
    /** {@inheritDoc} */
    @Override
    public void encode(final Schema writerSchema) throws IOException {
      final BytesKey schemaHash = mCellSpec.getSchemaTable().getOrCreateSchemaHash(writerSchema);
      mByteArrayEncoder.writeFixed(schemaHash.getBytes());
    }
  }


  // -----------------------------------------------------------------------------------------------


  /** Schema encoders that uses the UID of the schema. */
  private class SchemaIdEncoder implements SchemaEncoder {
    /** {@inheritDoc} */
    @Override
    public void encode(final Schema writerSchema) throws IOException {
      final long schemaId = mCellSpec.getSchemaTable().getOrCreateSchemaId(writerSchema);
      mByteArrayEncoder.writeFixed(ByteStreamArray.longToVarInt64(schemaId));
    }
  }


  // -----------------------------------------------------------------------------------------------


  /**
   * Schema encoders for final columns.
   *
   * <p>
   *   Schema is not encoded as part of the HBase cell.
   *   However, the Avro schema of the cell value must exactly match the column reader schema.
   *   In other words, the writer schema must be the reader schema at all times.
   * </p>
   *
   * <p>
   *   This class performs no check itself: the enclosing cell encoder rejects mismatching
   *   writer schemas for FINAL storage before it invokes the schema encoder.
   * </p>
   */
  private static class FinalSchemaEncoder implements SchemaEncoder {
    /** Creates an encoder for a schema of a final column. */
    public FinalSchemaEncoder() {
    }


    /** {@inheritDoc} */
    @Override
    public void encode(final Schema writerSchema) throws IOException {
      // Nothing to encode, because the writer schema is already encoded in the column layout.
      // This means the writer schema must be exactly the declared reader schema.
    }
  }


  /**
   * Creates a schema encoder for the specified cell encoding.
   *
   * @param cellSpec Specification of the cell to encode.
   * @return a schema encoder for the specified cell encoding.
   * @throws IOException on I/O error.
   */
  private SchemaEncoder createSchemaEncoder(final CellSpec cellSpec) throws IOException {
    switch (cellSpec.getCellSchema().getStorage()) {
    case HASH: return new SchemaHashEncoder();
    case UID: return new SchemaIdEncoder();
    case FINAL: return new FinalSchemaEncoder();
    default:
      throw new RuntimeException(
          "Unexpected cell format: " + cellSpec.getCellSchema().getStorage());
    }
  }


  // -----------------------------------------------------------------------------------------------


  /**
   * Creates a new <code>KijiCellEncoder</code> instance.
   *
   * @param cellSpec Specification of the cell to encode.
   * @throws IOException on I/O error.
   */
  public AvroCellEncoder(final CellSpec cellSpec) throws IOException {
    mCellSpec = Preconditions.checkNotNull(cellSpec);
    Preconditions.checkArgument(cellSpec.isAvro());
    mReaderSchema = mCellSpec.getAvroSchema();
    mSchemaEncoder = createSchemaEncoder(mCellSpec);
    mRegisteredWriters = flattenAvroUnions(getRegisteredWriters(mCellSpec));
    mValidationPolicy = getAvroValidationPolicy(mCellSpec);
  }


  /** {@inheritDoc} */
  @Override
  public byte[] encode(final DecodedCell<?> cell) throws IOException {
    return encode(cell.getData());
  }


  /** {@inheritDoc} */
  @Override
  public synchronized <T> byte[] encode(final T cellValue) throws IOException {
    mByteArrayOutputStream.reset();


    // Get the writer schema for this cell.
    final Schema writerSchema = getWriterSchema(cellValue);


    // Perform avro schema validation (if necessary).
    switch (mValidationPolicy) {
      case STRICT: {
        if (!mRegisteredWriters.contains(writerSchema)) {
          throw new KijiEncodingException(
              String.format("Error trying to use unregistered writer schema: %s",
                  writerSchema.toString(true)));
        }
        break;
      }
      case DEVELOPER: {
        if (!mRegisteredWriters.contains(writerSchema)) {
          LOG.info("Writer schema {} is currently not registered for column {}, registering now.",
              writerSchema, mCellSpec.getColumnURI());
          if (mCellSpec.getColumnURI() == null) {
            throw new InternalKijiError("CellSpec has no column URI: " + mCellSpec);
          }


          registerWriterSchema(mCellSpec.getColumnURI(), writerSchema);
        }
        break;
      }
      case NONE:
        // No-op. No validation required.
        break;
      case SCHEMA_1_0:
        // No-op. Validation happens for primitive types only during Avro serialization by setting
        // the writer schema (see getWriterSchema()).
        break;
      default: {
        throw new KijiEncodingException(
            String.format("Unrecognized schema validation policy: %s",
                mValidationPolicy.toString()));
      }
    }


    // Perform final column schema validation (if necessary).
    if (mCellSpec.getCellSchema().getStorage() == SchemaStorage.FINAL
        && !writerSchema.equals(mReaderSchema)) {
      throw new KijiEncodingException(
          String.format("Writer schema: %s does not match final column schema: %s",
              writerSchema.toString(true),
              mReaderSchema.toString(true)));
    }


    // Encode the Avro schema (if necessary):
    mSchemaEncoder.encode(writerSchema);


    // Encode the cell value:
    try {
      getDatumWriter(writerSchema).write(cellValue, mByteArrayEncoder);
    } catch (ClassCastException cce) {
      throw new KijiEncodingException(cce);
    } catch (AvroRuntimeException ure) {
      throw new KijiEncodingException(ure);
    }
    return mByteArrayOutputStream.toByteArray();
  }


  /**
   * Gets a datum writer for a schema and caches it.
   *
   * <p> Not thread-safe, calls to this method must be externally synchronized. </p>
   *
   * @param schema The writer schema.
   * @return A datum writer for the given schema.
   */
  private DatumWriter<Object> getDatumWriter(final Schema schema) {
    final DatumWriter<Object> existing = mCachedDatumWriters.get(schema);
    if (null != existing) {
      return existing;
    }
    final DatumWriter<Object> newWriter = new SpecificDatumWriter<Object>(schema);
    mCachedDatumWriters.put(schema, newWriter);
    return newWriter;
  }


  /**
   * Gets the writer schema of a specified value.
   *
   * @param <T> is the java type of the specified value.
   * @param cellValue to get the Avro schema of.
   * @return an Avro schema representing the type of data specified.
   * @throws KijiEncodingException if no Avro writer schema can be inferred from cellValue.
   */
  private <T> Schema getWriterSchema(final T cellValue) {
    if (cellValue == null) {
      return NULL_SCHEMA;
    } else if (cellValue instanceof GenericContainer) {
      return ((GenericContainer) cellValue).getSchema();
    } else if (mValidationPolicy == AvroValidationPolicy.SCHEMA_1_0) {
      // Compute the writer schema using old semantics. This will only validate primitive schemas.
      return mReaderSchema;
    } else {
      final String className = cellValue.getClass().getCanonicalName();
      final Schema schema = PRIMITIVE_SCHEMAS.get(className);
      if (schema == null) {
        throw new KijiEncodingException(String.format(
            "Unable to infer Avro writer schema for value: '%s'", cellValue));
      }
      return schema;
    }
  }


  /**
   * Gets the registered writer schemas associated with the provided cell specification.
   *
   * @param spec containing registered schemas.
   * @return the set of writer schemas registered for the provided cell.
   *     Null if validation is disabled.
   * @throws IOException if there is an error looking up schemas.
   */
  private static Set<Schema> getRegisteredWriters(final CellSpec spec) throws IOException {
    final List<AvroSchema> writerSchemas = spec.getCellSchema().getWriters();
    if (writerSchemas == null) {
      return null;
    }
    final AvroSchemaResolver resolver = new SchemaTableAvroResolver(spec.getSchemaTable());
    return Sets.newHashSet(Collections2.transform(writerSchemas, resolver));
  }


  /**
   * Flatten and expand the unions from a set of schemas.
   *
   * <p>
   *   For example, the set
   *     <code>{union(null, int), union(null, string)</code>
   *   will be expanded to
   *     <code>{null, int, string, union(null, int), union(null, string)}</code>.
   * </p>
   *
   * @param schemas Set of Avro schemas whose unions are to be expanded.
   * @return the expanded (flattened) set of Avro schemas.
   *     Null iff schemas is null.
   */
  private static Set<Schema> flattenAvroUnions(final Set<Schema> schemas) {
    if (schemas == null) {
      return null;
    }
    final Set<Schema> expanded = Sets.newHashSet();
    for (Schema schema : schemas) {
      expanded.add(schema);
      if (schema.getType() == Schema.Type.UNION) {
        for (Schema branch : schema.getTypes()) {
          Preconditions.checkArgument(branch.getType() != Schema.Type.UNION, branch);
          expanded.add(branch);
        }
      }
    }
    return expanded;
  }


  /**
   * Computes the layout ID directly following a given layout ID.
   *
   * <p>
   *   Increments the layout ID if it is an integer.
   *   Otherwise, forge a layout ID containing a timestamp.
   * </p>
   *
   * @param layoutId Layout ID to compute the next sequential ID from.
   * @return the next sequential layout ID.
   */
  private static String nextLayoutId(String layoutId) {
    try {
      final long lid = Long.parseLong(layoutId);
      return Long.toString(lid + 1);
    } catch (NumberFormatException nfe) {
      return String.format("layout-developer-%d", System.currentTimeMillis());
    }
  }


  /**
   * Registers a new writer schema in a given column.
   *
   * @param columnURI Full URI of the column for which to register a new writer schema.
   * @param writerSchema Writer schema to register.
   * @throws IOException on I/O error.
   */
  private static void registerWriterSchema(final KijiURI columnURI, final Schema writerSchema)
      throws IOException {
    Preconditions.checkArgument(columnURI.getColumns().size() == 1,
        "Expecting exactly one column in URI, got: %s", columnURI);
    final KijiColumnName column = columnURI.getColumns().get(0);


    // TODO(???) the layout updater interface is currently HBase specific.
    //     We should make a backend agnostic API for layout updates.
    final HBaseKiji kiji = (HBaseKiji) Kiji.Factory.open(columnURI);
    try {
      final Function<KijiTableLayout, TableLayoutDesc> update =
          new Function<KijiTableLayout, TableLayoutDesc>() {
            /** {@inheritDoc} */
            @Override
            public TableLayoutDesc apply(final KijiTableLayout refLayout) {
              Preconditions.checkNotNull(refLayout);
              try {
                final TableLayoutDesc refDesc = refLayout.getDesc();
                return new TableLayoutBuilder(refDesc, kiji)
                    .withLayoutId(nextLayoutId(refDesc.getLayoutId()))
                    .withWriter(column, writerSchema)
                    .withWritten(column, writerSchema)
                    .build();
              } catch (InvalidLayoutException ile) {
                LOG.error("Internal error while updating table layout in DEVELOPER mode: {}", ile);
                throw new InternalKijiError(ile);
              } catch (IOException ioe) {
                LOG.error("I/O error while updating table layout in DEVELOPER mode: {}", ioe);
                throw new KijiIOException(ioe);
              }
            }
          };


      try {
        final HBaseTableLayoutUpdater updater =
            new HBaseTableLayoutUpdater(kiji, columnURI, update);
        try {
          updater.update();
        } finally {
          updater.close();
        }
      } catch (KeeperException ke) {
        throw new IOException(ke);
      }
    } finally {
      kiji.release();
    }
  }
}
// Source code of org.kiji.schema.impl.AvroCellEncoder$FinalSchemaEncoder
//
// Related classes of org.kiji.schema.impl.AvroCellEncoder$FinalSchemaEncoder