/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gora.cassandra.store;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.Collections;
import java.util.concurrent.ConcurrentHashMap;
import me.prettyprint.hector.api.beans.ColumnSlice;
import me.prettyprint.hector.api.beans.HColumn;
import me.prettyprint.hector.api.beans.HSuperColumn;
import me.prettyprint.hector.api.beans.Row;
import me.prettyprint.hector.api.beans.SuperRow;
import me.prettyprint.hector.api.beans.SuperSlice;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericArray;
import org.apache.avro.generic.GenericData.Array;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.specific.SpecificData;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.avro.util.Utf8;
import org.apache.gora.cassandra.query.CassandraQuery;
import org.apache.gora.cassandra.query.CassandraResult;
import org.apache.gora.cassandra.query.CassandraResultSet;
import org.apache.gora.cassandra.query.CassandraRow;
import org.apache.gora.cassandra.query.CassandraSubColumn;
import org.apache.gora.cassandra.query.CassandraSuperColumn;
import org.apache.gora.persistency.Persistent;
import org.apache.gora.persistency.impl.DirtyListWrapper;
import org.apache.gora.persistency.impl.PersistentBase;
import org.apache.gora.query.PartitionQuery;
import org.apache.gora.query.Query;
import org.apache.gora.query.Result;
import org.apache.gora.query.impl.PartitionQueryImpl;
import org.apache.gora.store.DataStoreFactory;
import org.apache.gora.store.impl.DataStoreBase;
import org.apache.gora.cassandra.serializers.AvroSerializerUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * {@link org.apache.gora.cassandra.store.CassandraStore} is the primary class
 * responsible for directing Gora CRUD operations into Cassandra. We delegate
 * heavily to {@link org.apache.gora.cassandra.store.CassandraClient} for many
 * operations such as initialization, creating and deleting schemas (Cassandra
 * Keyspaces), etc.
 */
public class CassandraStore<K, T extends PersistentBase> extends DataStoreBase<K, T> {
/** Logging implementation */
public static final Logger LOG = LoggerFactory.getLogger(CassandraStore.class);
/** Consistency property level for Cassandra column families */
private static final String COL_FAM_CL = "cf.consistency.level";
/** Consistency property level for Cassandra read operations. */
private static final String READ_OP_CL = "read.consistency.level";
/** Consistency property level for Cassandra write operations. */
private static final String WRITE_OP_CL = "write.consistency.level";
/**
 * Variables to hold different consistency levels defined by the properties.
 * NOTE(review): these are public, static and mutable, so they are shared by
 * every CassandraStore instance in the JVM and overwritten on each call to
 * initialize() — confirm the cross-instance sharing is intended.
 */
public static String colFamConsLvl;
public static String readOpConsLvl;
public static String writeOpConsLvl;
/** Client delegate that performs the actual keyspace/column operations. */
private CassandraClient<K, T> cassandraClient = new CassandraClient<K, T>();
/**
 * Fixed string with value "UnionIndex" used to generate an extra column based on
 * the original field's name
 */
public static String UNION_COL_SUFIX = "_UnionIndex";
/**
 * Default schema index with value "0" used when AVRO Union data types are stored
 */
public static int DEFAULT_UNION_SCHEMA = 0;
/**
 * The values are Avro fields pending to be stored.
 *
 * We want to iterate over the keys in insertion order.
 * We don't want to lock the entire collection before iterating over the keys,
 * since in the meantime other threads are adding entries to the map.
 */
private Map<K, T> buffer = Collections.synchronizedMap(new LinkedHashMap<K, T>());
/**
 * Per-thread Avro BinaryEncoder — presumably kept thread-local because
 * encoder instances are stateful and must not be shared between threads
 * (same rationale as the writerMap below); confirm against the serializers.
 */
public static final ThreadLocal<BinaryEncoder> encoders =
new ThreadLocal<BinaryEncoder>();
/**
 * Create a {@link java.util.concurrent.ConcurrentHashMap} for the
 * datum readers and writers.
 * This is necessary because they are not thread safe, at least not before
 * Avro 1.4.0 (See AVRO-650).
 * When they are thread safe, it is possible to maintain a single reader and
 * writer pair for every schema, instead of one for every thread.
 * @see <a href="https://issues.apache.org/jira/browse/AVRO-650">AVRO-650</a>
 */
public static final ConcurrentHashMap<String, SpecificDatumWriter<?>> writerMap =
new ConcurrentHashMap<String, SpecificDatumWriter<?>>();
/** The default constructor for CassandraStore */
public CassandraStore() throws Exception {
}
/**
 * Initialize is called when the call to
 * {@link org.apache.gora.store.DataStoreFactory#createDataStore} is made.
 * In this case, we merely delegate the store initialization to
 * {@link org.apache.gora.cassandra.store.CassandraClient#initialize}.
 *
 * @param keyClass class of the keys identifying rows
 * @param persistent class of the persistent objects stored
 * @param properties properties consulted for the consistency-level settings
 */
public void initialize(Class<K> keyClass, Class<T> persistent, Properties properties) {
  try {
    super.initialize(keyClass, persistent, properties);
    if (autoCreateSchema) {
      // If these are not set, each Cassandra client falls back to its own
      // default consistency level for the column family and for
      // read/write operations.
      colFamConsLvl = DataStoreFactory.findProperty(properties, this, COL_FAM_CL, null);
      readOpConsLvl = DataStoreFactory.findProperty(properties, this, READ_OP_CL, null);
      writeOpConsLvl = DataStoreFactory.findProperty(properties, this, WRITE_OP_CL, null);
    }
    this.cassandraClient.initialize(keyClass, persistent);
  } catch (Exception e) {
    // Pass the Throwable to the logger so the full stack trace is emitted;
    // Throwable#getStackTrace() returns an array, and toString() on an
    // array only prints the array reference, not the trace.
    LOG.error(e.getMessage(), e);
  }
}
/**
 * Closes this store, first flushing any buffered mutations to Cassandra.
 * @see org.apache.gora.store.DataStore#close()
 */
@Override
public void close() {
  LOG.debug("close");
  this.flush();
}
/**
 * Creates the backing Cassandra keyspace by delegating to
 * {@link org.apache.gora.cassandra.store.CassandraClient#checkKeyspace()}.
 */
@Override
public void createSchema() {
  LOG.debug("creating Cassandra keyspace");
  cassandraClient.checkKeyspace();
}
/**
 * Deletes the row identified by the given key. The client call does not
 * report failures, so this method always signals success.
 * @param key key of the row to delete
 * @return always {@code true}
 */
@Override
public boolean delete(K key) {
  cassandraClient.deleteByKey(key);
  return true;
}
/**
 * Delete-by-query is not implemented by this store: the query is only
 * logged and no rows are removed.
 * @param query the query selecting the rows that would be deleted
 * @return always 0, since nothing is deleted
 */
@Override
public long deleteByQuery(Query<K, T> query) {
  // Parameterized logging avoids building the message string when the
  // DEBUG level is disabled.
  LOG.debug("delete by query {}", query);
  return 0;
}
/**
 * Drops the Cassandra keyspace backing this store.
 */
@Override
public void deleteSchema() {
  LOG.debug("delete schema");
  cassandraClient.dropKeyspace();
}
/**
 * When executing Gora Queries in Cassandra we query the Cassandra keyspace by families.
 * When we add sub/supercolumns, Gora keys are mapped to Cassandra partition keys only.
 * This is because we follow the Cassandra logic where column family data is
 * partitioned across nodes based on row Key.
 */
@Override
public Result<K, T> execute(Query<K, T> query) {
  CassandraQuery<K, T> cQuery = new CassandraQuery<K, T>();
  cQuery.setQuery(query);
  Map<String, List<String>> families = this.cassandraClient.getFamilyMap(query);
  cQuery.setFamilyMap(families);

  CassandraResult<K, T> result = new CassandraResult<K, T>(this, query);
  result.setReverseMap(this.cassandraClient.getReverseMap(query));

  CassandraResultSet<K> resultSet = new CassandraResultSet<K>();
  // Query the keyspace one family at a time, skipping unmapped entries and
  // dispatching on whether the family is a super column family.
  for (String family : families.keySet()) {
    if (family != null) {
      if (this.cassandraClient.isSuper(family)) {
        addSuperColumns(family, cQuery, resultSet);
      } else {
        addSubColumns(family, cQuery, resultSet);
      }
    }
  }
  result.setResultSet(resultSet);
  return result;
}
/**
 * Folds the rows returned for one standard column family into the result set.
 * Gora keys are mapped to Cassandra partition keys only, since column family
 * data is partitioned across nodes based on row key.
 */
private void addSubColumns(String family, CassandraQuery<K, T> cassandraQuery,
CassandraResultSet<K> cassandraResultSet) {
  // select family columns that are included in the query
  for (Row<K, ByteBuffer, ByteBuffer> row : this.cassandraClient.execute(cassandraQuery, family)) {
    K rowKey = row.getKey();
    // reuse the result row accumulated so far for this key, creating it on first sight
    CassandraRow<K> resultRow = cassandraResultSet.getRow(rowKey);
    if (resultRow == null) {
      resultRow = new CassandraRow<K>();
      cassandraResultSet.putRow(rowKey, resultRow);
      resultRow.setKey(rowKey);
    }
    for (HColumn<ByteBuffer, ByteBuffer> column : row.getColumnSlice().getColumns()) {
      CassandraSubColumn subColumn = new CassandraSubColumn();
      subColumn.setValue(column);
      subColumn.setFamily(family);
      resultRow.add(subColumn);
    }
  }
}
/**
 * Folds the rows returned for one super column family into the result set.
 * Gora keys are mapped to Cassandra partition keys only, since column family
 * data is partitioned across nodes based on row key.
 */
private void addSuperColumns(String family, CassandraQuery<K, T> cassandraQuery,
CassandraResultSet<K> cassandraResultSet) {
  for (SuperRow<K, String, ByteBuffer, ByteBuffer> superRow : this.cassandraClient.executeSuper(cassandraQuery, family)) {
    K rowKey = superRow.getKey();
    // reuse the result row accumulated so far for this key, creating it on first sight
    CassandraRow<K> resultRow = cassandraResultSet.getRow(rowKey);
    if (resultRow == null) {
      resultRow = new CassandraRow<K>();
      cassandraResultSet.putRow(rowKey, resultRow);
      resultRow.setKey(rowKey);
    }
    for (HSuperColumn<String, ByteBuffer, ByteBuffer> superColumn : superRow.getSuperSlice().getSuperColumns()) {
      CassandraSuperColumn wrapped = new CassandraSuperColumn();
      wrapped.setValue(superColumn);
      wrapped.setFamily(family);
      resultRow.add(wrapped);
    }
  }
}
/**
 * Flush the buffer which is a synchronized {@link java.util.LinkedHashMap}
 * storing fields pending to be stored by
 * {@link org.apache.gora.cassandra.store.CassandraStore#put(Object, PersistentBase)}
 * operations. Invoking this method therefore writes the buffered rows
 * into Cassandra.
 * @see org.apache.gora.store.DataStore#flush()
 */
@Override
public void flush() {
  // Snapshot the keys so we can iterate while other threads keep adding
  // entries; iterating the live key set directly could throw
  // ConcurrentModificationException. The copy constructor delegates to the
  // synchronized view's toArray(), which is atomic, and avoids the
  // unchecked (K[]) cast of the previous Object[]-based snapshot.
  // (This duplicates the key set's memory footprint.)
  List<K> keys = new ArrayList<K>(this.buffer.keySet());
  for (K key : keys) {
    T value = this.buffer.get(key);
    if (value == null) {
      LOG.info("Value to update is null for key: {}", key);
      continue;
    }
    Schema schema = value.getSchema();
    // Persist only the fields that were actually modified.
    for (Field field : schema.getFields()) {
      if (value.isDirty(field.pos())) {
        addOrUpdateField(key, field, field.schema(), value.get(field.pos()));
      }
    }
  }
  // Remove only the flushed keys from the buffer; entries added
  // concurrently during the loop above remain buffered for the next flush.
  for (K key : keys) {
    this.buffer.remove(key);
  }
}
/**
 * Fetches a single row by key, restricted to the requested fields.
 * @param key the key of the row to fetch
 * @param fields the fields to populate, or {@code null} for all fields
 * @return the persistent object, or {@code null} if the row does not exist
 */
@Override
public T get(K key, String[] fields) {
  CassandraQuery<K, T> query = new CassandraQuery<K, T>();
  query.setDataStore(this);
  // a degenerate range [key, key] selects exactly one row
  query.setKeyRange(key, key);
  if (fields == null) {
    fields = this.getFields();
  }
  query.setFields(fields);
  query.setLimit(1);
  Result<K, T> result = execute(query);
  boolean hasResult = false;
  try {
    hasResult = result.next();
  } catch (Exception e) {
    // Route through SLF4J with the cause attached instead of dumping the
    // stack trace to stderr via printStackTrace().
    LOG.error("Error while reading row for key: " + key, e);
  }
  return hasResult ? result.get() : null;
}
/**
 * Partitioning is not implemented yet (see GORA-298): the whole query is
 * returned as a single partition.
 */
@Override
public List<PartitionQuery<K, T>> getPartitions(Query<K, T> query)
throws IOException {
  // TODO GORA-298 Implement CassandraStore#getPartitions
  PartitionQueryImpl<K, T> partition = new PartitionQueryImpl<K, T>(query);
  partition.setConf(getConf());
  List<PartitionQuery<K, T>> partitions = new ArrayList<PartitionQuery<K, T>>(1);
  partitions.add(partition);
  return partitions;
}
/**
 * In Cassandra Schemas are referred to as Keyspaces
 * @return Keyspace
 */
@Override
public String getSchemaName() {
  return cassandraClient.getKeyspaceName();
}
/**
 * Creates a new {@link org.apache.gora.cassandra.query.CassandraQuery}
 * bound to this store, preconfigured to fetch every queryable field.
 */
@Override
public Query<K, T> newQuery() {
  CassandraQuery<K, T> query = new CassandraQuery<K, T>(this);
  query.setFields(getFieldsToQuery(null));
  return query;
}
/**
 * Buffers the given object for persistence under the given key.
 * <p>
 * A duplicate instance of {@code value} is created via
 * {@link org.apache.avro.specific.SpecificData#newRecord(Object, Schema)},
 * and every dirty field is copied into it. Nested ARRAY, MAP, RECORD and
 * UNION values are first prepared by
 * {@link org.apache.gora.cassandra.store.CassandraStore#getFieldValue(Schema, Type, Object)}.
 * The duplicate is then inserted into the synchronized
 * {@link java.util.LinkedHashMap} buffer (a structural modification of the
 * map), keeping all pending objects in memory until
 * {@link org.apache.gora.cassandra.store.CassandraStore#flush()} writes
 * them out.
 *
 * @param key key for the Avro Record (object).
 * @param value Record object to be persisted in Cassandra
 * @see org.apache.gora.store.DataStore#put(java.lang.Object,
 * org.apache.gora.persistency.Persistent).
 */
@Override
public void put(K key, T value) {
  Schema schema = value.getSchema();
  @SuppressWarnings("unchecked")
  T duplicate = (T) SpecificData.get().newRecord(value, schema);
  List<Field> fields = schema.getFields();
  // The field at position 0 is skipped on purpose, matching the
  // position-0 convention used throughout this store (see getFieldValue).
  for (int pos = 1; pos < fields.size(); pos++) {
    if (value.isDirty(pos)) {
      Field field = fields.get(pos);
      Schema fieldSchema = field.schema();
      // resolve nested structures (array, map, record or union) first
      Object prepared = getFieldValue(fieldSchema, fieldSchema.getType(), value.get(field.pos()));
      duplicate.put(field.pos(), prepared);
    }
  }
  // this performs a structural modification of the map
  this.buffer.put(key, duplicate);
}
/**
 * For every field within an object, we pass in a field schema, Type and value.
 * This enables us to process fields (based on their characteristics)
 * preparing them for persistence.
 * @param fieldSchema the associated field schema
 * @param type the field type
 * @param fieldValue the field value.
 * @return the value prepared for persistence (a processed copy for RECORD,
 * the resolved branch value for UNION, the value itself otherwise).
 */
private Object getFieldValue(Schema fieldSchema, Type type, Object fieldValue ){
switch(type) {
case RECORD:
// Duplicate the nested record and recursively prepare its members.
Persistent persistent = (Persistent) fieldValue;
Persistent newRecord = (Persistent) SpecificData.get().newRecord(persistent, persistent.getSchema());
for (Field member: fieldSchema.getFields()) {
// The member at position 0 is skipped by convention (matches the loop
// start in put()); members are also skipped wholesale when the nested
// record carries no dirty state at all.
if (member.pos() == 0 || !persistent.isDirty()) {
continue;
}
Schema memberSchema = member.schema();
Type memberType = memberSchema.getType();
Object memberValue = persistent.get(member.pos());
newRecord.put(member.pos(), getFieldValue(memberSchema, memberType, memberValue));
}
fieldValue = newRecord;
break;
case MAP:
// NOTE(review): this branch only asserts (via the cast) that the value is
// a Map; the value is passed through unchanged.
Map<?, ?> map = (Map<?, ?>) fieldValue;
fieldValue = map;
break;
case ARRAY:
// NOTE(review): pass-through as above — the cast only fails fast with a
// ClassCastException when the value is not a List.
fieldValue = (List<?>) fieldValue;
break;
case UNION:
// storing the union selected schema, the actual value will
// be stored as soon as we get break out.
if (fieldValue != null){
// Resolve which union branch the value belongs to, then prepare the
// value recursively against that branch's schema.
int schemaPos = getUnionSchema(fieldValue,fieldSchema);
Schema unionSchema = fieldSchema.getTypes().get(schemaPos);
Type unionType = unionSchema.getType();
fieldValue = getFieldValue(unionSchema, unionType, fieldValue);
}
// The branch index itself is persisted separately by addOrUpdateField via
// the UNION_COL_SUFIX column.
break;
default:
// Primitive types need no preparation.
break;
}
return fieldValue;
}
/**
 * Add a field to Cassandra according to its type.
 * @param key the key of the row where the field should be added
 * @param field the Avro field representing a datum
 * @param schema the schema belonging to the particular Avro field
 * @param value the field value; {@code null} deletes the stored content
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
private void addOrUpdateField(K key, Field field, Schema schema, Object value) {
Type type = schema.getType();
// checking if the value to be updated is used for saving union schema:
// fields whose name contains UNION_COL_SUFIX are the synthetic
// union-index columns written by the UNION case below and must not be
// re-processed here.
if (field.name().indexOf(CassandraStore.UNION_COL_SUFIX) < 0){
switch (type) {
// Primitive-like types are written directly as a single column.
case STRING:
case BOOLEAN:
case INT:
case LONG:
case BYTES:
case FLOAT:
case DOUBLE:
case FIXED:
this.cassandraClient.addColumn(key, field.name(), value);
break;
case RECORD:
if (value != null) {
if (value instanceof PersistentBase) {
PersistentBase persistentBase = (PersistentBase) value;
try {
// Nested records are Avro-serialized into a single binary column.
byte[] byteValue = AvroSerializerUtil.serializer(persistentBase, schema);
this.cassandraClient.addColumn(key, field.name(), byteValue);
} catch (IOException e) {
LOG.warn(field.name() + " named record could not be serialized.");
}
} else {
LOG.warn("Record with value: " + value.toString() + " not supported for field: " + field.name());
}
} else {
// A null record clears the stored column.
LOG.warn("Setting content of: " + field.name() + " to null.");
String familyName = this.cassandraClient.getCassandraMapping().getFamily(field.name());
this.cassandraClient.deleteColumn(key, familyName, this.cassandraClient.toByteBuffer(field.name()));
}
break;
case MAP:
if (value != null) {
if (value instanceof Map<?, ?>) {
Map<CharSequence,Object> map = (Map<CharSequence,Object>)value;
Schema valueSchema = schema.getValueType();
Type valueType = valueSchema.getType();
if (Type.UNION.equals(valueType)){
// For union-typed values, store each entry's union-branch index next
// to the value under a synthetic "<key>_UnionIndex" entry.
Map<CharSequence,Object> valueMap = new HashMap<CharSequence, Object>();
for (CharSequence mapKey: map.keySet()) {
Object mapValue = map.get(mapKey);
int valueUnionIndex = getUnionSchema(mapValue, valueSchema);
valueMap.put((mapKey+UNION_COL_SUFIX), valueUnionIndex);
valueMap.put(mapKey, mapValue);
}
map = valueMap;
}
String familyName = this.cassandraClient.getCassandraMapping().getFamily(field.name());
// If map is not super column. We using Avro serializer.
if (!this.cassandraClient.isSuper( familyName )){
try {
byte[] byteValue = AvroSerializerUtil.serializer(map, schema);
this.cassandraClient.addColumn(key, field.name(), byteValue);
} catch (IOException e) {
LOG.warn(field.name() + " named map could not be serialized.");
}
}else{
// Super column families store the map entries as individual sub-columns.
this.cassandraClient.addStatefulHashMap(key, field.name(), map);
}
} else {
LOG.warn("Map with value: " + value.toString() + " not supported for field: " + field.name());
}
} else {
// delete map
LOG.warn("Setting content of: " + field.name() + " to null.");
this.cassandraClient.deleteStatefulHashMap(key, field.name());
}
break;
case ARRAY:
if (value != null) {
if (value instanceof DirtyListWrapper<?>) {
// Copy the dirty-tracked list into a plain Avro GenericArray before
// handing it to the client.
DirtyListWrapper fieldValue = (DirtyListWrapper<?>)value;
GenericArray valueArray = new Array(fieldValue.size(), schema);
for (int i = 0; i < fieldValue.size(); i++) {
valueArray.add(i, fieldValue.get(i));
}
this.cassandraClient.addGenericArray(key, field.name(), (GenericArray<?>)valueArray);
} else {
LOG.warn("Array with value: " + value.toString() + " not supported for field: " + field.name());
}
} else {
LOG.warn("Setting content of: " + field.name() + " to null.");
this.cassandraClient.deleteGenericArray(key, field.name());
}
break;
case UNION:
// adding union schema index: the selected branch index is persisted in a
// companion "<field>_UnionIndex" column so reads can resolve the branch.
String columnName = field.name() + UNION_COL_SUFIX;
String familyName = this.cassandraClient.getCassandraMapping().getFamily(field.name());
if(value != null) {
int schemaPos = getUnionSchema(value, schema);
LOG.debug("Union with value: " + value.toString() + " at index: " + schemaPos + " supported for field: " + field.name());
this.cassandraClient.getCassandraMapping().addColumn(familyName, columnName, columnName);
if (this.cassandraClient.isSuper( familyName )){
this.cassandraClient.addSubColumn(key, columnName, columnName, schemaPos);
}else{
this.cassandraClient.addColumn(key, columnName, schemaPos);
}
// adding union value: recurse with the resolved branch schema so the
// value is written according to its concrete type.
Schema unionSchema = schema.getTypes().get(schemaPos);
addOrUpdateField(key, field, unionSchema, value);
} else {
// A null union value clears the stored content.
LOG.warn("Setting content of: " + field.name() + " to null.");
if (this.cassandraClient.isSuper( familyName )){
this.cassandraClient.deleteSubColumn(key, field.name());
} else {
this.cassandraClient.deleteColumn(key, familyName, this.cassandraClient.toByteBuffer(field.name()));
}
}
break;
default:
LOG.warn("Type: " + type.name() + " not considered for field: " + field.name() + ". Please report this to dev@gora.apache.org");
}
}
}
/**
 * Given an object and the object schema this function obtains,
 * from within the UNION schema, the position of the type used.
 * If no data type can be inferred then we return a default value
 * of position 0.
 * @param pValue the value whose concrete type should be located in the union
 * @param pUnionSchema the UNION schema listing the candidate branch types
 * @return the unionSchemaPosition, or {@code DEFAULT_UNION_SCHEMA} when no
 * branch matches.
 */
private int getUnionSchema(Object pValue, Schema pUnionSchema){
int unionSchemaPos = 0;
Iterator<Schema> it = pUnionSchema.getTypes().iterator();
while ( it.hasNext() ){
Type schemaType = it.next().getType();
// Avro string values may arrive as Utf8 or as java.lang.String; both
// implement CharSequence, so match on the interface rather than Utf8
// alone (a plain String would otherwise fall through to the default).
if (pValue instanceof CharSequence && schemaType.equals(Type.STRING))
return unionSchemaPos;
else if (pValue instanceof ByteBuffer && schemaType.equals(Type.BYTES))
return unionSchemaPos;
else if (pValue instanceof Integer && schemaType.equals(Type.INT))
return unionSchemaPos;
else if (pValue instanceof Long && schemaType.equals(Type.LONG))
return unionSchemaPos;
else if (pValue instanceof Double && schemaType.equals(Type.DOUBLE))
return unionSchemaPos;
else if (pValue instanceof Float && schemaType.equals(Type.FLOAT))
return unionSchemaPos;
else if (pValue instanceof Boolean && schemaType.equals(Type.BOOLEAN))
return unionSchemaPos;
else if (pValue instanceof Map && schemaType.equals(Type.MAP))
return unionSchemaPos;
else if (pValue instanceof List && schemaType.equals(Type.ARRAY))
return unionSchemaPos;
else if (pValue instanceof Persistent && schemaType.equals(Type.RECORD))
return unionSchemaPos;
unionSchemaPos ++;
}
// if we weren't able to determine which data type it is, then we return the default
return DEFAULT_UNION_SCHEMA;
}
/**
 * Simple method to check if a Cassandra Keyspace exists.
 * @return true if a Keyspace exists.
 */
@Override
public boolean schemaExists() {
  LOG.info("schema exists");
  return this.cassandraClient.keyspaceExists();
}
}