/**
* Copyright 2013 Netflix, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.netflix.aegisthus.mapred.reduce;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.cassandra.db.Column;
import org.apache.cassandra.db.ColumnSerializer;
import org.apache.cassandra.db.OnDiskAtom;
import org.apache.cassandra.db.RangeTombstone;
import org.apache.cassandra.db.marshal.BytesType;
import org.apache.cassandra.io.sstable.Descriptor.Version;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import com.netflix.aegisthus.io.writable.AtomWritable;
import com.netflix.aegisthus.io.writable.CompositeKey;
import com.netflix.aegisthus.tools.StorageHelper;
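/**
 * Reducer that merges all of the {@link AtomWritable} atoms emitted for a single row key into
 * one row and writes that row, in Cassandra SSTable on-disk format, to a -Data.db file in the
 * task attempt's temporary output location.
 *
 * A hypothetical sketch of how a driver might wire this reducer into a job (the actual
 * Aegisthus job setup lives elsewhere and may differ):
 *
 * <pre>
 * Job job = Job.getInstance(conf);
 * job.getConfiguration().set("aegisthus.dataset", "keyspace-columnfamily");
 * job.setMapOutputKeyClass(CompositeKey.class);
 * job.setMapOutputValueClass(AtomWritable.class);
 * job.setReducerClass(CassSSTableReducer.class);
 * </pre>
 */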
public class CassSSTableReducer extends Reducer<CompositeKey, AtomWritable, Text, Text> {
private static final Logger LOG = LoggerFactory.getLogger(CassSSTableReducer.class);
FSDataOutputStream dos;
Reduce reduce;
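/** Closes the output Data.db stream once the task has finished reducing. */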
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
dos.close();
super.cleanup(context);
}
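/**
 * Builds a {@link Reduce} for the current row key, feeds it every atom seen for that key
 * (keeping the largest row deletion timestamp), and writes the merged row to the output
 * Data.db file.
 */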
@Override
public void reduce(CompositeKey key, Iterable<AtomWritable> values, Context ctx) throws IOException, InterruptedException {
reduce = new Reduce();
for (AtomWritable value : values) {
reduce.setKey(value.getKey());
if (value.getDeletedAt() > reduce.getDeletedAt()) {
reduce.setDeletedAt(value.getDeletedAt());
}
reduce.addAtom(value.getAtom());
}
reduce.finalizeReduce();
reduce.writeRow(dos);
// TODO: batch this
ctx.getCounter("aegisthus", "rows_written").increment(1L);
}
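/**
 * Opens the output -Data.db file in the task attempt's temporary location. The file name is
 * built from the configured dataset name, the current SSTable format version, and the task
 * and attempt ids.
 */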
@Override
protected void setup(Context context) throws IOException, InterruptedException {
StorageHelper sh = new StorageHelper(context);
Path outputDir = new Path(sh.getBaseTaskAttemptTempLocation());
FileSystem fs = outputDir.getFileSystem(context.getConfiguration());
String filename = String.format("%s-%s-%05d0%04d-Data.db",
context.getConfiguration().get("aegisthus.dataset", "keyspace-dataset"), Version.current_version,
context.getTaskAttemptID().getTaskID().getId(), context.getTaskAttemptID().getId());
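// Example with hypothetical values: dataset "ks-cf", SSTable version "ic", task id 3,
// attempt id 0 yields "ks-cf-ic-0000300000-Data.db".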
Path outputFile = new Path(outputDir, filename);
LOG.info("writing to: {}", outputFile.toUri());
dos = fs.create(outputFile);
super.setup(context);
}
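/**
 * Accumulates the atoms for a single row: tracks range tombstones, keeps the newest version
 * of each column, drops columns covered by the row or range tombstones, and serializes the
 * merged row in SSTable format.
 */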
static class Reduce {
private final List<OnDiskAtom> columns = Lists.newArrayList();
private final OnDiskAtom.Serializer serializer = new OnDiskAtom.Serializer(new ColumnSerializer());
// TODO: the comparator should come from the column family metadata; BytesType is a stand-in
private final RangeTombstone.Tracker tombstoneTracker = new RangeTombstone.Tracker(BytesType.instance);
private OnDiskAtom currentColumn = null;
private long deletedAt = Long.MIN_VALUE;
private byte[] key;
public void setKey(byte[] key) {
this.key = key;
}
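/**
 * Adds one atom to the row being built. Columns deleted by a tracked range tombstone are
 * discarded, duplicate columns (same name) are resolved by keeping the larger timestamp,
 * and range tombstones themselves are tracked but never emitted.
 */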
@SuppressWarnings("StatementWithEmptyBody")
public void addAtom(OnDiskAtom atom) {
if (atom == null) {
return;
}
this.tombstoneTracker.update(atom);
// For now we only keep regular columns. This works because a snapshot contains every
// column that a range tombstone applies to; it would no longer hold if we moved to
// partial, incremental processing.
if (atom instanceof Column) {
Column column = (Column) atom;
if (this.tombstoneTracker.isDeleted(column)) {
// The column is deleted by a range tombstone, so discard it. Every other column with
// the same name will be discarded as well, unless it has a later timestamp than the
// range tombstone, in which case this copy is out of date anyway.
} else if (currentColumn == null) {
currentColumn = column;
} else if (currentColumn.name().equals(column.name())) {
if (column.timestamp() > currentColumn.minTimestamp()) {
currentColumn = column;
}
} else {
columns.add(currentColumn);
currentColumn = column;
}
} else if (atom instanceof RangeTombstone) {
// Range tombstones themselves are not written to the output; the columns they delete are discarded above.
} else {
throw new IllegalArgumentException("Cassandra added a new type " + atom.getClass().getCanonicalName() + " which we do not support");
}
}
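/**
 * Flushes the last pending column and then, mirroring Cassandra compaction, drops any column
 * whose timestamp is not newer than the row deletion timestamp.
 */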
public void finalizeReduce() {
if (currentColumn != null) {
columns.add(currentColumn);
}
// When Cassandra compacts, it removes columns in a deleted row that are older than the
// row's deletion timestamp. We duplicate that behavior here. If the ETL needs this data
// at some point we can change it, but the data is only available if Cassandra has not
// already discarded it.
Iterator<OnDiskAtom> columnIterator = columns.iterator();
while (columnIterator.hasNext()) {
OnDiskAtom atom = columnIterator.next();
if (atom instanceof RangeTombstone) {
columnIterator.remove();
} else if (atom instanceof Column && ((Column) atom).timestamp() <= this.deletedAt) {
columnIterator.remove();
}
}
}
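/**
 * Writes the merged row in the Data.db layout this reducer produces:
 *
 * <pre>
 * short  row key length
 * bytes  row key
 * long   data size (the 16 bytes of deletion info and column count plus the serialized atoms)
 * int    local deletion time (deletedAt / 1000)
 * long   markedForDeleteAt (deletedAt)
 * int    column count
 * ...    serialized atoms
 * </pre>
 */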
public void writeRow(DataOutput dos) throws IOException {
dos.writeShort(this.key.length);
dos.write(this.key);
long dataSize = 16; // 16 bytes for the int local deletion time, long markedForDeleteAt, and int column count written after this loop
for (OnDiskAtom atom : columns) {
dataSize += atom.serializedSizeForSSTable();
}
dos.writeLong(dataSize);
dos.writeInt((int) (this.deletedAt / 1000));
dos.writeLong(this.deletedAt);
dos.writeInt(columns.size());
for (OnDiskAtom atom : columns) {
serializer.serializeForSSTable(atom, dos);
}
}
public long getDeletedAt() {
return deletedAt;
}
public void setDeletedAt(long deletedAt) {
this.deletedAt = deletedAt;
}
}
}