Source Code of com.google.appengine.tools.mapreduce.impl.GoogleCloudStorageMergeOutput$OrderSlicingOutputWriter

package com.google.appengine.tools.mapreduce.impl;


import static com.google.common.base.Preconditions.checkNotNull;


import com.google.appengine.tools.cloudstorage.GcsFilename;
import com.google.appengine.tools.mapreduce.KeyValue;
import com.google.appengine.tools.mapreduce.Marshaller;
import com.google.appengine.tools.mapreduce.Marshallers;
import com.google.appengine.tools.mapreduce.Output;
import com.google.appengine.tools.mapreduce.OutputWriter;
import com.google.appengine.tools.mapreduce.impl.sort.LexicographicalComparator;
import com.google.appengine.tools.mapreduce.impl.util.SerializableValue;
import com.google.appengine.tools.mapreduce.outputs.GoogleCloudStorageFileOutputWriter;
import com.google.appengine.tools.mapreduce.outputs.GoogleCloudStorageLevelDbOutputWriter;
import com.google.appengine.tools.mapreduce.outputs.ItemSegmentingOutputWriter;
import com.google.appengine.tools.mapreduce.outputs.MarshallingOutputWriter;
import com.google.common.collect.ImmutableList;


import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;


/**
 * Defines the way data is written out by the merging stage. This consists of multiple GCS files
 * using Level Db format. Each file contains KeyValue pairs ordered by their key (same format as the
 * sort output).
 *
 */
public class GoogleCloudStorageMergeOutput extends
    Output<KeyValue<ByteBuffer, List<ByteBuffer>>, FilesByShard> {


  private static final long serialVersionUID = 8332978108336443982L;


  private final String bucket;
  private final String mrJobId;
  private final Integer tier;


  public GoogleCloudStorageMergeOutput(String bucket, String mrJobId, Integer tier) {
    this.tier = checkNotNull(tier, "Null tier");
    this.bucket = checkNotNull(bucket, "Null bucket");
    this.mrJobId = checkNotNull(mrJobId, "Null mrJobId");
  }


  private static class OrderSlicingOutputWriter extends
      ItemSegmentingOutputWriter<KeyValue<ByteBuffer, List<ByteBuffer>>> {


    private static final long serialVersionUID = -2300946785845673658L;
    private static final Marshaller<ByteBuffer> MARSHALLER = Marshallers.getByteBufferMarshaller();
    private final String bucket;
    private final String fileNamePattern;
    private final List<String> fileNames;
    private SerializableValue<ByteBuffer> lastKey;


    /**
     * @param fileNamePattern a Java format string {@link java.util.Formatter} containing one int
     *        argument for the slice number.
     */
    public OrderSlicingOutputWriter(String bucket, String fileNamePattern) {
      this.bucket = checkNotNull(bucket, "Null bucket");
      this.fileNamePattern = checkNotNull(fileNamePattern, "Null fileNamePattern");
      this.fileNames = new ArrayList<>();
    }


    @Override
    public MarshallingOutputWriter<KeyValue<ByteBuffer, List<ByteBuffer>>> createNextWriter(
        int sliceNumber) {
      Marshaller<ByteBuffer> identity = Marshallers.getByteBufferMarshaller();
      String fileName = String.format(fileNamePattern, sliceNumber);
      fileNames.add(fileName);
      return new MarshallingOutputWriter<>(
          new GoogleCloudStorageLevelDbOutputWriter(new GoogleCloudStorageFileOutputWriter(
              new GcsFilename(bucket, fileName), MapReduceConstants.REDUCE_INPUT_MIME_TYPE)),
          Marshallers.getKeyValuesMarshaller(identity, identity));
    }


    public List<String> getFilesCreated() {
      return fileNames;
    }


    @Override
    public long estimateMemoryRequirement() {
      return GoogleCloudStorageFileOutputWriter.MEMORY_REQUIRED;
    }


    @Override
    protected boolean shouldSegment(KeyValue<ByteBuffer, List<ByteBuffer>> value) {
      boolean result = lastKey != null
          && LexicographicalComparator.compareBuffers(lastKey.getValue(), value.getKey()) > 0;
      lastKey = SerializableValue.of(MARSHALLER, value.getKey());
      return result;
    }
  }


  /**
   * Returns a writer that writes the data the same way that the sort does, splitting the output
   * every time the key goes backwards in sequence. This way the output is a collection of files
   * that are individually fully sorted. This works in conjunction with
   * {@link GoogleCloudStorageMergeInput} to convert a large number of sorted files into a much
   * smaller number of sorted files.
   */
  @Override
  public List<? extends OutputWriter<KeyValue<ByteBuffer, List<ByteBuffer>>>> createWriters(
      int shards) {
    ImmutableList.Builder<OutputWriter<KeyValue<ByteBuffer, List<ByteBuffer>>>> result =
        new ImmutableList.Builder<>();
    for (int i = 0; i < shards; i++) {
      result.add(new OrderSlicingOutputWriter(bucket,
          String.format(MapReduceConstants.MERGE_OUTPUT_DIR_FORMAT, mrJobId, tier, i)));
    }
    return result.build();
  }


  @Override
  @SuppressWarnings("unchecked")
  public FilesByShard finish(
      Collection<? extends OutputWriter<KeyValue<ByteBuffer, List<ByteBuffer>>>> writers) {
    FilesByShard filesByShard = new FilesByShard(writers.size(), bucket);
    int shard = 0;
    for (OutputWriter<?> w : writers) {
      OrderSlicingOutputWriter writer = (OrderSlicingOutputWriter) w;
      filesByShard.addFilesToShard(shard++, writer.getFilesCreated());
    }
    return filesByShard;
  }
}
Source Code of com.google.appengine.tools.mapreduce.impl.GoogleCloudStorageMergeOutput$OrderSlicingOutputWriter

Related Classes of com.google.appengine.tools.mapreduce.impl.GoogleCloudStorageMergeOutput$OrderSlicingOutputWriter