Package com.google.appengine.tools.mapreduce.impl

Source Code of com.google.appengine.tools.mapreduce.impl.GoogleCloudStorageMapOutput$ShardingOutputWriterImpl

package com.google.appengine.tools.mapreduce.impl;

import static com.google.appengine.tools.mapreduce.impl.MapReduceConstants.MAP_OUTPUT_MIME_TYPE;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import com.google.appengine.tools.cloudstorage.GcsFilename;
import com.google.appengine.tools.mapreduce.KeyValue;
import com.google.appengine.tools.mapreduce.Marshaller;
import com.google.appengine.tools.mapreduce.Output;
import com.google.appengine.tools.mapreduce.OutputWriter;
import com.google.appengine.tools.mapreduce.Sharder;
import com.google.appengine.tools.mapreduce.outputs.GoogleCloudStorageFileOutputWriter;
import com.google.appengine.tools.mapreduce.outputs.GoogleCloudStorageLevelDbOutputWriter;
import com.google.appengine.tools.mapreduce.outputs.MarshallingOutputWriter;
import com.google.appengine.tools.mapreduce.outputs.ShardingOutputWriter;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;

/**
* Defines the format the data is written out of from the Mapper. This consists of a number of files
* in GCS where the content is split up by the provided {@link Sharder} each of these is written in
* LevelDb Format and then using the {@link KeyValueMarshaller} to marshall the individual record.
*
*
* @param <K> type of intermediate keys
* @param <V> type of intermediate values
*/
public class GoogleCloudStorageMapOutput<K, V> extends Output<KeyValue<K, V>, FilesByShard> {

  private static final long serialVersionUID = 1929076635709093020L;

  private final String mrJobId;
  private final String bucket;
  private final Marshaller<K> keyMarshaller;
  private final Marshaller<V> valueMarshaller;

  private final Sharder sharder;

  public GoogleCloudStorageMapOutput(String bucket, String mrJobId, Marshaller<K> keyMarshaller,
      Marshaller<V> valueMarshaller, Sharder sharder) {
    this.bucket = checkNotNull(bucket, "Null bucket");
    this.sharder = checkNotNull(sharder, "Null sharder");
    this.mrJobId = checkNotNull(mrJobId, "Null mrJobId");
    checkArgument(sharder.getNumShards() >= 0);
    this.keyMarshaller = checkNotNull(keyMarshaller, "Null keyMarshaller");
    this.valueMarshaller = checkNotNull(valueMarshaller, "Null valueMarshaller");
  }

  private static class ShardingOutputWriterImpl<K, V> extends
      ShardingOutputWriter<K, V, OutputWriter<KeyValue<K, V>>> {

    private static final long serialVersionUID = 1674046447114388281L;
    private final String fileNamePattern;
    private final String bucket;
    private final Marshaller<K> keyMarshaller;
    private final Marshaller<V> valueMarshaller;

    public ShardingOutputWriterImpl(String bucket, String fileNamePattern,
        Marshaller<K> keyMarshaller, Marshaller<V> valueMarshaller, Sharder sharder) {
      super(keyMarshaller, sharder);
      this.bucket = bucket;
      this.fileNamePattern = fileNamePattern;
      this.keyMarshaller = keyMarshaller;
      this.valueMarshaller = valueMarshaller;
    }

    @Override
    public OutputWriter<KeyValue<K, V>> createWriter(int sortShard) {
      GcsFilename file = new GcsFilename(bucket, getFileName(sortShard));
      MarshallingOutputWriter<KeyValue<K, V>> output =
          new MarshallingOutputWriter<>(new GoogleCloudStorageLevelDbOutputWriter(
              new GoogleCloudStorageFileOutputWriter(file, MAP_OUTPUT_MIME_TYPE)),
              new KeyValueMarshaller<>(keyMarshaller, valueMarshaller));
      return output;
    }

    @Override
    protected Map<Integer, OutputWriter<KeyValue<K, V>>> getShardsToWriterMap() {
      return super.getShardsToWriterMap();
    }

    @Override
    public long estimateMemoryRequirement() {
      return sharder.getNumShards() * GoogleCloudStorageFileOutputWriter.MEMORY_REQUIRED;
    }

    private String getFileName(int sortShard) {
      return String.format(fileNamePattern, sortShard);
    }
  }

  @Override
  public List<? extends OutputWriter<KeyValue<K, V>>> createWriters(int shards) {
    List<ShardingOutputWriterImpl<K, V>> result = new ArrayList<>(shards);
    for (int i = 0; i < shards; i++) {
      String fileNamePattern = // Filled in later
          String.format(MapReduceConstants.MAP_OUTPUT_DIR_FORMAT, mrJobId, i);
      ShardingOutputWriterImpl<K, V> shardingWriter = new ShardingOutputWriterImpl<>(bucket,
          fileNamePattern, keyMarshaller, valueMarshaller, sharder);
      result.add(shardingWriter);
    }
    return result;
  }

  @Override
  public FilesByShard finish(Collection<? extends OutputWriter<KeyValue<K, V>>> writers)
      throws IOException {
    FilesByShard result = new FilesByShard(sharder.getNumShards(), bucket);
    for (OutputWriter<KeyValue<K, V>> writer : writers) {
      @SuppressWarnings("unchecked")
      ShardingOutputWriterImpl<K, V> shardingWriter = (ShardingOutputWriterImpl<K, V>) writer;
      for (int sortShard : shardingWriter.getShardsToWriterMap().keySet()) {
        result.addFileToShard(sortShard, shardingWriter.getFileName(sortShard));
      }
    }
    return result;
  }
}
TOP

Related Classes of com.google.appengine.tools.mapreduce.impl.GoogleCloudStorageMapOutput$ShardingOutputWriterImpl

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.