Package com.google.appengine.tools.mapreduce.bigqueryjobs

Source Code of com.google.appengine.tools.mapreduce.bigqueryjobs.BigQueryLoadGoogleCloudStorageFilesJob

package com.google.appengine.tools.mapreduce.bigqueryjobs;

import com.google.api.client.googleapis.extensions.appengine.auth.oauth2.AppIdentityCredential;
import com.google.api.client.googleapis.services.GoogleClientRequestInitializer;
import com.google.api.client.googleapis.services.json.AbstractGoogleJsonClientRequest;
import com.google.api.client.googleapis.services.json.CommonGoogleJsonClientRequestInitializer;
import com.google.api.client.http.HttpTransport;
import com.google.api.client.http.javanet.NetHttpTransport;
import com.google.api.client.json.JsonFactory;
import com.google.api.client.json.jackson2.JacksonFactory;
import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.BigqueryRequest;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.appengine.tools.cloudstorage.GcsFilename;
import com.google.appengine.tools.cloudstorage.GcsService;
import com.google.appengine.tools.cloudstorage.GcsServiceFactory;
import com.google.appengine.tools.mapreduce.GoogleCloudStorageFileSet;
import com.google.appengine.tools.mapreduce.Marshallers;
import com.google.appengine.tools.mapreduce.impl.BigQueryConstants;
import com.google.appengine.tools.mapreduce.impl.util.SerializableValue;
import com.google.appengine.tools.mapreduce.outputs.BigQueryStoreResult;
import com.google.appengine.tools.pipeline.FutureValue;
import com.google.appengine.tools.pipeline.Job1;
import com.google.appengine.tools.pipeline.Value;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
* A pipeline job that loads files stored in Google Cloud Storage into a bigquery table.
*/
public final class BigQueryLoadGoogleCloudStorageFilesJob extends
    Job1<List<BigQueryLoadJobReference>, BigQueryStoreResult<GoogleCloudStorageFileSet>> {

  private static final long serialVersionUID = 4162438273017726233L;
  private final String dataset;
  private final String tableName;
  private final String projectId;

  private static final HttpTransport HTTP_TRANSPORT = new NetHttpTransport();
  private static final JsonFactory JSON_FACTORY = new JacksonFactory();

  private static final Logger log =
      Logger.getLogger(BigQueryLoadGoogleCloudStorageFilesJob.class.getName());

  static Bigquery getBigquery() {
    List<String> scopes = Lists.newArrayList();
    scopes.add(BigQueryConstants.BQ_SCOPE);
    AppIdentityCredential credential = new AppIdentityCredential.Builder(scopes).build();
    GoogleClientRequestInitializer initializer = new CommonGoogleJsonClientRequestInitializer() {
      @SuppressWarnings("unused")
      public void initialize(
          @SuppressWarnings("rawtypes") AbstractGoogleJsonClientRequest request) {
        @SuppressWarnings("rawtypes")
        BigqueryRequest bigqueryRequest = (BigqueryRequest) request;
        bigqueryRequest.setPrettyPrint(true);
      }
    };
    return new Bigquery.Builder(HTTP_TRANSPORT, JSON_FACTORY, credential)
        .setHttpRequestInitializer(credential).setGoogleClientRequestInitializer(initializer)
        .build();
  }

  /**
   * @param dataset the name of the bigquery dataset.
   * @param tableName name of the bigquery table to load data.
   * @param projectId bigquery project Id.
   */
  public BigQueryLoadGoogleCloudStorageFilesJob(String dataset, String tableName,
      String projectId) {
    this.dataset = dataset;
    this.tableName = tableName;
    this.projectId = projectId;
  }

  /**
   * Divides the files into bundles having size less than or equal to the maximum size allowed per
   * bigquery load {@link Job} and then starts separate pipeline jobs for each of the bundles.
   * Returns a list of {@link JobReference}s.
   */
  @Override
  public Value<List<BigQueryLoadJobReference>> run(
      BigQueryStoreResult<GoogleCloudStorageFileSet> bigQueryStoreResult) throws Exception {
    BigQueryStoreResult<GoogleCloudStorageFileSet> outputResult = bigQueryStoreResult;
    List<GcsFilename> files = outputResult.getResult().getFiles();
    List<List<GcsFilename>> bundles =
        bundleFiles(files, BigQueryConstants.BIGQUERY_LOAD_DATA_SIZE_LIMIT);
    List<FutureValue<BigQueryLoadJobReference>> jobReferenceList = new ArrayList<>();
    for (List<GcsFilename> bundle : bundles) {
      jobReferenceList.add(futureCall(new BigQueryLoadFileSetJob(dataset, tableName, projectId,
          bundle, SerializableValue.of(Marshallers.getGenericJsonMarshaller(TableSchema.class),
              outputResult.getSchema())), immediate(Integer.valueOf(0))));
    }
    return futureList(jobReferenceList);
  }

  /**
   * @param files list of gcs files to load into bigquery.
   * @param bundleSizeLimit size limit of the bundle of files.
   * @return List of files bundled together so that the combined size of the files is less than the
   *         specified bundle size limit.
   * @throws IOException
   */
  private List<List<GcsFilename>> bundleFiles(List<GcsFilename> files, long bundleSizeLimit)
      throws IOException {
    List<List<GcsFilename>> bundles = new ArrayList<>();
    List<GcsFilename> currentBundle = new ArrayList<>();
    long currentBundleSize = 0;
    GcsService GCS_SERVICE = GcsServiceFactory.createGcsService();
    for (GcsFilename file : files) {
      long fileSize = GCS_SERVICE.getMetadata(file).getLength();
      if (currentBundleSize + fileSize > bundleSizeLimit) {
        bundles.add(ImmutableSet.copyOf(currentBundle).asList());
        currentBundle = new ArrayList<>();
        currentBundleSize = 0;
      }
      currentBundle.add(file);
      currentBundleSize += fileSize;
    }
    bundles.add(ImmutableSet.copyOf(currentBundle).asList());
    return bundles;
  }

  public Value<BigQueryStoreResult<GoogleCloudStorageFileSet>> handleException(Throwable t)
      throws Throwable {
    log.log(Level.SEVERE, "Bigquery data load job failed because of : ", t);
    throw t;
  }
}
TOP

Related Classes of com.google.appengine.tools.mapreduce.bigqueryjobs.BigQueryLoadGoogleCloudStorageFilesJob

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.