/*
* Copyright © 2014 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package co.cask.cdap.internal.app.runtime.spark;

import co.cask.cdap.api.data.batch.BatchReadable;
import co.cask.cdap.api.data.batch.Split;
import co.cask.cdap.api.dataset.Dataset;
import co.cask.cdap.api.spark.SparkContext;
import co.cask.cdap.api.spark.SparkSpecification;
import co.cask.cdap.app.runtime.Arguments;
import co.cask.cdap.internal.app.runtime.spark.dataset.SparkDatasetInputFormat;
import co.cask.cdap.internal.app.runtime.spark.dataset.SparkDatasetOutputFormat;
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.spark.SparkConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.net.URL;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

/**
* An abstract class that implements {@link SparkContext} and provides a concrete implementation of the
* functionality shared by {@link JavaSparkContext} and {@link ScalaSparkContext}.
*/
abstract class AbstractSparkContext implements SparkContext {
private static final Logger LOG = LoggerFactory.getLogger(AbstractSparkContext.class);
private static final Pattern SPACES = Pattern.compile("\\s+");
private static final String[] NO_ARGS = {};
private final Configuration hConf;
private final long logicalStartTime;
private final SparkSpecification spec;
private final Arguments runtimeArguments;
final BasicSparkContext basicSparkContext;
private final SparkConf sparkConf;

public AbstractSparkContext() {
hConf = loadHConf();
// Create an instance of BasicSparkContext from the Hadoop Configuration file which was just loaded
SparkContextProvider sparkContextProvider = new SparkContextProvider(hConf);
basicSparkContext = sparkContextProvider.get();
this.logicalStartTime = basicSparkContext.getLogicalStartTime();
this.spec = basicSparkContext.getSpecification();
this.runtimeArguments = basicSparkContext.getRuntimeArgs();
this.sparkConf = initializeSparkConf();
}

/**
* Initializes the {@link SparkConf} with proper settings.
*
* @return the initialized {@link SparkConf}
*/
private SparkConf initializeSparkConf() {
SparkConf sparkConf = new SparkConf();
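// SparkConf() picks up any spark.* settings from the submitter's system properties;
// only the application name is set explicitly here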
sparkConf.setAppName(basicSparkContext.getProgramName());
return sparkConf;
}

public SparkConf getSparkConf() {
return sparkConf;
}

Configuration getHConf() {
return hConf;
}

/**
* Loads the Hadoop {@link Configuration} file packaged in the job jar and adds it as a resource.
* This configuration is needed to read/write a {@link Dataset} through {@link SparkDatasetInputFormat}/
* {@link SparkDatasetOutputFormat}, which back {@link JavaSparkContext#readFromDataset(String, Class, Class)}
* and {@link ScalaSparkContext#readFromDataset(String, Class, Class)}.
* This method requires that the configuration file named {@link SparkRuntimeService#SPARK_HCONF_FILENAME} is
* present in the job jar.
*/
private Configuration loadHConf() {
// TODO: Inject through Guice in Distributed mode, see CDAP-3
Configuration hConf = new Configuration();
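// start from an empty Configuration: clear() drops any properties loaded from default resources
// so that only the settings from the packaged file added below take effect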
hConf.clear();
URL url = Thread.currentThread().getContextClassLoader().getResource(SparkRuntimeService.SPARK_HCONF_FILENAME);
if (url == null) {
LOG.error("Unable to find Hadoop Configuration file {} in the submitted jar.",
SparkRuntimeService.SPARK_HCONF_FILENAME);
throw new RuntimeException("Hadoop Configuration file not found in the supplied jar. Please include Hadoop " +
"Configuration file with name " + SparkRuntimeService.SPARK_HCONF_FILENAME);
}
hConf.addResource(url);
return hConf;
}

/**
* Sets the input {@link Dataset} with its splits in the {@link Configuration}.
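* <p>
* A minimal usage sketch (the dataset name is hypothetical):
* <pre>{@code
* Configuration inputConf = setInputDataset("purchases");
* // inputConf now carries SparkDatasetInputFormat, the dataset name and its serialized splits
* }</pre>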
*
* @param datasetName the name of the {@link Dataset} to read from
* @return updated {@link Configuration}
* @throws IllegalArgumentException if the {@link Dataset} to read is not {@link BatchReadable}
*/
Configuration setInputDataset(String datasetName) {
Configuration hConf = new Configuration(getHConf());
Dataset dataset = basicSparkContext.getDataSet(datasetName);
List<Split> inputSplits;
if (dataset instanceof BatchReadable) {
BatchReadable curDataset = (BatchReadable) dataset;
inputSplits = curDataset.getSplits();
} else {
throw new IllegalArgumentException("Failed to read dataset " + datasetName + ". The dataset does not implement" +
" BatchReadable");
}
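// record the input format, the dataset name, the split class (all splits are assumed to share
// one class) and the Gson-serialized splits, for SparkDatasetInputFormat to read back later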
hConf.setClass(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, SparkDatasetInputFormat.class, InputFormat.class);
hConf.set(SparkDatasetInputFormat.HCONF_ATTR_INPUT_DATASET, datasetName);
hConf.set(SparkContextConfig.HCONF_ATTR_INPUT_SPLIT_CLASS, inputSplits.get(0).getClass().getName());
hConf.set(SparkContextConfig.HCONF_ATTR_INPUT_SPLITS, new Gson().toJson(inputSplits));
return hConf;
}

/**
* Sets the output {@link Dataset} in the {@link Configuration}.
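* <p>
* A minimal usage sketch (the dataset name is hypothetical):
* <pre>{@code
* Configuration outputConf = setOutputDataset("purchaseTotals");
* // outputConf now carries SparkDatasetOutputFormat and the target dataset name
* }</pre>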
*
* @param datasetName the name of the {@link Dataset} to write to
* @return updated {@link Configuration}
*/
Configuration setOutputDataset(String datasetName) {
Configuration hConf = new Configuration(getHConf());
hConf.set(SparkDatasetOutputFormat.HCONF_ATTR_OUTPUT_DATASET, datasetName);
hConf.setClass(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, SparkDatasetOutputFormat.class, OutputFormat.class);
return hConf;
}

/**
* Returns the value of the given argument key as a {@code String[]}.
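* <p>
* For example, a runtime argument {@code "input.paths" -> "path1 path2"} (key and values hypothetical)
* yields {@code ["path1", "path2"]}.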
*
* @param argsKey the key of the argument to look up
* @return a {@code String[]} of the whitespace-separated values in the order they were supplied, or an
*         empty array if the key is absent
*/
@Override
public String[] getRuntimeArguments(String argsKey) {
if (runtimeArguments.hasOption(argsKey)) {
return SPACES.split(runtimeArguments.getOption(argsKey).trim());
} else {
LOG.warn("Argument with key {} not found in Runtime Arguments", argsKey);
return NO_ARGS;
}
}

@Override
public SparkSpecification getSpecification() {
return spec;
}

@Override
public long getLogicalStartTime() {
return logicalStartTime;
}

@Override
public Map<String, String> getRuntimeArguments() {
ImmutableMap.Builder<String, String> arguments = ImmutableMap.builder();
for (Map.Entry<String, String> runtimeArgument : runtimeArguments) {
arguments.put(runtimeArgument);
}
return arguments.build();
}
}