Source Code of com.linkedin.camus.etl.kafka.CamusJob

package com.linkedin.camus.etl.kafka;

import com.linkedin.camus.etl.kafka.common.DateUtils;
import com.linkedin.camus.etl.kafka.common.EtlCounts;
import com.linkedin.camus.etl.kafka.common.EtlKey;
import com.linkedin.camus.etl.kafka.common.ExceptionWritable;
import com.linkedin.camus.etl.kafka.common.Source;
import com.linkedin.camus.etl.kafka.mapred.EtlInputFormat;
import com.linkedin.camus.etl.kafka.mapred.EtlMapper;
import com.linkedin.camus.etl.kafka.mapred.EtlMultiOutputFormat;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;
import java.util.TreeMap;
import java.util.Comparator;
import java.util.Arrays;
import java.util.regex.Pattern;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobID;
import org.apache.hadoop.mapred.TIPStatus;
import org.apache.hadoop.mapred.TaskReport;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.apache.log4j.xml.DOMConfigurator;
import org.codehaus.jackson.map.DeserializationConfig;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.type.TypeReference;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormatter;

public class CamusJob extends Configured implements Tool {

  public static final String ETL_EXECUTION_BASE_PATH = "etl.execution.base.path";
  public static final String ETL_EXECUTION_HISTORY_PATH = "etl.execution.history.path";
  public static final String ETL_COUNTS_PATH = "etl.counts.path";
  public static final String ETL_KEEP_COUNT_FILES = "etl.keep.count.files";
  public static final String ETL_BASEDIR_QUOTA_OVERIDE = "etl.basedir.quota.overide";
  public static final String ETL_EXECUTION_HISTORY_MAX_OF_QUOTA = "etl.execution.history.max.of.quota";
  public static final String ETL_FAIL_ON_ERRORS = "etl.fail.on.errors";
  public static final String ZK_AUDIT_HOSTS = "zookeeper.audit.hosts";
  public static final String KAFKA_MONITOR_TIER = "kafka.monitor.tier";
  public static final String CAMUS_MESSAGE_ENCODER_CLASS = "camus.message.encoder.class";
  public static final String BROKER_URI_FILE = "brokers.uri";
  public static final String POST_TRACKING_COUNTS_TO_KAFKA = "post.tracking.counts.to.kafka";
  public static final String KAFKA_FETCH_REQUEST_MAX_WAIT = "kafka.fetch.request.max.wait";
  public static final String KAFKA_FETCH_REQUEST_MIN_BYTES = "kafka.fetch.request.min.bytes";
  public static final String KAFKA_FETCH_REQUEST_CORRELATION_ID = "kafka.fetch.request.correlationid";
  public static final String KAFKA_CLIENT_NAME = "kafka.client.name";
  public static final String KAFKA_FETCH_BUFFER_SIZE = "kafka.fetch.buffer.size";
  public static final String KAFKA_BROKERS = "kafka.brokers";
  public static final String KAFKA_HOST_URL = "kafka.host.url";
  public static final String KAFKA_HOST_PORT = "kafka.host.port";
  public static final String KAFKA_TIMEOUT_VALUE = "kafka.timeout.value";
  public static final String LOG4J_CONFIGURATION = "log4j.configuration";
  private static Logger log;

  private final Properties props;
 
  private DateTimeFormatter dateFmt = DateUtils.getDateTimeFormatter(
      "YYYY-MM-dd-HH-mm-ss", DateTimeZone.UTC);

  public CamusJob() throws IOException {
    this(new Properties());
  }

  public CamusJob(Properties props) throws IOException {
    this(props, org.apache.log4j.Logger.getLogger(CamusJob.class));
  }
 
  public CamusJob(Properties props, Logger log) throws IOException {
    this.props = props;
    CamusJob.log = log;
  }

  private static HashMap<String, Long> timingMap = new HashMap<String, Long>();

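  // Timing helpers backed by the shared timingMap. startTiming subtracts the current time for a
  // named phase and stopTiming adds it back, so after a matched start/stop pair the map holds the
  // accumulated elapsed milliseconds for that phase. setTime stores the current timestamp for a
  // key that has not been set yet; createReport reads the "hadoop_start" entry as such a marker.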
  public static void startTiming(String name) {
    timingMap.put(name,
        (timingMap.get(name) == null ? 0 : timingMap.get(name))
            - System.currentTimeMillis());
  }

  public static void stopTiming(String name) {
    timingMap.put(name,
        (timingMap.get(name) == null ? 0 : timingMap.get(name))
            + System.currentTimeMillis());
  }

  public static void setTime(String name) {
    timingMap.put(name,
        (timingMap.get(name) == null ? 0 : timingMap.get(name))
            + System.currentTimeMillis());
  }
 
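  // Builds the Hadoop Job: ensures a Configuration exists, copies the job properties into it via
  // populateConf, and applies the optional "camus.job.name" property as the job name.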
  private Job createJob(Properties props) throws IOException {
    if (getConf() == null) {
      setConf(new Configuration());
    }

    populateConf(props, getConf(), log);

    Job job = new Job(getConf());
    job.setJarByClass(CamusJob.class);

    if (job.getConfiguration().get("camus.job.name") != null) {
      job.setJobName(job.getConfiguration().get("camus.job.name"));
    } else {
      job.setJobName("Camus Job");
    }

    return job;
  }

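  // Copies every property into the Hadoop Configuration, then adds jars found under
  // "hdfs.default.classpath.dir" and any comma-separated "hadoop.external.jarFiles" entries to the
  // distributed cache classpath, skipping jars whose names match a pattern in
  // "cache.jar.filter.list".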
  public static void populateConf(Properties props, Configuration conf, Logger log) throws IOException {
    for(Object key : props.keySet())
    {
      conf.set(key.toString(), props.getProperty(key.toString()));
    }

    FileSystem fs = FileSystem.get(conf);

    String hadoopCacheJarDir = conf.get(
        "hdfs.default.classpath.dir", null);
   
    List<Pattern> jarFilterString = new ArrayList<Pattern>();
   
    for (String str : Arrays.asList(conf.getStrings("cache.jar.filter.list", new String[0]))){
      jarFilterString.add(Pattern.compile(str));
    }
   
    if (hadoopCacheJarDir != null) {
      FileStatus[] status = fs.listStatus(new Path(hadoopCacheJarDir));

      if (status != null) {
        for (int i = 0; i < status.length; ++i) {
          if (!status[i].isDir()) {
            log.info("Adding Jar to Distributed Cache Archive File:"
                + status[i].getPath());
            boolean filterMatch = false;
            for (Pattern p : jarFilterString){
              if (p.matcher(status[i].getPath().getName()).matches()){
                filterMatch = true;
                break;
              }
            }
           
            if (!filterMatch) {
              DistributedCache.addFileToClassPath(status[i].getPath(), conf, fs);
            }
          }
        }
      } else {
        log.info("hdfs.default.classpath.dir " + hadoopCacheJarDir
            + " is empty or does not exist.");
      }
    }

    // Adds External jars to hadoop classpath
    String externalJarList = conf.get(
        "hadoop.external.jarFiles", null);
    if (externalJarList != null) {
      String[] jarFiles = externalJarList.split(",");
      for (String jarFile : jarFiles) {
        log.info("Adding external jar File:" + jarFile);
        boolean filterMatch = false;
        for (Pattern p : jarFilterString){
          if (p.matcher(new Path(jarFile).getName()).matches()){
            filterMatch = true;
            break;
          }
        }
       
        if (!filterMatch) {
          DistributedCache.addFileToClassPath(new Path(jarFile), conf, fs);
        }
      }
    }
  }

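  // Runs a single Camus execution: prunes old execution directories to stay within the HDFS quota,
  // seeds offsets from the most recent previous execution, launches the map-only Hadoop job,
  // posts tracking counts, logs any per-record errors, and moves the execution directory into the
  // history path.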
  public void run() throws Exception {

    startTiming("pre-setup");
    startTiming("total");
    Job job = createJob(props);
    if (getLog4jConfigure(job)) {
      DOMConfigurator.configure("log4j.xml");
    }
    FileSystem fs = FileSystem.get(job.getConfiguration());

    log.info("Dir Destination set to: "
        + EtlMultiOutputFormat.getDestinationPath(job));

    Path execBasePath = new Path(props.getProperty(ETL_EXECUTION_BASE_PATH));
    Path execHistory = new Path(
        props.getProperty(ETL_EXECUTION_HISTORY_PATH));

    if (!fs.exists(execBasePath)) {
      log.info("The execution base path does not exist. Creating the directory");
      fs.mkdirs(execBasePath);
    }
    if (!fs.exists(execHistory)) {
      log.info("The history base path does not exist. Creating the directory.");
      fs.mkdirs(execHistory);
    }

    // enforcing max retention on the execution directories to avoid
    // exceeding HDFS quota. retention is set to a percentage of available
    // quota.
    ContentSummary content = fs.getContentSummary(execBasePath);
    long limit = (long) (content.getQuota() * job.getConfiguration()
        .getFloat(ETL_EXECUTION_HISTORY_MAX_OF_QUOTA, (float) .5));
    limit = limit == 0 ? 50000 : limit;
   
    if (props.containsKey(ETL_BASEDIR_QUOTA_OVERIDE)){
      limit = Long.valueOf(props.getProperty(ETL_BASEDIR_QUOTA_OVERIDE));
    }

    long currentCount = content.getFileCount()
        + content.getDirectoryCount();

    FileStatus[] executions = fs.listStatus(execHistory);
    Arrays.sort(executions, new Comparator<FileStatus>() {
      public int compare(FileStatus f1, FileStatus f2) {
        return f1.getPath().getName().compareTo(f2.getPath().getName());
      }
    });
   
    // removes oldest directory until we get under required % of count
    // quota. Won't delete the most recent directory.
    for (int i = 0; i < executions.length - 1 && limit < currentCount; i++) {
      FileStatus stat = executions[i];
      log.info("removing old execution: " + stat.getPath().getName());
      ContentSummary execContent = fs.getContentSummary(stat.getPath());
      currentCount -= execContent.getFileCount() + execContent.getDirectoryCount();
      fs.delete(stat.getPath(), true);
    }
   
    // removing failed executions if we need room
    if (limit < currentCount){
      FileStatus[] failedExecutions = fs.listStatus(execBasePath, new PathFilter() {
       
        public boolean accept(Path path) {
          try {
            dateFmt.parseDateTime(path.getName());
            return true;
          } catch (IllegalArgumentException e){
            return false;
          }
        }
      });
     
      Arrays.sort(failedExecutions, new Comparator<FileStatus>() {
        public int compare(FileStatus f1, FileStatus f2) {
          return f1.getPath().getName().compareTo(f2.getPath().getName());
        }
      });
     
      for (int i = 0; i < failedExecutions.length && limit < currentCount; i++) {
        FileStatus stat = failedExecutions[i];
        log.info("removing failed execution: " + stat.getPath().getName());
        ContentSummary execContent = fs.getContentSummary(stat.getPath());
        currentCount -= execContent.getFileCount() + execContent.getDirectoryCount();
        fs.delete(stat.getPath(), true);
      }
    }

    // determining most recent execution and using as the starting point for
    // this execution
    if (executions.length > 0) {
      Path previous = executions[executions.length - 1].getPath();
      FileInputFormat.setInputPaths(job, previous);
      log.info("Previous execution: " + previous.toString());
    } else {
      log.info("No previous execution, all topics pulled from earliest available offset");
    }

    // creating new execution dir. offsets, error_logs, and count files will
    // be written to this directory. data is not written to the
    // output directory in a normal run, but instead written to the
    // appropriate date-partitioned subdir in camus.destination.path
    String executionDate = new DateTime().toString(dateFmt);
    Path newExecutionOutput = new Path(execBasePath, executionDate);
    FileOutputFormat.setOutputPath(job, newExecutionOutput);
    log.info("New execution temp location: "
        + newExecutionOutput.toString());

    EtlInputFormat.setLogger(log);
    job.setMapperClass(EtlMapper.class);
   
    job.setInputFormatClass(EtlInputFormat.class);
    job.setOutputFormatClass(EtlMultiOutputFormat.class);
    job.setNumReduceTasks(0);

    stopTiming("pre-setup");
    job.submit();
    job.waitForCompletion(true);

    // dump all counters
    Counters counters = job.getCounters();
    for (String groupName : counters.getGroupNames()) {
      CounterGroup group = counters.getGroup(groupName);
      log.info("Group: " + group.getDisplayName());
      for (Counter counter : group) {
        log.info(counter.getDisplayName() + ":\t" + counter.getValue());
      }
    }

    stopTiming("hadoop");
    startTiming("commit");

    // Send Tracking counts to Kafka
    sendTrackingCounts(job, fs, newExecutionOutput);

    Map<EtlKey, ExceptionWritable> errors = readErrors(fs, newExecutionOutput);

    // Print any potential errors encountered
    if (!errors.isEmpty()) {
      log.error("Errors encountered during job run:");
    }

    for (Entry<EtlKey, ExceptionWritable> entry : errors.entrySet()) {
      log.error(entry.getKey().toString());
      log.error(entry.getValue().toString());
    }

    Path newHistory = new Path(execHistory, executionDate);
    log.info("Moving execution to history : " + newHistory);
    fs.rename(newExecutionOutput, newHistory);

    log.info("Job finished");
    stopTiming("commit");
    stopTiming("total");
    createReport(job, timingMap);

    if (!job.isSuccessful()) {
      JobClient client = new JobClient(
          new JobConf(job.getConfiguration()));

      // Use the job's own id; the first task completion event may not exist if the job
      // failed before any task completed.
      for (TaskReport task : client.getMapTaskReports(JobID.downgrade(job.getJobID()))) {
        if (task.getCurrentStatus().equals(TIPStatus.FAILED)) {
          for (String s : task.getDiagnostics()) {
            System.err.println("task error: " + s);
          }
        }
      }
      throw new RuntimeException("hadoop job failed");
    }

    if (!errors.isEmpty()
        && props.getProperty(ETL_FAIL_ON_ERRORS, Boolean.FALSE.toString())
            .equalsIgnoreCase(Boolean.TRUE.toString())) {
      throw new RuntimeException("Camus saw errors, check stderr");
    }
  }

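  // Reads every error sequence file (EtlMultiOutputFormat.ERRORS_PREFIX) in the execution output
  // and returns the failed keys with their exceptions, each annotated with the file it came from.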
  public Map<EtlKey, ExceptionWritable> readErrors(FileSystem fs, Path newExecutionOutput)
      throws IOException {
    Map<EtlKey, ExceptionWritable> errors = new HashMap<EtlKey, ExceptionWritable>();

    for (FileStatus f : fs.listStatus(newExecutionOutput, new PrefixFilter(
        EtlMultiOutputFormat.ERRORS_PREFIX))) {
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), fs.getConf());

      String errorFrom = "\nError from file [" + f.getPath() + "]";

      EtlKey key = new EtlKey();
      ExceptionWritable value = new ExceptionWritable();

      while (reader.next(key, value)) {
        ExceptionWritable exceptionWritable = new ExceptionWritable(value.toString() + errorFrom);
        errors.put(new EtlKey(key), exceptionWritable);
      }
      reader.close();
    }

    return errors;
  }

  // Posts the tracking counts to Kafka
  public void sendTrackingCounts(JobContext job, FileSystem fs,
      Path newExecutionOutput) throws IOException, URISyntaxException {
    if (EtlMultiOutputFormat.isRunTrackingPost(job)) {
      FileStatus[] gstatuses = fs.listStatus(newExecutionOutput,
          new PrefixFilter("counts"));
      HashMap<String, EtlCounts> allCounts = new HashMap<String, EtlCounts>();
      for (FileStatus gfileStatus : gstatuses) {
        FSDataInputStream fdsis = fs.open(gfileStatus.getPath());

        BufferedReader br = new BufferedReader(new InputStreamReader(
            fdsis), 1048576);
        StringBuffer buffer = new StringBuffer();
        String temp = "";
        while ((temp = br.readLine()) != null) {
          buffer.append(temp);
        }
        // Close the reader (and the underlying stream) once the counts file is consumed.
        br.close();
        ObjectMapper mapper = new ObjectMapper();
        mapper.configure(
            DeserializationConfig.Feature.FAIL_ON_UNKNOWN_PROPERTIES,
            false);
        ArrayList<EtlCounts> countsObjects = mapper.readValue(
            buffer.toString(),
            new TypeReference<ArrayList<EtlCounts>>() {
            });

        for (EtlCounts count : countsObjects) {
          String topic = count.getTopic();
          if (allCounts.containsKey(topic)) {
            EtlCounts existingCounts = allCounts.get(topic);
            existingCounts
                .setEndTime(Math.max(
                    existingCounts.getEndTime(),
                    count.getEndTime()));
            existingCounts.setLastTimestamp(Math.max(
                existingCounts.getLastTimestamp(),
                count.getLastTimestamp()));
            existingCounts.setStartTime(Math.min(
                existingCounts.getStartTime(),
                count.getStartTime()));
            existingCounts.setFirstTimestamp(Math.min(
                existingCounts.getFirstTimestamp(),
                count.getFirstTimestamp()));
            existingCounts.setErrorCount(existingCounts
                .getErrorCount() + count.getErrorCount());
            existingCounts.setGranularity(count.getGranularity());
            existingCounts.setTopic(count.getTopic());
            for (Entry<String, Source> entry : count.getCounts()
                .entrySet()) {
              Source source = entry.getValue();
              if (existingCounts.getCounts().containsKey(
                  source.toString())) {
                Source old = existingCounts.getCounts().get(
                    source.toString());
                old.setCount(old.getCount() + source.getCount());
                existingCounts.getCounts().put(old.toString(),
                    old);
              } else {
                existingCounts.getCounts().put(
                    source.toString(), source);
              }
              allCounts.put(topic, existingCounts);
            }
          } else {
            allCounts.put(topic, count);
          }
        }
      }

      for (FileStatus countFile : fs.listStatus(newExecutionOutput,
          new PrefixFilter("counts"))) {
        if (props.getProperty(ETL_KEEP_COUNT_FILES, "false").equals(
            "true")) {
          fs.rename(countFile.getPath(),
              new Path(props.getProperty(ETL_COUNTS_PATH),
                  countFile.getPath().getName()));
        } else {
          fs.delete(countFile.getPath(), true);
        }
      }

      String brokerList = getKafkaBrokers(job);
      for (EtlCounts finalCounts : allCounts.values()) {
        finalCounts.postTrackingCountToKafka(job.getConfiguration(),
            props.getProperty(KAFKA_MONITOR_TIER), brokerList);
      }
    }
  }

  /**
   * Creates a diagnostic report mostly focused on timing breakdowns. Useful
   * for determining where to optimize.
   *
   * @param job
   * @param timingMap
   * @throws IOException
   */
  private void createReport(Job job, Map<String, Long> timingMap)
      throws IOException {
    StringBuilder sb = new StringBuilder();

    sb.append("***********Timing Report*************\n");

    sb.append("Job time (seconds):\n");

    double preSetup = timingMap.get("pre-setup") / 1000.0;
    double getSplits = timingMap.get("getSplits") / 1000.0;
    double hadoop = timingMap.get("hadoop") / 1000.0;
    double commit = timingMap.get("commit") / 1000.0;
    double total = timingMap.get("total") / 1000.0;

    sb.append(String.format("    %12s %6.1f (%s)\n", "pre setup", preSetup,
        NumberFormat.getPercentInstance().format(preSetup / total)
            .toString()));
    sb.append(String.format("    %12s %6.1f (%s)\n", "get splits",
        getSplits,
        NumberFormat.getPercentInstance().format(getSplits / total)
            .toString()));
    sb.append(String.format("    %12s %6.1f (%s)\n", "hadoop job", hadoop,
        NumberFormat.getPercentInstance().format(hadoop / total)
            .toString()));
    sb.append(String.format("    %12s %6.1f (%s)\n", "commit", commit,
        NumberFormat.getPercentInstance().format(commit / total)
            .toString()));

    int minutes = (int) total / 60;
    int seconds = (int) total % 60;

    sb.append(String.format("Total: %d minutes %d seconds\n", minutes,
        seconds));

    JobClient client = new JobClient(new JobConf(job.getConfiguration()));

    TaskReport[] tasks = client.getMapTaskReports(JobID.downgrade(job
        .getJobID()));

    double min = Long.MAX_VALUE, max = 0, mean = 0;
    double minRun = Long.MAX_VALUE, maxRun = 0, meanRun = 0;
    long totalTaskTime = 0;
    TreeMap<Long, List<TaskReport>> taskMap = new TreeMap<Long, List<TaskReport>>();

    for (TaskReport t : tasks) {
      long wait = t.getStartTime() - timingMap.get("hadoop_start");
      min = wait < min ? wait : min;
      max = wait > max ? wait : max;
      mean += wait;

      long runTime = t.getFinishTime() - t.getStartTime();
      totalTaskTime += runTime;
      minRun = runTime < minRun ? runTime : minRun;
      maxRun = runTime > maxRun ? runTime : maxRun;
      meanRun += runTime;

      if (!taskMap.containsKey(runTime)) {
        taskMap.put(runTime, new ArrayList<TaskReport>());
      }
      taskMap.get(runTime).add(t);
    }

    mean /= tasks.length;
    meanRun /= tasks.length;

    // convert to seconds
    min /= 1000;
    max /= 1000;
    mean /= 1000;
    minRun /= 1000;
    maxRun /= 1000;
    meanRun /= 1000;

    sb.append("\nHadoop job task times (seconds):\n");
    sb.append(String.format("    %12s %6.1f\n", "min", minRun));
    sb.append(String.format("    %12s %6.1f\n", "mean", meanRun));
    sb.append(String.format("    %12s %6.1f\n", "max", maxRun));
    sb.append(String.format("    %12s %6.1f/%.1f = %.2f\n", "skew",
        meanRun, maxRun, meanRun / maxRun));

    sb.append("\nTask wait time (seconds):\n");
    sb.append(String.format("    %12s %6.1f\n", "min", min));
    sb.append(String.format("    %12s %6.1f\n", "mean", mean));
    sb.append(String.format("    %12s %6.1f\n", "max", max));

    CounterGroup totalGrp = job.getCounters().getGroup("total");

    long decode = totalGrp.findCounter("decode-time(ms)").getValue();
    long request = totalGrp.findCounter("request-time(ms)").getValue();
    long map = totalGrp.findCounter("mapper-time(ms)").getValue();
    long mb = totalGrp.findCounter("data-read").getValue();

    long other = totalTaskTime - map - request - decode;

    sb.append("\nHadoop task breakdown:\n");
    sb.append(String.format("    %12s %s\n", "kafka", NumberFormat
        .getPercentInstance().format(request / (double) totalTaskTime)));
    sb.append(String.format("    %12s %s\n", "decode", NumberFormat
        .getPercentInstance().format(decode / (double) totalTaskTime)));
    sb.append(String.format("    %12s %s\n", "map output", NumberFormat
        .getPercentInstance().format(map / (double) totalTaskTime)));
    sb.append(String.format("    %12s %s\n", "other", NumberFormat
        .getPercentInstance().format(other / (double) totalTaskTime)));

    sb.append(String.format("\n%16s %s\n", "Total MB read:",
        mb / 1024 / 1024));

    log.info(sb.toString());
  }

  /**
   * Path filter that filters based on prefix
   */
  private class PrefixFilter implements PathFilter {
    private final String prefix;

    public PrefixFilter(String prefix) {
      this.prefix = prefix;
    }

    public boolean accept(Path path) {
      return path.getName().startsWith(prefix);
    }
  }

  public static void main(String[] args) throws Exception {
    CamusJob job = new CamusJob();
    ToolRunner.run(job, args);
  }

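  // Tool entry point: parses -p (properties resource on the classpath), -P (external properties
  // file), and -D property=value overrides into the job properties, then delegates to run().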
  @SuppressWarnings("static-access")
  @Override
  public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption("p", true, "properties filename from the classpath");
    options.addOption("P", true, "external properties filename");

    options.addOption(OptionBuilder.withArgName("property=value")
        .hasArgs(2).withValueSeparator()
        .withDescription("use value for given property").create("D"));

    CommandLineParser parser = new PosixParser();
    CommandLine cmd = parser.parse(options, args);

    if (!(cmd.hasOption('p') || cmd.hasOption('P'))) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp("CamusJob.java", options);
      return 1;
    }

    if (cmd.hasOption('p')) {
      props.load(this.getClass().getClassLoader().getResourceAsStream(
          cmd.getOptionValue('p')));
    }

    if (cmd.hasOption('P')) {
      File file = new File(cmd.getOptionValue('P'));
      FileInputStream fStream = new FileInputStream(file);
      props.load(fStream);
    }

    props.putAll(cmd.getOptionProperties("D"));

    run();
    return 0;
  }

  // Temporarily adding all Kafka parameters here
  public static boolean getPostTrackingCountsToKafka(Job job) {
    return job.getConfiguration().getBoolean(POST_TRACKING_COUNTS_TO_KAFKA,
        true);
  }

  public static int getKafkaFetchRequestMinBytes(JobContext context) {
    return context.getConfiguration().getInt(KAFKA_FETCH_REQUEST_MIN_BYTES,
        1024);
  }

  public static int getKafkaFetchRequestMaxWait(JobContext job) {
    return job.getConfiguration()
        .getInt(KAFKA_FETCH_REQUEST_MAX_WAIT, 1000);
  }

  public static String getKafkaBrokers(JobContext job) {
    String brokers = job.getConfiguration().get(KAFKA_BROKERS);
    if (brokers == null) {
      brokers = job.getConfiguration().get(KAFKA_HOST_URL);
      if (brokers != null) {
        log.warn("The configuration properties " + KAFKA_HOST_URL + " and " +
          KAFKA_HOST_PORT + " are deprecated. Please switch to using " + KAFKA_BROKERS);
        return brokers + ":" + job.getConfiguration().getInt(KAFKA_HOST_PORT, 10251);
      }
    }
    return brokers;
  }

  public static int getKafkaFetchRequestCorrelationId(JobContext job) {
    return job.getConfiguration().getInt(
        KAFKA_FETCH_REQUEST_CORRELATION_ID, -1);
  }

  public static String getKafkaClientName(JobContext job) {
    return job.getConfiguration().get(KAFKA_CLIENT_NAME);
  }

  public static String getKafkaFetchRequestBufferSize(JobContext job) {
    return job.getConfiguration().get(KAFKA_FETCH_BUFFER_SIZE);
  }

  public static int getKafkaTimeoutValue(JobContext job) {
    return job.getConfiguration().getInt(KAFKA_TIMEOUT_VALUE, 30000);
  }

  public static int getKafkaBufferSize(JobContext job) {
    return job.getConfiguration().getInt(KAFKA_FETCH_BUFFER_SIZE,
        1024 * 1024);
  }

  public static boolean getLog4jConfigure(JobContext job) {
    return job.getConfiguration().getBoolean(LOG4J_CONFIGURATION, false);
  }
}
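
Usage sketch: CamusJob is normally launched through Hadoop's ToolRunner with a properties file supplied via -P (external file) or -p (classpath resource), plus optional -D overrides, as parsed in run(String[]) above. The snippet below is a minimal, hypothetical launcher; the properties file path and the -D values are placeholders, and the file is assumed to define the etl.* paths and kafka.* settings this class reads.

// Minimal, hypothetical launcher for CamusJob. The properties file path and the -D
// overrides below are placeholders; the file is assumed to define the etl.* paths and
// kafka.* settings that CamusJob reads.
import org.apache.hadoop.util.ToolRunner;

import com.linkedin.camus.etl.kafka.CamusJob;

public class CamusJobLauncher {
  public static void main(String[] args) throws Exception {
    String[] camusArgs = new String[] {
        "-P", "/etc/camus/camus.properties",                // external properties file (placeholder path)
        "-Detl.execution.base.path=/camus/exec",            // example -D override (placeholder value)
        "-Detl.execution.history.path=/camus/exec/history"  // example -D override (placeholder value)
    };
    // ToolRunner invokes CamusJob.run(String[]), which loads the properties and calls run().
    int exitCode = ToolRunner.run(new CamusJob(), camusArgs);
    System.exit(exitCode);
  }
}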