// Source code of org.archive.mapred.TaskLogInputFormat$TaskLogSplit

package org.archive.mapred;


import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FilenameFilter;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayList;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.LineRecordReader;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.net.DNS;


/**
 * An InputFormat for {@link TaskLog} <code>userlogs</code>.
 * 
 * The number of splits is equal to the <code>numTasks</code> count passed
 * to {@link #getSplits(JobConf, int)}, usually the count of map tasks.  Per
 * split, the local logs associated with the configured jobid are streamed.
 * Records are a single log line with a key of form 'HOST:TASK_ID:LINE_NO'.
 * Note, if a split (host) does not have logs corresponding to the passed
 * jobid -- say, the host is new to the cluster -- then the task fails and is
 * scheduled elsewhere.  This can make for double-counting of logs if the new
 * host has already had its logs processed by a previous task.  This makes for
 * fuzzy analysis: good for figuring if errors or general rate of problems
 * but bad for precision reporting.
 * 
 * <p>Before use, client must set jobid and optionally the userlogs subdir
 * -- whether stdout stderr, or syslog -- and whether to look at map or
 * reduce task logs or at both. See {@link #setJobid(JobConf, int)},
 * {@link #setLogfilter(JobConf, org.apache.hadoop.mapred.TaskLog.LogFilter)},
 * and {@link #setWhichTaskLogs(JobConf,
 * org.apache.hadoop.mapred.TaskLogInputFormat.TaskLogs)}.
 *
 * <p>This is an amended version of the TaskLogInputFormat that is part of
 * hadoop-1199.
 * 
 * @author stack
 */
public class TaskLogInputFormat implements InputFormat {
  private final Log LOG = LogFactory.getLog(this.getClass().getName());
  
  /**
   * Used for formatting the id numbers
   * TODO: Replace with JobTracker reference.
   */
  private static NumberFormat idFormat = NumberFormat.getInstance();
  static {
    idFormat.setMinimumIntegerDigits(4);
    idFormat.setGroupingUsed(false);
  }




  private static ArchiveTaskLog.LogFilter logFilter =
      ArchiveTaskLog.LogFilter.SYSLOG;


  public static enum TaskLogs {MAP, REDUCE, BOTH}
  private static TaskLogs whichTaskLogs = TaskLogs.MAP;


  private static final String KEY_BASE = "mapred.inputformat.tasklog.";
  private static final String JOBID_KEY = KEY_BASE + "jobid";
  private static final String LOGFILTER_KEY = KEY_BASE + "logfilter";
  private static final String WHICHTASK_KEY = "KEY_BASE" + "task";


  public static void setJobid(final JobConf job, final int id) {
    job.setInt(JOBID_KEY, id);
  }


  public static int getJobid(final JobConf job) {
    return job.getInt(JOBID_KEY, -1);
  }


  public static void setLogfilter(final JobConf job,
        final ArchiveTaskLog.LogFilter lf) {
    job.set(LOGFILTER_KEY, lf);
  }


  public static ArchiveTaskLog.LogFilter getLogfilter(final JobConf job) {
    return (ArchiveTaskLog.LogFilter) job.getObject(LOGFILTER_KEY);
  }


  public static void setWhichTaskLogs(final JobConf job, final TaskLogs tl) {
    job.setObject(WHICHTASK_KEY, tl);
  }


  public static TaskLogs getWhichTaskLogs(final JobConf job) {
    return (TaskLogs) job.getObject(WHICHTASK_KEY);
  }


  public RecordReader getRecordReader(InputSplit split, JobConf job,
      Reporter reporter) throws IOException {
    final int jobid = getJobid(job);
    if (jobid <= 0) {
      throw new IOException("Set jobid");
    }


    // Filter for the userlogs directory.  Returns list of map, reduce, or both
    // map and reduce tasks.
    final FilenameFilter fnf = new FilenameFilter() {
      private final String prefix = "task_" +
          // JobTracker.idFormat.format(jobid) + "_" +
          // FIX: JobTracker.idFormat.format(jobid) + "_" +
          idFormat.format(jobid) + "_" +
          ((whichTaskLogs == TaskLogs.MAP)? "m":
            (whichTaskLogs == TaskLogs.REDUCE)? "r":
            "" /* Both map and reduce */);


      public boolean accept(File dir, String name) {
        return name.startsWith(prefix);
      }
    };


    File logDir = ArchiveTaskLog.LOG_DIR;
    if (logDir == null) {
      throw new IOException("Set hadoop.log.dir system property");
    }
    if (!logDir.exists()) {
      throw new FileNotFoundException(logDir.getAbsolutePath());
    }
    final File[] tds = logDir.listFiles(fnf);
    if (tds.length <= 0) {
      throw new FileNotFoundException("No log dirs found for jobid " + jobid);
    }
    
    // Finally, get this hosts's name to add to key.
    final String localHostname =
      DNS.getDefaultHost(job.get("mapred.tasktracker.dns.interface","default"),
          job.get("mapred.tasktracker.dns.nameserver","default"));




    return new RecordReader() {
      private String hostname = localHostname;
      private File[] userlogsDirs = tds;
      private int userlogsDirsIndex = 0;
      private LineRecordReader lrr = null;
      private long accumulatingPosition = 0;
      private String currentTask = null;
      
      public void close() throws IOException {
        if (this.lrr != null) {
          this.lrr.close();
          this.lrr = null;
        }
      }


      public WritableComparable createKey() {
        return new Text();
      }


      public Writable createValue() {
        // Values are same as those made by LineRecordReader#createValue().
        return new Text();
      }


      public long getPos() throws IOException {
        return this.accumulatingPosition +
          ((this.lrr != null)? this.lrr.getPos(): 0);
      }


      public float getProgress() throws IOException {
        return (this.lrr == null)? 0.0f:
          ((this.userlogsDirsIndex - 1 + this.lrr.getProgress()) /
            (float)this.userlogsDirs.length);
      }
      
      public boolean getNextLine(Writable key, Writable value)
      throws IOException {
        Writable lrrKey = this.lrr.createKey();
        boolean result = this.lrr.next(lrrKey, value);
        if (result) {
          // Amend key to include host and current task.
          ((Text)key).set(this.hostname + ":" + this.currentTask + ":" +
              ((LongWritable)lrrKey).toString());
        }
        return result;
      }


      public boolean next(Writable key, Writable value) throws IOException {
        if (this.lrr != null) {
          if (getNextLine(key, value)) {
            return true;
          }
          // Else, no more lines in this LineRecordReader. Close
          // and try and get another.
          this.accumulatingPosition += this.lrr.getPos();
          this.lrr.close();
          this.lrr = null;
        }
        if (this.userlogsDirsIndex >= this.userlogsDirs.length) {
          // There are no more userlogs dirs. We are done.
          return false;
        }
        this.currentTask =
            this.userlogsDirs[this.userlogsDirsIndex++].getName();
        // For now, hardcoded to read from syslog.
        ArchiveTaskLog.Reader tlr =
            new ArchiveTaskLog.Reader(this.currentTask);
        this.lrr = new LineRecordReader(tlr.getInputStream(), 0,
          tlr.getTotalLogSize());
        return getNextLine(key, value);
      }
    };
  }


  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    ArrayList<InputSplit> is = new ArrayList<InputSplit>(numSplits);
    for (int i = 0; i < numSplits; i++) {
      is.add(new TaskLogSplit());
    }
    return is.toArray(new InputSplit[is.size()]);
  }


  public void validateInput(JobConf job) throws IOException {
    // Nothing to validate.
  }


  public static class TaskLogSplit implements InputSplit {
    public TaskLogSplit() {
      super();
    }


    public long getLength() throws IOException {
      // Return '1' for '1' host's logs.
      return 1;
    }


    public String[] getLocations() throws IOException {
      return new String[0];
    }


    public void readFields(DataInput in) throws IOException {
      // Nothing to serialize.
    }


    public void write(DataOutput out) throws IOException {
      // Nothing to serialize.
    }
  }


  /**
   * Runs a mapreduce job that uses {@link TaskLogInputFormat} reading
   * {@link TaskLog} userlog directories.
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 2) {
      System.err.println("Usage: TaskLogInputFormat <output> <jobid>");
      System.exit(1);
    }


    JobConf job = new JobConf(TaskLogInputFormat.class);


    job.setInputFormat(TaskLogInputFormat.class);
    TaskLogInputFormat.setJobid(job, Integer.parseInt(args[1]));
    TaskLogInputFormat.setLogfilter(job, ArchiveTaskLog.LogFilter.SYSLOG);
    TaskLogInputFormat.setWhichTaskLogs(job, TaskLogInputFormat.TaskLogs.MAP);
    job.setOutputPath(new Path(args[0]));
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);


    JobClient.runJob(job);
  }
}
// Source code of org.archive.mapred.TaskLogInputFormat$TaskLogSplit
//
// Related classes of org.archive.mapred.TaskLogInputFormat$TaskLogSplit