Package org.goldenorb

Source Code of org.goldenorb.JobManager

/**
* Licensed to Ravel, Inc. under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  Ravel, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.goldenorb;

import java.io.IOException;
import java.util.ArrayList;

import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.InvalidJobConfException;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.ZooKeeper;
import org.goldenorb.conf.OrbConfigurable;
import org.goldenorb.conf.OrbConfiguration;
import org.goldenorb.event.OrbCallback;
import org.goldenorb.event.OrbEvent;
import org.goldenorb.event.OrbExceptionEvent;
import org.goldenorb.event.job.JobDeathEvent;
import org.goldenorb.jet.OrbTrackerMember;
import org.goldenorb.jet.PartitionRequest;
import org.goldenorb.util.ResourceAllocator;
import org.goldenorb.zookeeper.OrbZKFailure;
import org.goldenorb.zookeeper.ZookeeperUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* JobManager is created by the lead OrbTracker to aid in putting jobs-in-the-queue and pulling
* them into jobs-in-progress. It tells the OrbTrackerMembers in the LeaderGroup to launch their
* partitions. Then it watches the jobs (via heart beat field in ZooKeeper) to make sure they're
* still running,  
*
* @param <M> The class of type that JobManager will actually manage
*/
public class JobManager<M extends OrbTrackerMember> implements OrbConfigurable {
 
  private final Logger logger = LoggerFactory.getLogger(JobManager.class);
 
  private String basePath;
  private String jobQueuePath;
  private String jobsInProgressPath;
 
  private OrbCallback orbCallback;
  private OrbConfiguration orbConf;
  private ZooKeeper zk;
  private ResourceAllocator<M> resourceAllocator;
  private Collection<M> orbTrackerMembers;
 
  private JobsInQueueWatcher jobsInQueueWatcher = new JobsInQueueWatcher();
  private SortedMap<String,OrbJob> jobs = new TreeMap<String,OrbJob>();
  private Set<String> activeJobs = new HashSet<String>();
 
  private boolean activeManager = true;
 
/**
* Constructor
*
* @param  OrbCallback orbCallback
* @param  OrbConfiguration orbConf
* @param  ZooKeeper zk
* @param  ResourceAllocator<M> resourceAllocator
* @param  Collection<M> orbTrackers
*/
  public JobManager(OrbCallback orbCallback,
                    OrbConfiguration orbConf,
                    ZooKeeper zk,
                    ResourceAllocator<M> resourceAllocator,
                    Collection<M> orbTrackers) {
    logger.info("Initializing JobManager");
   
    this.orbConf = orbConf;
    this.orbCallback = orbCallback;
    this.zk = zk;
    this.resourceAllocator = resourceAllocator;
    this.orbTrackerMembers = orbTrackers;
   
    basePath = OrbTracker.ZK_BASE_PATH + "/" + orbConf.getOrbClusterName();
    jobQueuePath = basePath + "/JobQueue";
    jobsInProgressPath = basePath + "/JobsInProgress";
   
    buildJobManagerPaths();
    getJobsInQueue();
  }
 
/**
* Return the jobTries
*/
  public int getJobTries(String jobNumber) {
    synchronized (jobs) {
      if (jobs.containsKey(jobNumber)) {
        return jobs.get(jobNumber).getTries();
      } else {
        return -1;
      }
    }
  }
 
/**
* Return the obActive
*/
  public boolean isJobActive(String jobNumber) {
    synchronized (activeJobs) {
      return activeJobs.contains(jobNumber);
    }
  }
 
/**
* Return the jobsInQueue
*/
  private void getJobsInQueue() {
    logger.info("getting jobs in queue.");
    synchronized (jobs) {
      List<String> jobQueueChildren = null;
      try {
        jobQueueChildren = zk.getChildren(jobQueuePath, jobsInQueueWatcher);
      } catch (KeeperException e) {
        fireEvent(new OrbExceptionEvent(e));
      } catch (InterruptedException e) {
        fireEvent(new OrbExceptionEvent(e));
      }
      List<String> jobsToRemove = new ArrayList<String>();
      for (String jobPath : jobs.keySet()) {
        if (!jobQueueChildren.contains(jobPath)) {
          jobsToRemove.add(jobPath);
          // Either a job has completed or been removed by someone else this should fire an event.
          // This should really not occur since it should only be removed by the JobManager itself.
          // In reality does an event really even need to be thrown?
        }
      }
      for (String job : jobsToRemove) {
        logger.debug("Removing job: " + job);
        jobs.remove(job);
        activeJobs.remove(job);
      }
      for (String jobPath : jobQueueChildren) {
        OrbConfiguration jobConf;
        try {
          jobConf = (OrbConfiguration) ZookeeperUtils.getNodeWritable(zk, jobQueuePath + "/" + jobPath,
            OrbConfiguration.class, orbConf);
          if (jobConf != null) {
            if (!jobs.containsKey(jobPath)) {
              logger.debug("Adding job: " + jobPath);
              jobs.put(jobPath, new OrbJob(jobPath, jobConf));
              // Here we have a new job--once again an event should be fired.
              // Although I am not sure that an event really needs to be fired at this point. We will see.
            }
          } else {
            logger.debug("Job is not a valid job.");
          }
        } catch (OrbZKFailure e) {
          fireEvent(new OrbExceptionEvent(e));
        }
      }
    }
    tryToLaunchJob();
  }
 
/**
*
*/
  private void tryToLaunchJob() {
    synchronized (jobs) {
      if (!jobs.isEmpty()) {
        for (OrbJob job : jobs.values()) {
          logger.debug("Active Jobs: " + activeJobs);
          if (!activeJobs.contains(job.getJobNumber())) {
            if (resourcesAvailable(job)) {
              launchJob(job);
            }
          }
        }
      }
    }
  }
 
/**
*
* @param  OrbJob job
*/
  private void launchJob(OrbJob job) {
    try {
      ZookeeperUtils.notExistCreateNode(zk, jobsInProgressPath + "/" + job.getJobNumber());
      ZookeeperUtils.notExistCreateNode(zk, jobsInProgressPath + "/" + job.getJobNumber()
                                            + "/OrbPartitionLeaderGroup");
      ZookeeperUtils.notExistCreateNode(zk, jobsInProgressPath + "/" + job.getJobNumber() + "/messages");
      ZookeeperUtils.tryToCreateNode(zk, jobsInProgressPath + "/" + job.getJobNumber()
                                         + "/messages/heartbeat", new LongWritable(0), CreateMode.PERSISTENT);
     
      // allocate resources and if enough, start the job
      logger.info("checking for available OrbTracker resources");
      Map<M,Integer[]> assignments = null;
      try {
        assignments = resourceAllocator.assignResources(job.getOrbConf());
      } catch (InvalidJobConfException e) {
        logger.error(e.getMessage());
      }
      logger.info("Starting Job");
      logger.info("********** Job {} started: {}", job.getJobNumber(), new Date().getTime());
      if (assignments != null) {
        logger.info("Allocating partitions");
       
        int basePartitionID = 0;
        for (M tracker : orbTrackerMembers) {
          logger.debug("OrbTracker - " + tracker.getHostname() + ":" + tracker.getPort());
          Integer[] assignment = assignments.get(tracker);
          tracker.initProxy(getOrbConf());
         
          try {
            logger.debug("jobConf().getHDFSdistributedFiles(): {}", job.getOrbConf().getHDFSdistributedFiles());
            tracker.getRequiredFiles(job.getOrbConf());
          } catch (OrbZKFailure e) {
            logger.error("EXCEPTION : An OrbTrackerMember failed to copy files from HDFS to local machine");
            logger.error(e.getMessage());
            throw e;
          }
          PartitionRequest request = new PartitionRequest();
          request.setActivePartitions(assignment[ResourceAllocator.TRACKER_AVAILABLE]);
          request.setReservedPartitions(assignment[ResourceAllocator.TRACKER_RESERVED]);
          request.setJobID(job.getJobNumber());
          request.setBasePartitionID(basePartitionID);
         
          request.setJobConf(job.getOrbConf());
         
          basePartitionID += assignment[ResourceAllocator.TRACKER_AVAILABLE];
         
          logger.debug("requesting partitions");
          tracker.requestPartitions(request);
          logger.info(request.toString());
         
          JobStillActiveCheck jobStillActiveCheck = new JobStillActiveCheck(job);
          job.setJobStillActiveInterface(jobStillActiveCheck);
          new Thread(jobStillActiveCheck).start();
         
          activeJobs.add(job.getJobNumber());
          checkForDeathComplete(job);
          heartbeat(job);
        }
      } else {
        logger.error("not enough capacity for this job");
        jobComplete(job);
      }
    } catch (OrbZKFailure e) {
      e.printStackTrace();
      logger.error(e.getMessage());
      fireEvent(new OrbExceptionEvent(e));
    } //catch (IOException e) {
//      e.printStackTrace();
//      logger.error(e.getMessage());
//    }
  }
 
/**
*
* @param  OrbJob job
*/
  private void checkForDeathComplete(OrbJob job) throws OrbZKFailure {
    if (job.getDeathAndCompleteWatcher() == null) {
      job.setDeathAndCompleteWatcher(new DeathAndCompleteWatcher(job));
    }
    try {
      job.getDeathAndCompleteWatcher().restart();
      List<String> messages = zk.getChildren(jobsInProgressPath + "/" + job.getJobNumber() + "/messages",
        (Watcher) job.getDeathAndCompleteWatcher());
      if (messages.contains("death")) {
        jobDeath(job);
      }
      if (messages.contains("complete")) {
        jobComplete(job);
      }
    } catch (KeeperException e) {
      throw new OrbZKFailure(e);
    } catch (InterruptedException e) {
      throw new OrbZKFailure(e);
    }
   
  }
 
  public class DeathAndCompleteWatcher implements Killable, Watcher {
   
    private boolean active = true;
    private OrbJob job;
   
/**
* Constructor
*
* @param  OrbJob job
*/
    public DeathAndCompleteWatcher(OrbJob job) {
      logger.info("Creating DeathAndCompleteWatcher for: " + job.getJobNumber());
      this.job = job;
    }
   
/**
*
* @param  WatchedEvent event
*/
    @Override
    public void process(WatchedEvent event) {
      if (active && activeManager) {
        try {
          logger.debug("DeathAndCompleteWatcher processing event for: " + job.getJobNumber());
          checkForDeathComplete(job);
        } catch (OrbZKFailure e) {
          fireEvent(new OrbExceptionEvent(e));
        }
      }
    }
   
/**
*
*/
    public void kill() {
      active = false;
    }
   
/**
*
*/
    public void restart() {
      active = true;
    }
  }
 
  public class HeartbeatWatcher implements Killable, Watcher {
   
    private OrbJob job;
    private boolean active = true;
   
/**
* Constructor
*
* @param  OrbJob job
*/
    public HeartbeatWatcher(OrbJob job) {
      logger.debug("Creating HeartbeatWatcher for: " + job.getJobNumber());
      this.job = job;
    }
   
/**
*
* @param  WatchedEvent event
*/
    @Override
    public void process(WatchedEvent event) {
      if (active && activeManager) {
        try {
          logger.debug("HearbeatWatcher processing event for: " + job.getJobNumber());
          heartbeat(job);
        } catch (OrbZKFailure e) {
          fireEvent(new OrbExceptionEvent(e));
        }
      }
    }
   
/**
*
*/
    public void kill() {
      active = false;
    }
   
/**
*
*/
    public void restart() {
      active = true;
    }
  }
 
/**
*
* @param  OrbJob job
*/
  private void heartbeat(OrbJob job) throws OrbZKFailure {
    if (job.getHeartbeatWatcher() == null) {
      job.setHeartbeatWatcher(new HeartbeatWatcher(job));
    }
    job.getHeartbeatWatcher().restart();
    Long newHeartbeat = ((LongWritable) ZookeeperUtils.getNodeWritable(zk,
      jobsInProgressPath + "/" + job.getJobNumber() + "/messages/heartbeat", LongWritable.class, orbConf,
      (Watcher) job.getHeartbeatWatcher())).get();
    logger.debug("Getting new heartbeat for: " + job.getJobNumber() + " has new heartbeat: " + newHeartbeat);
    job.setHeartbeat(newHeartbeat);
  }
 
/**
*
* @param  OrbJob job
* @returns boolean
*/
  private boolean resourcesAvailable(OrbJob job) {
    // TODO what do we need to examine in order to actually check whether
    // resources are available?
    return true;
  }
 
/**
*
* @param  OrbJob job
*/
  private void removeJobFromQueue(OrbJob job) throws OrbZKFailure {
    ZookeeperUtils.deleteNodeIfEmpty(zk, jobQueuePath + "/" + job.getJobNumber());
  }
 
/**
*
* @param  OrbJob job
*/
  private void jobDeath(OrbJob job) throws OrbZKFailure {
    logger.info("jobDeath " + job.getJobNumber());
    synchronized (job) {
      fireEvent(new JobDeathEvent(job.getJobNumber()));
      job.getJobStillActiveInterface().kill();
      job.getDeathAndCompleteWatcher().kill();
      job.getHeartbeatWatcher().kill();
    }
   
    for(OrbTrackerMember orbTrackerMember: orbTrackerMembers){
      orbTrackerMember.killJob(job.getJobNumber());
    }
    logger.info("Shutting down partition instances");
    logger.info("Number of tries: " + job.getTries());
    if (job.getTries() > orbConf.getMaximumJobTries()) {
      ZookeeperUtils.recursiveDelete(zk, jobsInProgressPath + "/" + job.getJobNumber());
      ZookeeperUtils.deleteNodeIfEmpty(zk, jobsInProgressPath + "/" + job.getJobNumber());
      removeJobFromQueue(job);
    } else {
      ZookeeperUtils.recursiveDelete(zk, jobsInProgressPath + "/" + job.getJobNumber());
      ZookeeperUtils.deleteNodeIfEmpty(zk, jobsInProgressPath + "/" + job.getJobNumber());
      job.incrementTries();
      logger.info("Incrementing tries for: " + job.getJobNumber());
    }
    synchronized (activeJobs) {
      activeJobs.remove(job.getJobNumber());
      logger.info("Removing job: " + job.getJobNumber() + " from activeJobs.");
      // TODO tell the other OrbTrackers to kill the partitions associated with the job.
    }
    tryToLaunchJob();
  }
 
/**
*
* @param  OrbJob job
*/
  private void jobComplete(OrbJob job) throws OrbZKFailure {
    synchronized (job) {
      job.getJobStillActiveInterface().kill();
      job.getDeathAndCompleteWatcher().kill();
      job.getHeartbeatWatcher().kill();
    }
   
    for(OrbTrackerMember orbTrackerMember: orbTrackerMembers){
      orbTrackerMember.killJob(job.getJobNumber());
    }

    ZookeeperUtils.recursiveDelete(zk, jobsInProgressPath + "/" + job.getJobNumber());
    ZookeeperUtils.deleteNodeIfEmpty(zk, jobsInProgressPath + "/" + job.getJobNumber());
    removeJobFromQueue(job);
   
    logger.info("********** Job {} removed: {}", job.getJobNumber(), new Date().getTime());
    removeJobsHDFSfiles(job);
   
    tryToLaunchJob();
  }
 
  /**
   * Deletes the files in HDFS under the /Distributed directory for this job.
   * @param job
   */
  public void removeJobsHDFSfiles(OrbJob job) {
    Path[] hdfsPaths = job.getOrbConf().getHDFSdistributedFiles();
    if (hdfsPaths != null) {
      try {
        FileSystem fs = FileSystem.get(job.getOrbConf());
        for (Path path : hdfsPaths) {
          fs.delete(path, false);
          logger.info("Removed file " + path.toString() + " from HDFS because job " + job.getJobNumber() + " is complete");
        }
      } catch (IOException e) {
        logger.error("Exception occured while trying to remove files from HDFS for job " + job.getJobNumber());
        logger.error(e.getMessage());
        e.printStackTrace();
      }
     
    }
  }
 
  private class JobsInQueueWatcher implements Watcher {
/**
*
* @param  WatchedEvent event
*/
    @Override
    public void process(WatchedEvent event) {
      getJobsInQueue();
    }
  }
 
/**
*
* @param  OrbEvent orbEvent
*/
  private void fireEvent(OrbEvent orbEvent) {
    orbCallback.process(orbEvent);
  }
 
/**
*
*/
  private void buildJobManagerPaths() {
    try {
      ZookeeperUtils.notExistCreateNode(zk, jobQueuePath);
    } catch (OrbZKFailure e) {
      fireEvent(new OrbExceptionEvent(e));
    }
    try {
      ZookeeperUtils.notExistCreateNode(zk, jobsInProgressPath);
    } catch (OrbZKFailure e) {
      fireEvent(new OrbExceptionEvent(e));
    }
  }
 
  public class JobStillActiveCheck implements Killable, Runnable {
   
    private OrbJob job;
    private boolean active;
    private Long lastHeartbeat = -1L;
   
/**
* Constructor
*
* @param  OrbJob job
*/
    public JobStillActiveCheck(OrbJob job) {
      logger.info("Creating JobStillActiveChecker for: " + job.getJobNumber());
      this.job = job;
      active = true;
    }
   
/**
*
*/
    @Override
    public void run() {
      synchronized (this) {
        while (active && activeManager) {
          try {
            wait(orbConf.getJobHeartbeatTimeout());
          } catch (InterruptedException e) {
            fireEvent(new OrbExceptionEvent(e));
          }
          logger.debug("Checking heartbeat for: " + job.getJobNumber() + " Heartbeat is: "
                       + job.getHeartbeat());
          if (job.getHeartbeat() <= lastHeartbeat) {
            try {
              jobDeath(job);
            } catch (OrbZKFailure e) {
              fireEvent(new OrbExceptionEvent(e));
            }
          }
          lastHeartbeat = job.getHeartbeat();
        }
      }
    }
   
/**
* Return the ctive
*/
    public boolean isActive() {
      return active;
    }
   
/**
* Set the active
* @param  boolean active
*/
    public void setActive(boolean active) {
      this.active = active;
    }
   
/**
*
*/
    @Override
    public void kill() {
      active = false;
    }
   
/**
*
*/
    @Override
    public void restart() {
      active = true;
    }
  }
 
/**
* Set the orbConf
* @param  OrbConfiguration orbConf
*/
  @Override
  public void setOrbConf(OrbConfiguration orbConf) {
    this.orbConf = orbConf;
  }
 
/**
* Return the orbConf
*/
  @Override
  public OrbConfiguration getOrbConf() {
    return orbConf;
  }
 
/**
*
*/
  public void shutdown() {
    activeManager = false;
  }
 
}
TOP

Related Classes of org.goldenorb.JobManager

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.