/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/

package eu.stratosphere.nephele.jobmanager;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

import eu.stratosphere.nephele.managementgraph.ManagementVertexID;
import eu.stratosphere.nephele.taskmanager.TaskKillResult;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;

import eu.stratosphere.configuration.ConfigConstants;
import eu.stratosphere.configuration.Configuration;
import eu.stratosphere.configuration.GlobalConfiguration;
import eu.stratosphere.core.io.StringRecord;
import eu.stratosphere.nephele.client.AbstractJobResult;
import eu.stratosphere.nephele.client.AbstractJobResult.ReturnCode;
import eu.stratosphere.nephele.client.JobCancelResult;
import eu.stratosphere.nephele.client.JobProgressResult;
import eu.stratosphere.nephele.client.JobSubmissionResult;
import eu.stratosphere.nephele.deployment.TaskDeploymentDescriptor;
import eu.stratosphere.nephele.event.job.AbstractEvent;
import eu.stratosphere.nephele.event.job.RecentJobEvent;
import eu.stratosphere.nephele.execution.ExecutionState;
import eu.stratosphere.nephele.execution.librarycache.LibraryCacheManager;
import eu.stratosphere.nephele.executiongraph.ExecutionEdge;
import eu.stratosphere.nephele.executiongraph.ExecutionGraph;
import eu.stratosphere.nephele.executiongraph.ExecutionGraphIterator;
import eu.stratosphere.nephele.executiongraph.ExecutionVertex;
import eu.stratosphere.nephele.executiongraph.ExecutionVertexID;
import eu.stratosphere.nephele.executiongraph.GraphConversionException;
import eu.stratosphere.nephele.executiongraph.InternalJobStatus;
import eu.stratosphere.nephele.executiongraph.JobStatusListener;
import eu.stratosphere.nephele.instance.AbstractInstance;
import eu.stratosphere.nephele.instance.DummyInstance;
import eu.stratosphere.nephele.instance.HardwareDescription;
import eu.stratosphere.nephele.instance.InstanceConnectionInfo;
import eu.stratosphere.nephele.instance.InstanceManager;
import eu.stratosphere.nephele.instance.InstanceType;
import eu.stratosphere.nephele.instance.InstanceTypeDescription;
import eu.stratosphere.nephele.instance.local.LocalInstanceManager;
import eu.stratosphere.runtime.io.channels.ChannelID;
import eu.stratosphere.nephele.ipc.RPC;
import eu.stratosphere.nephele.ipc.Server;
import eu.stratosphere.nephele.jobgraph.AbstractJobVertex;
import eu.stratosphere.nephele.jobgraph.JobGraph;
import eu.stratosphere.nephele.jobgraph.JobID;
import eu.stratosphere.nephele.jobmanager.accumulators.AccumulatorManager;
import eu.stratosphere.nephele.jobmanager.archive.ArchiveListener;
import eu.stratosphere.nephele.jobmanager.archive.MemoryArchivist;
import eu.stratosphere.nephele.jobmanager.scheduler.AbstractScheduler;
import eu.stratosphere.nephele.jobmanager.scheduler.SchedulingException;
import eu.stratosphere.nephele.jobmanager.splitassigner.InputSplitManager;
import eu.stratosphere.nephele.jobmanager.splitassigner.InputSplitWrapper;
import eu.stratosphere.nephele.jobmanager.web.WebInfoServer;
import eu.stratosphere.nephele.managementgraph.ManagementGraph;
import eu.stratosphere.nephele.profiling.JobManagerProfiler;
import eu.stratosphere.nephele.profiling.ProfilingUtils;
import eu.stratosphere.nephele.protocols.AccumulatorProtocol;
import eu.stratosphere.nephele.protocols.ChannelLookupProtocol;
import eu.stratosphere.nephele.protocols.ExtendedManagementProtocol;
import eu.stratosphere.nephele.protocols.InputSplitProviderProtocol;
import eu.stratosphere.nephele.protocols.JobManagerProtocol;
import eu.stratosphere.nephele.services.accumulators.AccumulatorEvent;
import eu.stratosphere.nephele.taskmanager.AbstractTaskResult;
import eu.stratosphere.nephele.taskmanager.TaskCancelResult;
import eu.stratosphere.nephele.taskmanager.TaskExecutionState;
import eu.stratosphere.nephele.taskmanager.TaskSubmissionResult;
import eu.stratosphere.runtime.io.network.ConnectionInfoLookupResponse;
import eu.stratosphere.runtime.io.network.RemoteReceiver;
import eu.stratosphere.nephele.taskmanager.ExecutorThreadFactory;
import eu.stratosphere.nephele.topology.NetworkTopology;
import eu.stratosphere.nephele.types.IntegerRecord;
import eu.stratosphere.nephele.util.SerializableArrayList;
import eu.stratosphere.util.StringUtils;

/**
 * In Nephele, the job manager is the central component for communicating with clients, creating
 * schedules for incoming jobs and supervising their execution. Only one job manager may exist in
 * the system, and its address must be known to the clients.
 * Task managers can discover the job manager by means of a UDP broadcast and afterwards advertise
 * themselves as new workers for tasks.
 */
public class JobManager implements DeploymentManager, ExtendedManagementProtocol, InputSplitProviderProtocol,
    JobManagerProtocol, ChannelLookupProtocol, JobStatusListener, AccumulatorProtocol
{
  public static enum ExecutionMode { LOCAL, CLUSTER }
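
  // The job manager's functionality is exposed through the RPC protocol interfaces listed in the
  // implements clause: roughly, job clients use the management protocol methods (submitJob,
  // cancelJob, getJobProgress, getRecommendedPollingInterval, ...), task managers use
  // JobManagerProtocol (sendHeartbeat, updateTaskExecutionState), ChannelLookupProtocol
  // (lookupConnectionInfo), InputSplitProviderProtocol (requestNextInputSplit) and
  // AccumulatorProtocol (reportAccumulatorResult), while DeploymentManager and JobStatusListener
  // are callbacks invoked by the scheduler and the execution graph, respectively.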
 
  // --------------------------------------------------------------------------------------------

  private static final Log LOG = LogFactory.getLog(JobManager.class);

  private final Server jobManagerServer;

  private final JobManagerProfiler profiler;

  private final EventCollector eventCollector;
 
  private final ArchiveListener archive;

  private final InputSplitManager inputSplitManager;

  private final AbstractScheduler scheduler;
 
  private AccumulatorManager accumulatorManager;

  private InstanceManager instanceManager;

  private final int recommendedClientPollingInterval;

  private final ExecutorService executorService = Executors.newCachedThreadPool(ExecutorThreadFactory.INSTANCE);

  private final static int FAILURE_RETURN_CODE = 1;

  private final AtomicBoolean isShutdownInProgress = new AtomicBoolean(false);

  private volatile boolean isShutDown;
 
  private WebInfoServer server;
 
 
  public JobManager(ExecutionMode executionMode) throws Exception {

    final String ipcAddressString = GlobalConfiguration.getString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, null);

    InetAddress ipcAddress = null;
    if (ipcAddressString != null) {
      try {
        ipcAddress = InetAddress.getByName(ipcAddressString);
      } catch (UnknownHostException e) {
        throw new Exception("Cannot convert " + ipcAddressString + " to an IP address: " + e.getMessage(), e);
      }
    }

    final int ipcPort = GlobalConfiguration.getInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY,
      ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT);

    // Read the suggested client polling interval
    this.recommendedClientPollingInterval = GlobalConfiguration.getInteger(
      ConfigConstants.JOBCLIENT_POLLING_INTERVAL_KEY, ConfigConstants.DEFAULT_JOBCLIENT_POLLING_INTERVAL);

    // Load the job progress collector
    this.eventCollector = new EventCollector(this.recommendedClientPollingInterval);
   
    // Register simple job archive
    int archived_items = GlobalConfiguration.getInteger(
        ConfigConstants.JOB_MANAGER_WEB_ARCHIVE_COUNT, ConfigConstants.DEFAULT_JOB_MANAGER_WEB_ARCHIVE_COUNT);
    if (archived_items > 0) {
      this.archive = new MemoryArchivist(archived_items);
      this.eventCollector.registerArchivist(archive);
    }
    else {
      this.archive = null;
    }
   
    // Create the accumulator manager with the same archiving limit as the web
    // interface. The accumulators must be kept for at least one job, otherwise
    // they might be deleted before the client has requested the accumulator
    // results.
    this.accumulatorManager = new AccumulatorManager(Math.max(1, archived_items));

    // Load the input split manager
    this.inputSplitManager = new InputSplitManager();

    // Determine own RPC address
    final InetSocketAddress rpcServerAddress = new InetSocketAddress(ipcAddress, ipcPort);

    // Start job manager's IPC server
    try {
      final int handlerCount = GlobalConfiguration.getInteger(ConfigConstants.JOB_MANAGER_IPC_HANDLERS_KEY,
        ConfigConstants.DEFAULT_JOB_MANAGER_IPC_HANDLERS);
      this.jobManagerServer = RPC.getServer(this, rpcServerAddress.getHostName(), rpcServerAddress.getPort(), handlerCount);
      this.jobManagerServer.start();
    } catch (IOException e) {
      throw new Exception("Cannot start RPC server: " + e.getMessage(), e);
    }

    LOG.info("Starting job manager in " + executionMode + " mode");

    // Try to load the instance manager for the given execution mode
    if (executionMode == ExecutionMode.LOCAL) {
      try {
        this.instanceManager = new LocalInstanceManager();
      } catch (Throwable t) {
        throw new Exception("Cannot instantiate local instance manager: " + t.getMessage(), t);
      }
    } else {
      final String instanceManagerClassName = JobManagerUtils.getInstanceManagerClassName(executionMode);
      LOG.info("Trying to load " + instanceManagerClassName + " as instance manager");
      this.instanceManager = JobManagerUtils.loadInstanceManager(instanceManagerClassName);
      if (this.instanceManager == null) {
        throw new Exception("Unable to load instance manager " + instanceManagerClassName);
      }
    }

    // Try to load the scheduler for the given execution mode
    final String schedulerClassName = JobManagerUtils.getSchedulerClassName(executionMode);
    LOG.info("Trying to load " + schedulerClassName + " as scheduler");

    this.scheduler = JobManagerUtils.loadScheduler(schedulerClassName, this, this.instanceManager);
    if (this.scheduler == null) {
      throw new Exception("Unable to load scheduler " + schedulerClassName);
    }

    // Load profiler if it should be used
    if (GlobalConfiguration.getBoolean(ProfilingUtils.ENABLE_PROFILING_KEY, false)) {
      final String profilerClassName = GlobalConfiguration.getString(ProfilingUtils.JOBMANAGER_CLASSNAME_KEY,
        "eu.stratosphere.nephele.profiling.impl.JobManagerProfilerImpl");
      this.profiler = ProfilingUtils.loadJobManagerProfiler(profilerClassName, ipcAddress);
      if (this.profiler == null) {
        throw new Exception("Cannot load profiler");
      }
    } else {
      this.profiler = null;
      LOG.debug("Profiler disabled");
    }
  }
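
  // Minimal configuration sketch (illustrative only, not part of the original source; it reuses
  // the ConfigConstants keys read in the constructor above, and the address is a placeholder):
  //
  //   Configuration cfg = new Configuration();
  //   cfg.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, "127.0.0.1");
  //   cfg.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY,
  //       ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT);
  //   GlobalConfiguration.includeConfiguration(cfg);
  //   JobManager jobManager = new JobManager(ExecutionMode.LOCAL);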

  public void shutdown() {

    if (!this.isShutdownInProgress.compareAndSet(false, true)) {
      return;
    }

    // Stop instance manager
    if (this.instanceManager != null) {
      this.instanceManager.shutdown();
    }

    // Stop profiling if enabled
    if (this.profiler != null) {
      this.profiler.shutdown();
    }

    // Stop RPC server
    if (this.jobManagerServer != null) {
      this.jobManagerServer.stop();
    }

    // Stop the executor service
    if (this.executorService != null) {
      this.executorService.shutdown();
      try {
        this.executorService.awaitTermination(5000L, TimeUnit.MILLISECONDS);
      } catch (InterruptedException e) {
        LOG.debug(e);
      }
    }

    // Stop and clean up the job progress collector
    if (this.eventCollector != null) {
      this.eventCollector.shutdown();
    }

    // Finally, shut down the scheduler
    if (this.scheduler != null) {
      this.scheduler.shutdown();
    }

    this.isShutDown = true;
    LOG.debug("Shutdown of job manager completed");
  }

  /**
   * Log Stratosphere version information.
   */
  private static void logVersionInformation() {
    String version = JobManager.class.getPackage().getImplementationVersion();
    // if version == null, then the JobManager runs from inside the IDE (or somehow not from the maven build jar)
    String revision = "<unknown>";
    try {
      Properties properties = new Properties();
      InputStream propFile = JobManager.class.getClassLoader().getResourceAsStream(".version.properties");
      if (propFile != null) {
        properties.load(propFile);
        revision = properties.getProperty("git.commit.id.abbrev");
      }
    } catch (IOException e) {
      LOG.info("Cannot determine code revision. Unable ro read version property file.");
    }
    LOG.info("Starting Stratosphere JobManager (Version: " + version + ", Rev:" + revision + ")");
  }
 
  /**
   * Entry point for the program
   *
   * @param args
   *        arguments from the command line
   */
 
  public static void main(String[] args) {
    // determine if a valid log4j config exists and initialize a default logger if not
    if (System.getProperty("log4j.configuration") == null) {
      Logger root = Logger.getRootLogger();
      root.removeAllAppenders();
      PatternLayout layout = new PatternLayout("%d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n");
      ConsoleAppender appender = new ConsoleAppender(layout, "System.err");
      root.addAppender(appender);
      root.setLevel(Level.INFO);
    }
   
    JobManager jobManager;
    try {
      jobManager = initialize(args);
      // Start info server for jobmanager
      jobManager.startInfoServer();
    }
    catch (Exception e) {
      LOG.fatal(e.getMessage(), e);
      System.exit(FAILURE_RETURN_CODE);
    }
   
    // Clean up is triggered through a shutdown hook
    // freeze this thread to keep the JVM alive (the job manager threads are daemon threads)
    Object w = new Object();
    synchronized (w) {
      try {
        w.wait();
      } catch (InterruptedException e) {}
    }
  }
 
  @SuppressWarnings("static-access")
  public static JobManager initialize(String[] args) throws Exception {
    // output the version and revision information to the log
    logVersionInformation();
   
    final Option configDirOpt = OptionBuilder.withArgName("config directory").hasArg()
      .withDescription("Specify configuration directory.").create("configDir");

    final Option executionModeOpt = OptionBuilder.withArgName("execution mode").hasArg()
      .withDescription("Specify execution mode.").create("executionMode");

    final Options options = new Options();
    options.addOption(configDirOpt);
    options.addOption(executionModeOpt);

    CommandLineParser parser = new GnuParser();
    CommandLine line = null;
    try {
      line = parser.parse(options, args);
    } catch (ParseException e) {
      LOG.error("CLI Parsing failed. Reason: " + e.getMessage());
      System.exit(FAILURE_RETURN_CODE);
    }

    final String configDir = line.getOptionValue(configDirOpt.getOpt(), null);
    final String executionModeName = line.getOptionValue(executionModeOpt.getOpt(), "local");
   
    ExecutionMode executionMode = null;
    if ("local".equals(executionModeName)) {
      executionMode = ExecutionMode.LOCAL;
    } else if ("cluster".equals(executionModeName)) {
      executionMode = ExecutionMode.CLUSTER;
    } else {
      System.err.println("Unrecognized execution mode: " + executionModeName);
      System.exit(FAILURE_RETURN_CODE);
    }
   
    // First, try to load global configuration
    GlobalConfiguration.loadConfiguration(configDir);

    // Create a new job manager object
    JobManager jobManager = new JobManager(executionMode);
   
    // Set base dir for info server
    Configuration infoserverConfig = GlobalConfiguration.getConfiguration();
    if (configDir != null && new File(configDir).isDirectory()) {
      infoserverConfig.setString(ConfigConstants.STRATOSPHERE_BASE_DIR_PATH_KEY, configDir+"/..");
    }
    GlobalConfiguration.includeConfiguration(infoserverConfig);
    return jobManager;
  }
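
  // Usage sketch (illustrative only; the path is a placeholder): the programmatic equivalent of
  // running main() with explicit command line options, using the "configDir" and "executionMode"
  // options registered above.
  //
  //   JobManager jobManager = JobManager.initialize(
  //       new String[] { "-configDir", "/path/to/conf", "-executionMode", "cluster" });
  //   jobManager.startInfoServer();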


  @Override
  public JobSubmissionResult submitJob(JobGraph job) throws IOException {
    try {
      // First check if job is null
      if (job == null) {
        JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR,
          "Submitted job is null!");
        return result;
      }
 
      if (LOG.isDebugEnabled()) {
        LOG.debug("Submitted job " + job.getName() + " is not null");
      }
 
      // Check if any vertex of the graph has null edges
      AbstractJobVertex jv = job.findVertexWithNullEdges();
      if (jv != null) {
        JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR, "Vertex "
          + jv.getName() + " has at least one null edge");
        return result;
      }
 
      if (LOG.isDebugEnabled()) {
        LOG.debug("Submitted job " + job.getName() + " has no null edges");
      }
 
      // Next, check if the graph is weakly connected
      if (!job.isWeaklyConnected()) {
        JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR,
          "Job graph is not weakly connected");
        return result;
      }
 
      if (LOG.isDebugEnabled()) {
        LOG.debug("The graph of job " + job.getName() + " is weakly connected");
      }
 
      // Check if job graph has cycles
      if (!job.isAcyclic()) {
        JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR,
          "Job graph is not a DAG");
        return result;
      }
 
      if (LOG.isDebugEnabled()) {
        LOG.debug("The graph of job " + job.getName() + " is acyclic");
      }
 
      // Check constraints on the vertex degrees
      jv = job.areVertexDegreesCorrect();
      if (jv != null) {
        JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR,
          "Degree of vertex " + jv.getName() + " is incorrect");
        return result;
      }
 
      if (LOG.isDebugEnabled()) {
        LOG.debug("All vertices of job " + job.getName() + " have the correct degree");
      }
 
      if (!job.isInstanceDependencyChainAcyclic()) {
        JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR,
          "The dependency chain for instance sharing contains a cycle");
 
        return result;
      }
 
      if (LOG.isDebugEnabled()) {
        LOG.debug("The dependency chain for instance sharing is acyclic");
      }
 
      // Check if the job will be executed with profiling enabled
      boolean jobRunsWithProfiling = false;
      if (this.profiler != null && job.getJobConfiguration().getBoolean(ProfilingUtils.PROFILE_JOB_KEY, true)) {
        jobRunsWithProfiling = true;
      }
 
      // Try to create initial execution graph from job graph
      LOG.info("Creating initial execution graph from job graph " + job.getName());
      ExecutionGraph eg;
 
      try {
        eg = new ExecutionGraph(job, this.instanceManager);
      } catch (GraphConversionException e) {
        if (e.getCause() == null) {
          return new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR, StringUtils.stringifyException(e));
        } else {
          Throwable t = e.getCause();
          if (t instanceof FileNotFoundException) {
            return new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR, t.getMessage());
          } else {
            return new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR, StringUtils.stringifyException(t));
          }
        }
      }
 
      // Register job with the progress collector
      if (this.eventCollector != null) {
        this.eventCollector.registerJob(eg, jobRunsWithProfiling, System.currentTimeMillis());
      }
 
      // Check if profiling should be enabled for this job
      if (jobRunsWithProfiling) {
        this.profiler.registerProfilingJob(eg);
 
        if (this.eventCollector != null) {
          this.profiler.registerForProfilingData(eg.getJobID(), this.eventCollector);
        }
 
      }
 
      // Register job with the dynamic input split assigner
      this.inputSplitManager.registerJob(eg);
 
      // Register for updates on the job status
      eg.registerJobStatusListener(this);
 
      // Schedule job
      if (LOG.isInfoEnabled()) {
        LOG.info("Scheduling job " + job.getName());
      }
 
      try {
        this.scheduler.schedulJob(eg);
      } catch (SchedulingException e) {
        unregisterJob(eg);
        JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR, StringUtils.stringifyException(e));
        return result;
      }
 
      // Return on success
      return new JobSubmissionResult(AbstractJobResult.ReturnCode.SUCCESS, null);
    }
    catch (Throwable t) {
      LOG.error("Job submission failed.", t);
      return new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR, StringUtils.stringifyException(t));
    }
  }
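
  // Client-side sketch (illustrative only; a real client reaches these methods through an RPC
  // proxy rather than a direct JobManager reference, and jobGraph is assumed to exist):
  //
  //   JobSubmissionResult result = jobManager.submitJob(jobGraph);
  //   if (result.getReturnCode() == AbstractJobResult.ReturnCode.SUCCESS) {
  //     final int interval = jobManager.getRecommendedPollingInterval().getValue();
  //     // poll at the recommended interval until the job reaches a final state
  //     JobProgressResult progress = jobManager.getJobProgress(jobGraph.getJobID());
  //   }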
 

  public InstanceManager getInstanceManager() {
    return this.instanceManager;
  }

  /**
   * Convenience method to unregister a job from all of Nephele's monitoring,
   * profiling and optimization components at once: it removes the job from the
   * profiler (if activated), cancels pending instance requests, unregisters the
   * job from the input split manager and releases its libraries from the
   * library cache.
   *
   * @param executionGraph
   *        the execution graph to remove from the job manager
   */
  private void unregisterJob(final ExecutionGraph executionGraph) {

    // Remove job from profiler (if activated)
    if (this.profiler != null
      && executionGraph.getJobConfiguration().getBoolean(ProfilingUtils.PROFILE_JOB_KEY, true)) {
      this.profiler.unregisterProfilingJob(executionGraph);

      if (this.eventCollector != null) {
        this.profiler.unregisterFromProfilingData(executionGraph.getJobID(), this.eventCollector);
      }
    }

    // Cancel all pending requests for instances
    // getJobID is a final member, no synchronization necessary
    this.instanceManager.cancelPendingRequests(executionGraph.getJobID());

    // Remove job from input split manager
    if (this.inputSplitManager != null) {
      this.inputSplitManager.unregisterJob(executionGraph);
    }

    // Unregister job with library cache manager
    try {
      LibraryCacheManager.unregister(executionGraph.getJobID());
    } catch (IOException ioe) {
      if (LOG.isWarnEnabled()) {
        LOG.warn(ioe);
      }
    }
  }


  @Override
  public void sendHeartbeat(final InstanceConnectionInfo instanceConnectionInfo,
      final HardwareDescription hardwareDescription) {

    // Delegate call to instance manager
    if (this.instanceManager != null) {

      final Runnable heartBeatRunnable = new Runnable() {

        @Override
        public void run() {
          instanceManager.reportHeartBeat(instanceConnectionInfo, hardwareDescription);
        }
      };

      this.executorService.execute(heartBeatRunnable);
    }
  }


  @Override
  public void updateTaskExecutionState(final TaskExecutionState executionState) throws IOException {

    // Ignore calls with executionState == null
    if (executionState == null) {
      LOG.error("Received call to updateTaskExecutionState with executionState == null");
      return;
    }

    if (executionState.getExecutionState() == ExecutionState.FAILED) {
      LOG.error(executionState.getDescription());
    }

    final ExecutionGraph eg = this.scheduler.getExecutionGraphByID(executionState.getJobID());
    if (eg == null) {
      LOG.error("Cannot find execution graph for ID " + executionState.getJobID() + " to change state to "
        + executionState.getExecutionState());
      return;
    }

    final ExecutionVertex vertex = eg.getVertexByID(executionState.getID());
    if (vertex == null) {
      LOG.error("Cannot find vertex with ID " + executionState.getID() + " of job " + eg.getJobID()
        + " to change state to " + executionState.getExecutionState());
      return;
    }

    // Asynchronously update the execution state of the vertex
    vertex.updateExecutionStateAsynchronously(executionState.getExecutionState(), executionState.getDescription());
  }


  @Override
  public JobCancelResult cancelJob(final JobID jobID) throws IOException {

    LOG.info("Trying to cancel job with ID " + jobID);

    final ExecutionGraph eg = this.scheduler.getExecutionGraphByID(jobID);
    if (eg == null) {
      return new JobCancelResult(ReturnCode.ERROR, "Cannot find job with ID " + jobID);
    }

    final Runnable cancelJobRunnable = new Runnable() {

      @Override
      public void run() {
        eg.updateJobStatus(InternalJobStatus.CANCELING, "Job canceled by user");
        final TaskCancelResult cancelResult = cancelJob(eg);
        if (cancelResult != null) {
          LOG.error(cancelResult.getDescription());
        }
      }
    };

    eg.executeCommand(cancelJobRunnable);

    LOG.info("Cancel of job " + jobID + " successfully triggered");

    return new JobCancelResult(AbstractJobResult.ReturnCode.SUCCESS, null);
  }

  /**
   * Cancels all the tasks in the current and upper stages of the
   * given execution graph.
   *
   * @param eg
   *        the execution graph representing the job to cancel.
   * @return <code>null</code> if no error occurred during the cancel attempt,
   *         otherwise the returned object will describe the error
   */
  private TaskCancelResult cancelJob(final ExecutionGraph eg) {

    TaskCancelResult errorResult = null;

    // Cancel all nodes in the current and upper execution stages.
    final Iterator<ExecutionVertex> it = new ExecutionGraphIterator(eg, eg.getIndexOfCurrentExecutionStage(),
      false, true);
    while (it.hasNext()) {

      final ExecutionVertex vertex = it.next();
      final TaskCancelResult result = vertex.cancelTask();
      if (result.getReturnCode() != AbstractTaskResult.ReturnCode.SUCCESS) {
        errorResult = result;
      }
    }

    return errorResult;
  }


  @Override
  public JobProgressResult getJobProgress(final JobID jobID) throws IOException {

    if (this.eventCollector == null) {
      return new JobProgressResult(ReturnCode.ERROR, "JobManager does not support progress reports for jobs",
        null);
    }

    final SerializableArrayList<AbstractEvent> eventList = new SerializableArrayList<AbstractEvent>();
    this.eventCollector.getEventsForJob(jobID, eventList, false);

    return new JobProgressResult(ReturnCode.SUCCESS, null, eventList);
  }


  @Override
  public ConnectionInfoLookupResponse lookupConnectionInfo(InstanceConnectionInfo caller, JobID jobID, ChannelID sourceChannelID) {

    final ExecutionGraph eg = this.scheduler.getExecutionGraphByID(jobID);
    if (eg == null) {
      LOG.error("Cannot find execution graph to job ID " + jobID);
      return ConnectionInfoLookupResponse.createReceiverNotFound();
    }

    final InternalJobStatus jobStatus = eg.getJobStatus();
    if (jobStatus == InternalJobStatus.FAILING || jobStatus == InternalJobStatus.CANCELING) {
      return ConnectionInfoLookupResponse.createJobIsAborting();
    }

    final ExecutionEdge edge = eg.getEdgeByID(sourceChannelID);
    if (edge == null) {
      LOG.error("Cannot find execution edge associated with ID " + sourceChannelID);
      return ConnectionInfoLookupResponse.createReceiverNotFound();
    }

    if (sourceChannelID.equals(edge.getInputChannelID())) {
      // Request was sent from an input channel
      final ExecutionVertex connectedVertex = edge.getOutputGate().getVertex();

      final AbstractInstance assignedInstance = connectedVertex.getAllocatedResource().getInstance();
      if (assignedInstance == null) {
        LOG.error("Cannot resolve lookup: vertex found for channel ID " + edge.getOutputGateIndex()
          + " but no instance assigned");
        // LOG.info("Created receiverNotReady for " + connectedVertex + " 1");
        return ConnectionInfoLookupResponse.createReceiverNotReady();
      }

      // Check execution state
      final ExecutionState executionState = connectedVertex.getExecutionState();
      if (executionState == ExecutionState.FINISHED) {
        // That should not happen: if data is still pending, the receiver cannot already be finished
        return ConnectionInfoLookupResponse.createReceiverNotFound();
      }

      // Running is common; finishing happens when the lookup is for the close event
      if (executionState != ExecutionState.RUNNING && executionState != ExecutionState.FINISHING) {
        // LOG.info("Created receiverNotReady for " + connectedVertex + " in state " + executionState + " 2");
        return ConnectionInfoLookupResponse.createReceiverNotReady();
      }

      if (assignedInstance.getInstanceConnectionInfo().equals(caller)) {
        // Receiver runs on the same task manager
        return ConnectionInfoLookupResponse.createReceiverFoundAndReady(edge.getOutputChannelID());
      } else {
        // Receiver runs on a different task manager
        final InstanceConnectionInfo ici = assignedInstance.getInstanceConnectionInfo();
        final InetSocketAddress isa = new InetSocketAddress(ici.address(), ici.dataPort());

        return ConnectionInfoLookupResponse.createReceiverFoundAndReady(new RemoteReceiver(isa, edge.getConnectionID()));
      }
    }
    // else, the request is for an output channel
    // Find vertex of connected input channel
    final ExecutionVertex targetVertex = edge.getInputGate().getVertex();

    // Check execution state
    final ExecutionState executionState = targetVertex.getExecutionState();

    // check whether the task needs to be deployed
    if (executionState != ExecutionState.RUNNING && executionState != ExecutionState.FINISHING && executionState != ExecutionState.FINISHED) {

      if (executionState == ExecutionState.ASSIGNED) {
        final Runnable command = new Runnable() {
          @Override
          public void run() {
            scheduler.deployAssignedVertices(targetVertex);
          }
        };
        eg.executeCommand(command);
      }

      // LOG.info("Created receiverNotReady for " + targetVertex + " in state " + executionState + " 3");
      return ConnectionInfoLookupResponse.createReceiverNotReady();
    }

    final AbstractInstance assignedInstance = targetVertex.getAllocatedResource().getInstance();
    if (assignedInstance == null) {
      LOG.error("Cannot resolve lookup: vertex found for channel ID " + edge.getInputChannelID() + " but no instance assigned");
      // LOG.info("Created receiverNotReady for " + targetVertex + " in state " + executionState + " 4");
      return ConnectionInfoLookupResponse.createReceiverNotReady();
    }

    if (assignedInstance.getInstanceConnectionInfo().equals(caller)) {
      // Receiver runs on the same task manager
      return ConnectionInfoLookupResponse.createReceiverFoundAndReady(edge.getInputChannelID());
    } else {
      // Receiver runs on a different task manager
      final InstanceConnectionInfo ici = assignedInstance.getInstanceConnectionInfo();
      final InetSocketAddress isa = new InetSocketAddress(ici.address(), ici.dataPort());

      return ConnectionInfoLookupResponse.createReceiverFoundAndReady(new RemoteReceiver(isa, edge.getConnectionID()));
    }
  }
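
  // A lookup therefore resolves to one of three outcomes: receiverNotFound when the job or edge
  // is unknown (or the receiver has already finished), receiverNotReady when the target vertex is
  // not running yet (possibly triggering its deployment first), and receiverFoundAndReady carrying
  // either a local channel ID or a RemoteReceiver address for a remote task manager.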

  /**
   * Returns the current ManagementGraph from the event collector or, if it is no longer held
   * there, from the archive.
   *
   * {@inheritDoc}
   */
  @Override
  public ManagementGraph getManagementGraph(final JobID jobID) throws IOException {

    ManagementGraph mg = this.eventCollector.getManagementGraph(jobID);
    if (mg == null) {
      if(this.archive != null) {
        mg = this.archive.getManagementGraph(jobID);
      }
     
      if (mg == null) {
        throw new IOException("Cannot find job with ID " + jobID);
      }
    }

    return mg;
  }


  @Override
  public NetworkTopology getNetworkTopology(final JobID jobID) throws IOException {

    if (this.instanceManager != null) {
      return this.instanceManager.getNetworkTopology(jobID);
    }

    return null;
  }


  @Override
  public IntegerRecord getRecommendedPollingInterval() throws IOException {

    return new IntegerRecord(this.recommendedClientPollingInterval);
  }


  @Override
  public List<RecentJobEvent> getRecentJobs() throws IOException {

    final List<RecentJobEvent> eventList = new SerializableArrayList<RecentJobEvent>();

    if (this.eventCollector == null) {
      throw new IOException("No instance of the event collector found");
    }

    this.eventCollector.getRecentJobs(eventList);

    return eventList;
  }


  @Override
  public List<AbstractEvent> getEvents(final JobID jobID) throws IOException {

    final List<AbstractEvent> eventList = new SerializableArrayList<AbstractEvent>();

    if (this.eventCollector == null) {
      throw new IOException("No instance of the event collector found");
    }

    this.eventCollector.getEventsForJob(jobID, eventList, true);

    return eventList;
  }

  @Override
  public void killTask(final JobID jobID, final ManagementVertexID id) throws IOException {

    final ExecutionGraph eg = this.scheduler.getExecutionGraphByID(jobID);
    if (eg == null) {
      LOG.error("Cannot find execution graph for job " + jobID);
      return;
    }

    final ExecutionVertex vertex = eg.getVertexByID(ExecutionVertexID.fromManagementVertexID(id));
    if (vertex == null) {
      LOG.error("Cannot find execution vertex with ID " + id);
      return;
    }

    LOG.info("Killing task " + vertex + " of job " + jobID);

    final Runnable runnable = new Runnable() {

      @Override
      public void run() {

        final TaskKillResult result = vertex.killTask();
        if (result.getReturnCode() != AbstractTaskResult.ReturnCode.SUCCESS) {
          LOG.error(result.getDescription());
        }
      }
    };

    eg.executeCommand(runnable);
  }

  @Override
  public void killInstance(final StringRecord instanceName) throws IOException {

    final AbstractInstance instance = this.instanceManager.getInstanceByName(instanceName.toString());
    if (instance == null) {
      LOG.error("Cannot find instance with name " + instanceName + " to kill it");
      return;
    }

    LOG.info("Killing task manager on instance " + instance);

    final Runnable runnable = new Runnable() {

      @Override
      public void run() {
        try {
          instance.killTaskManager();
        } catch (IOException ioe) {
          LOG.error(ioe);
        }
      }
    };

    // Hand it over to the executor service
    this.executorService.execute(runnable);
  }

  /**
   * Tests whether the job manager has been shut down completely.
   *
   * @return <code>true</code> if the job manager has been shut down completely, <code>false</code> otherwise
   */
  public boolean isShutDown() {

    return this.isShutDown;
  }


  public Map<InstanceType, InstanceTypeDescription> getMapOfAvailableInstanceTypes() {

    // Delegate call to the instance manager
    if (this.instanceManager != null) {
      return this.instanceManager.getMapOfAvailableInstanceTypes();
    }

    return null;
  }


  @Override
  public void jobStatusHasChanged(final ExecutionGraph executionGraph, final InternalJobStatus newJobStatus,
      final String optionalMessage) {

    LOG.info("Status of job " + executionGraph.getJobName() + "(" + executionGraph.getJobID() + ")"
      + " changed to " + newJobStatus);

    if (newJobStatus == InternalJobStatus.FAILING) {

      // Cancel all remaining tasks
      cancelJob(executionGraph);
    }

    if (newJobStatus == InternalJobStatus.CANCELED || newJobStatus == InternalJobStatus.FAILED
      || newJobStatus == InternalJobStatus.FINISHED) {
      // Unregister job for Nephele's monitoring, optimization components, and dynamic input split assignment
      unregisterJob(executionGraph);
    }
  }


  @Override
  public void logBufferUtilization(final JobID jobID) throws IOException {

    final ExecutionGraph eg = this.scheduler.getExecutionGraphByID(jobID);
    if (eg == null) {
      return;
    }

    final Set<AbstractInstance> allocatedInstance = new HashSet<AbstractInstance>();

    final Iterator<ExecutionVertex> it = new ExecutionGraphIterator(eg, true);
    while (it.hasNext()) {

      final ExecutionVertex vertex = it.next();
      final ExecutionState state = vertex.getExecutionState();
      if (state == ExecutionState.RUNNING || state == ExecutionState.FINISHING) {
        final AbstractInstance instance = vertex.getAllocatedResource().getInstance();

        if (instance instanceof DummyInstance) {
          LOG.error("Found instance of type DummyInstance for vertex " + vertex.getName() + " (state "
            + state + ")");
          continue;
        }

        allocatedInstance.add(instance);
      }
    }

    // Send requests to task managers from separate thread
    final Runnable requestRunnable = new Runnable() {

      @Override
      public void run() {

        final Iterator<AbstractInstance> it2 = allocatedInstance.iterator();

        try {
          while (it2.hasNext()) {
            it2.next().logBufferUtilization();
          }
        } catch (IOException ioe) {
          LOG.error(ioe);
        }

      }
    };

    // Hand over to the executor service
    this.executorService.execute(requestRunnable);
  }


  @Override
  public void deploy(final JobID jobID, final AbstractInstance instance,
      final List<ExecutionVertex> verticesToBeDeployed) {

    if (verticesToBeDeployed.isEmpty()) {
      LOG.error("Method 'deploy' called but list of vertices to be deployed is empty");
      return;
    }

    for (final ExecutionVertex vertex : verticesToBeDeployed) {

      // Check vertex state
      if (vertex.getExecutionState() != ExecutionState.READY) {
        LOG.error("Expected vertex " + vertex + " to be in state READY but it is in state "
          + vertex.getExecutionState());
      }

      vertex.updateExecutionState(ExecutionState.STARTING, null);
    }

    // Create a new runnable and pass it the executor service
    final Runnable deploymentRunnable = new Runnable() {

      /**
       * {@inheritDoc}
       */
      @Override
      public void run() {

        // Check if all required libraries are available on the instance
        try {
          instance.checkLibraryAvailability(jobID);
        } catch (IOException ioe) {
          LOG.error("Cannot check library availability: " + StringUtils.stringifyException(ioe));
        }

        final List<TaskDeploymentDescriptor> submissionList = new SerializableArrayList<TaskDeploymentDescriptor>();

        // Construct a deployment descriptor for each vertex to be deployed
        for (final ExecutionVertex vertex : verticesToBeDeployed) {

          submissionList.add(vertex.constructDeploymentDescriptor());

          LOG.info("Starting task " + vertex + " on " + vertex.getAllocatedResource().getInstance());
        }

        List<TaskSubmissionResult> submissionResultList = null;

        try {
          submissionResultList = instance.submitTasks(submissionList);
        } catch (final IOException ioe) {
          final String errorMsg = StringUtils.stringifyException(ioe);
          for (final ExecutionVertex vertex : verticesToBeDeployed) {
            vertex.updateExecutionStateAsynchronously(ExecutionState.FAILED, errorMsg);
          }
          // Abort here, otherwise the null result list would cause a NullPointerException below
          return;
        }

        if (verticesToBeDeployed.size() != submissionResultList.size()) {
          LOG.error("size of submission result list does not match size of list with vertices to be deployed");
        }

        int count = 0;
        for (final TaskSubmissionResult tsr : submissionResultList) {

          ExecutionVertex vertex = verticesToBeDeployed.get(count++);
          if (!vertex.getID().equals(tsr.getVertexID())) {
            LOG.error("Expected different order of objects in task result list");
            vertex = null;
            for (final ExecutionVertex candVertex : verticesToBeDeployed) {
              if (tsr.getVertexID().equals(candVertex.getID())) {
                vertex = candVertex;
                break;
              }
            }

            if (vertex == null) {
              LOG.error("Cannot find execution vertex for vertex ID " + tsr.getVertexID());
              continue;
            }
          }

          if (tsr.getReturnCode() != AbstractTaskResult.ReturnCode.SUCCESS) {
            // Change the execution state to failed and let the scheduler deal with the rest
            vertex.updateExecutionStateAsynchronously(ExecutionState.FAILED, tsr.getDescription());
          }
        }
      }
    };

    this.executorService.execute(deploymentRunnable);
  }
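
  // Deployment is handed off to the executor service so the calling scheduler thread is not
  // blocked on the RPC call to the task manager; submission failures are reported back
  // asynchronously by flipping the affected vertices to the FAILED state.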


  @Override
  public InputSplitWrapper requestNextInputSplit(final JobID jobID, final ExecutionVertexID vertexID,
      final IntegerRecord sequenceNumber) throws IOException {

    final ExecutionGraph graph = this.scheduler.getExecutionGraphByID(jobID);
    if (graph == null) {
      LOG.error("Cannot find execution graph to job ID " + jobID);
      return null;
    }

    final ExecutionVertex vertex = graph.getVertexByID(vertexID);
    if (vertex == null) {
      LOG.error("Cannot find execution vertex for vertex ID " + vertexID);
      return null;
    }

    return new InputSplitWrapper(jobID, this.inputSplitManager.getNextInputSplit(vertex, sequenceNumber.getValue()));
  }
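
  // Task managers pull input splits lazily through this method: each call hands the requesting
  // vertex at most one further split, identified by the caller-supplied sequence number;
  // presumably a wrapper around a null split signals that no more splits are available.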
 
  /**
   * Starts the Jetty-based web info server for the job manager.
   */
  public void startInfoServer() {
    final Configuration config = GlobalConfiguration.getConfiguration();
    // Start InfoServer
    try {
      int port = config.getInteger(ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, ConfigConstants.DEFAULT_JOB_MANAGER_WEB_FRONTEND_PORT);
      server = new WebInfoServer(config, port, this);
      server.start();
    } catch (FileNotFoundException e) {
      LOG.error(e.getMessage(), e);
    } catch (Exception e) {
      LOG.error("Cannot instantiate info server: " + e.getMessage(), e);
    }
  }
 
 
  // TODO Add to RPC?
  public List<RecentJobEvent> getOldJobs() throws IOException {

    //final List<RecentJobEvent> eventList = new SerializableArrayList<RecentJobEvent>();

    if (this.archive == null) {
      throw new IOException("No instance of the job archive found");
    }

    //this.eventCollector.getRecentJobs(eventList);

    return this.archive.getJobs();
  }
 
  public ArchiveListener getArchive() {
    return this.archive;
  }

  public int getNumberOfTaskTrackers() {
    return this.instanceManager.getNumberOfTaskTrackers();
  }

  @Override
  public void reportAccumulatorResult(AccumulatorEvent accumulatorEvent) throws IOException {
    this.accumulatorManager.processIncomingAccumulators(accumulatorEvent.getJobID(),
        accumulatorEvent.getAccumulators());
  }

  @Override
  public AccumulatorEvent getAccumulatorResults(JobID jobID) throws IOException {
    return new AccumulatorEvent(jobID, this.accumulatorManager.getJobAccumulators(jobID), false);
  }
}