Package org.apache.tez.dag.app

Source Code of org.apache.tez.dag.app.RecoveryParser$RecoveredDAGData

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.tez.dag.app;

import java.io.EOFException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.tez.common.TezCommonUtils;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.api.TezConstants;
import org.apache.tez.dag.app.dag.DAGState;
import org.apache.tez.dag.app.dag.Task;
import org.apache.tez.dag.app.dag.Vertex;
import org.apache.tez.dag.app.dag.impl.DAGImpl;
import org.apache.tez.dag.history.HistoryEvent;
import org.apache.tez.dag.history.HistoryEventType;
import org.apache.tez.dag.history.events.AMLaunchedEvent;
import org.apache.tez.dag.history.events.AMStartedEvent;
import org.apache.tez.dag.history.events.ContainerLaunchedEvent;
import org.apache.tez.dag.history.events.ContainerStoppedEvent;
import org.apache.tez.dag.history.events.DAGCommitStartedEvent;
import org.apache.tez.dag.history.events.DAGFinishedEvent;
import org.apache.tez.dag.history.events.DAGInitializedEvent;
import org.apache.tez.dag.history.events.DAGStartedEvent;
import org.apache.tez.dag.history.events.DAGSubmittedEvent;
import org.apache.tez.dag.history.events.TaskAttemptFinishedEvent;
import org.apache.tez.dag.history.events.TaskAttemptStartedEvent;
import org.apache.tez.dag.history.events.TaskFinishedEvent;
import org.apache.tez.dag.history.events.TaskStartedEvent;
import org.apache.tez.dag.history.events.VertexCommitStartedEvent;
import org.apache.tez.dag.history.events.VertexDataMovementEventsGeneratedEvent;
import org.apache.tez.dag.history.events.VertexFinishedEvent;
import org.apache.tez.dag.history.events.VertexGroupCommitFinishedEvent;
import org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent;
import org.apache.tez.dag.history.events.VertexInitializedEvent;
import org.apache.tez.dag.history.events.VertexParallelismUpdatedEvent;
import org.apache.tez.dag.history.events.VertexStartedEvent;
import org.apache.tez.dag.history.recovery.RecoveryService;
import org.apache.tez.dag.records.TezDAGID;
import org.apache.tez.dag.records.TezVertexID;
import org.apache.tez.dag.recovery.records.RecoveryProtos;
import org.apache.tez.dag.recovery.records.RecoveryProtos.SummaryEventProto;

import com.google.common.annotations.VisibleForTesting;

public class RecoveryParser {

  private static final Log LOG = LogFactory.getLog(RecoveryParser.class);

  private final DAGAppMaster dagAppMaster;
  private final FileSystem recoveryFS;
  private final Path recoveryDataDir;
  private final Path currentAttemptRecoveryDataDir;
  private final int recoveryBufferSize;
  private final int currentAttemptId;

  private static final String dataRecoveredFileFlag = "dataRecovered";

  public RecoveryParser(DAGAppMaster dagAppMaster,
      FileSystem recoveryFS,
      Path recoveryDataDir,
      int currentAttemptId) throws IOException {
    this.dagAppMaster = dagAppMaster;
    this.recoveryFS = recoveryFS;
    this.recoveryDataDir = recoveryDataDir;
    this.currentAttemptId = currentAttemptId;
    this.currentAttemptRecoveryDataDir = TezCommonUtils.getAttemptRecoveryPath(recoveryDataDir,
        currentAttemptId);
    recoveryBufferSize = dagAppMaster.getConfig().getInt(
        TezConfiguration.DAG_RECOVERY_FILE_IO_BUFFER_SIZE,
        TezConfiguration.DAG_RECOVERY_FILE_IO_BUFFER_SIZE_DEFAULT);
    this.recoveryFS.mkdirs(currentAttemptRecoveryDataDir);
  }

  public static class RecoveredDAGData {
    public TezDAGID recoveredDagID = null;
    public DAGImpl recoveredDAG = null;
    public DAGState dagState = null;
    public boolean isCompleted = false;
    public boolean nonRecoverable = false;
    public String reason = null;
    public Map<String, LocalResource> cumulativeAdditionalResources = null;
  }

  private static void parseSummaryFile(FSDataInputStream inputStream)
      throws IOException {
    while (true) {
      RecoveryProtos.SummaryEventProto proto =
          RecoveryProtos.SummaryEventProto.parseDelimitedFrom(inputStream);
      if (proto == null) {
        LOG.info("Reached end of summary stream");
        break;
      }
      LOG.info("[SUMMARY]"
          + " dagId=" + proto.getDagId()
          + ", timestamp=" + proto.getTimestamp()
          + ", event=" + HistoryEventType.values()[proto.getEventType()]);
    }
  }

  private static HistoryEvent getNextEvent(FSDataInputStream inputStream)
      throws IOException {
    int eventTypeOrdinal = -1;
    try {
      eventTypeOrdinal = inputStream.readInt();
    } catch (EOFException eof) {
      return null;
    }
    if (eventTypeOrdinal < 0 || eventTypeOrdinal >=
        HistoryEventType.values().length) {
      // Corrupt data
      // reached end
      throw new IOException("Corrupt data found when trying to read next event type"
          + ", eventTypeOrdinal=" + eventTypeOrdinal);
    }
    HistoryEventType eventType = HistoryEventType.values()[eventTypeOrdinal];
    HistoryEvent event;
    switch (eventType) {
      case AM_LAUNCHED:
        event = new AMLaunchedEvent();
        break;
      case AM_STARTED:
        event = new AMStartedEvent();
        break;
      case DAG_SUBMITTED:
        event = new DAGSubmittedEvent();
        break;
      case DAG_INITIALIZED:
        event = new DAGInitializedEvent();
        break;
      case DAG_STARTED:
        event = new DAGStartedEvent();
        break;
      case DAG_COMMIT_STARTED:
        event = new DAGCommitStartedEvent();
        break;
      case DAG_FINISHED:
        event = new DAGFinishedEvent();
        break;
      case CONTAINER_LAUNCHED:
        event = new ContainerLaunchedEvent();
        break;
      case CONTAINER_STOPPED:
        event = new ContainerStoppedEvent();
        break;
      case VERTEX_INITIALIZED:
        event = new VertexInitializedEvent();
        break;
      case VERTEX_STARTED:
        event = new VertexStartedEvent();
        break;
      case VERTEX_PARALLELISM_UPDATED:
        event = new VertexParallelismUpdatedEvent();
        break;
      case VERTEX_COMMIT_STARTED:
        event = new VertexCommitStartedEvent();
        break;
      case VERTEX_GROUP_COMMIT_STARTED:
        event = new VertexGroupCommitStartedEvent();
        break;
      case VERTEX_GROUP_COMMIT_FINISHED:
        event = new VertexGroupCommitFinishedEvent();
        break;
      case VERTEX_FINISHED:
        event = new VertexFinishedEvent();
        break;
      case TASK_STARTED:
        event = new TaskStartedEvent();
        break;
      case TASK_FINISHED:
        event = new TaskFinishedEvent();
        break;
      case TASK_ATTEMPT_STARTED:
        event = new TaskAttemptStartedEvent();
        break;
      case TASK_ATTEMPT_FINISHED:
        event = new TaskAttemptFinishedEvent();
        break;
      case VERTEX_DATA_MOVEMENT_EVENTS_GENERATED:
        event = new VertexDataMovementEventsGeneratedEvent();
        break;
      default:
        throw new IOException("Invalid data found, unknown event type "
            + eventType);

    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("Parsing event from input stream"
          + ", eventType=" + eventType);
    }
    try {
      event.fromProtoStream(inputStream);
    } catch (EOFException eof) {
      return null;
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("Parsed event from input stream"
          + ", eventType=" + eventType
          + ", event=" + event.toString());
    }
    return event;
  }





  private static void parseDAGRecoveryFile(FSDataInputStream inputStream)
      throws IOException {
    while (true) {
      HistoryEvent historyEvent = getNextEvent(inputStream);
      if (historyEvent == null) {
        LOG.info("Reached end of stream");
        break;
      }
      LOG.info("Parsed event from recovery stream"
          + ", eventType=" + historyEvent.getEventType()
          + ", event=" + historyEvent);
    }
  }

  public static void main(String argv[]) throws IOException {
    // TODO clean up with better usage and error handling
    Configuration conf = new Configuration();
    String summaryPath = argv[0];
    List<String> dagPaths = new ArrayList<String>();
    if (argv.length > 1) {
      for (int i = 1; i < argv.length; ++i) {
        dagPaths.add(argv[i]);
      }
    }
    FileSystem fs = FileSystem.get(conf);
    LOG.info("Parsing Summary file " + summaryPath);
    parseSummaryFile(fs.open(new Path(summaryPath)));
    for (String dagPath : dagPaths) {
      LOG.info("Parsing DAG recovery file " + dagPath);
      parseDAGRecoveryFile(fs.open(new Path(dagPath)));
    }
  }

  private Path getSummaryPath(Path attemptRrecoveryDataDir) {
    return TezCommonUtils.getSummaryRecoveryPath(attemptRrecoveryDataDir);
  }

  private FSDataOutputStream getSummaryOutputStream(Path summaryPath)
      throws IOException {
    return recoveryFS.create(summaryPath, true, recoveryBufferSize);
  }

  private FSDataInputStream getSummaryStream(Path summaryPath)
      throws IOException {
    if (!recoveryFS.exists(summaryPath)) {
      return null;
    }
    return recoveryFS.open(summaryPath, recoveryBufferSize);
  }

  private Path getDAGRecoveryFilePath(Path recoveryDataDir,
      TezDAGID dagID) {
    return new Path(recoveryDataDir,
        dagID.toString() + TezConstants.DAG_RECOVERY_RECOVER_FILE_SUFFIX);
  }

  private FSDataInputStream getDAGRecoveryStream(Path recoveryDataDir,
      TezDAGID dagID)
      throws IOException {
    Path dagRecoveryPath = getDAGRecoveryFilePath(recoveryDataDir, dagID);
    if (!recoveryFS.exists(dagRecoveryPath)) {
      return null;
    }
    return recoveryFS.open(dagRecoveryPath, recoveryBufferSize);
  }

  private FSDataOutputStream getDAGRecoveryOutputStream(Path recoveryDataDir,
      TezDAGID dagID)
      throws IOException {
    Path dagRecoveryPath = new Path(recoveryDataDir,
        dagID.toString() + TezConstants.DAG_RECOVERY_RECOVER_FILE_SUFFIX);
    return recoveryFS.create(dagRecoveryPath, true, recoveryBufferSize);
  }

  @VisibleForTesting
  DAGSummaryData getLastCompletedOrInProgressDAG(
      Map<TezDAGID, DAGSummaryData> dagSummaryDataMap) {
    DAGSummaryData inProgressDAG = null;
    DAGSummaryData lastCompletedDAG = null;
    for (Map.Entry<TezDAGID, DAGSummaryData> entry : dagSummaryDataMap.entrySet()) {
      if (!entry.getValue().completed) {
        if (inProgressDAG != null) {
          throw new RuntimeException("Multiple in progress DAGs seen"
              + ", dagId=" + inProgressDAG.dagId
              + ", dagId=" + entry.getKey());
        }
        inProgressDAG = entry.getValue();
      } else {
        if (lastCompletedDAG == null ||
            lastCompletedDAG.dagId.getId() < entry.getValue().dagId.getId()) {
          lastCompletedDAG = entry.getValue();
        }
      }
    }
    if (inProgressDAG == null) {
      return lastCompletedDAG;
    }
    return inProgressDAG;
  }

  private Path getPreviousAttemptRecoveryDataDir() throws IOException {
    LOG.info("Looking for the correct attempt directory to recover from");
    int foundPreviousAttempt = -1;
    for (int i = currentAttemptId - 1; i > 0; --i) {
      Path attemptPath = TezCommonUtils.getAttemptRecoveryPath(recoveryDataDir, i);
      LOG.info("Looking at attempt directory, path=" + attemptPath);
      Path fatalErrorOccurred = new Path(attemptPath,
          RecoveryService.RECOVERY_FATAL_OCCURRED_DIR);
      if (recoveryFS.exists(fatalErrorOccurred)) {
        throw new IOException("Found that a fatal error occurred in"
            + " recovery during previous attempt, foundFile="
            + fatalErrorOccurred.toString());
      }

      Path dataRecoveredFile = new Path(attemptPath, dataRecoveredFileFlag);
      try {
        if (recoveryFS.exists(dataRecoveredFile)) {
          LOG.info("Found data recovered file in attempt directory"
              + ", dataRecoveredFile=" + dataRecoveredFile
              + ", path=" + attemptPath);
          foundPreviousAttempt = i;
          break;
        }
        LOG.info("Skipping attempt directory as data recovered file does not exist"
            + ", dataRecoveredFile=" + dataRecoveredFile
            + ", path=" + attemptPath);
      } catch (IOException e) {
        LOG.warn("Exception when checking previous attempt dir for "
            + dataRecoveredFile.toString(), e);
      }
    }
    if (foundPreviousAttempt == -1) {
      // Look for oldest summary file and use that
      LOG.info("Did not find any attempt dir that had data recovered file."
          + " Looking for oldest summary file");
      for (int i = 1; i < currentAttemptId; ++i) {
        Path attemptPath = TezCommonUtils.getAttemptRecoveryPath(recoveryDataDir, i);
        Path summaryPath = getSummaryPath(attemptPath);
        if (recoveryFS.exists(summaryPath)) {
          LOG.info("Found summary file in attempt directory"
              + ", summaryFile=" + summaryPath
              + ", path=" + attemptPath);
          foundPreviousAttempt = i;
          break;
        }
        LOG.info("Skipping attempt directory as no summary file found"
            + ", summaryFile=" + summaryPath
            + ", path=" + attemptPath);
      }
    }
    if (foundPreviousAttempt == -1) {
      LOG.info("Falling back to first attempt as no other recovered attempts"
          + " found");
      foundPreviousAttempt = 1;
    }

    return TezCommonUtils.getAttemptRecoveryPath(recoveryDataDir, foundPreviousAttempt);
  }

  @VisibleForTesting
  static class DAGSummaryData {

    final TezDAGID dagId;
    String dagName;
    boolean completed = false;
    boolean dagCommitCompleted = true;
    DAGState dagState;
    Map<TezVertexID, Boolean> vertexCommitStatus =
        new HashMap<TezVertexID, Boolean>();
    Map<String, Boolean> vertexGroupCommitStatus =
        new HashMap<String, Boolean>();
    List<HistoryEvent> bufferedSummaryEvents =
        new ArrayList<HistoryEvent>();

    DAGSummaryData(TezDAGID dagId) {
      this.dagId = dagId;
    }

    void handleSummaryEvent(SummaryEventProto proto) throws IOException {
      HistoryEventType eventType =
          HistoryEventType.values()[proto.getEventType()];
      switch (eventType) {
        case DAG_SUBMITTED:
          completed = false;
          DAGSubmittedEvent dagSubmittedEvent = new DAGSubmittedEvent();
          dagSubmittedEvent.fromSummaryProtoStream(proto);
          dagName = dagSubmittedEvent.getDAGName();
          break;
        case DAG_FINISHED:
          completed = true;
          dagCommitCompleted = true;
          DAGFinishedEvent dagFinishedEvent = new DAGFinishedEvent();
          dagFinishedEvent.fromSummaryProtoStream(proto);
          dagState = dagFinishedEvent.getState();
          break;
        case DAG_COMMIT_STARTED:
          dagCommitCompleted = false;
          break;
        case VERTEX_COMMIT_STARTED:
          VertexCommitStartedEvent vertexCommitStartedEvent =
              new VertexCommitStartedEvent();
          vertexCommitStartedEvent.fromSummaryProtoStream(proto);
          vertexCommitStatus.put(
              vertexCommitStartedEvent.getVertexID(), false);
          break;
        case VERTEX_FINISHED:
          VertexFinishedEvent vertexFinishedEvent =
              new VertexFinishedEvent();
          vertexFinishedEvent.fromSummaryProtoStream(proto);
          if (vertexCommitStatus.containsKey(vertexFinishedEvent.getVertexID())) {
            vertexCommitStatus.put(
                vertexFinishedEvent.getVertexID(), true);
            bufferedSummaryEvents.add(vertexFinishedEvent);
          }
          break;
        case VERTEX_GROUP_COMMIT_STARTED:
          VertexGroupCommitStartedEvent vertexGroupCommitStartedEvent =
              new VertexGroupCommitStartedEvent();
          vertexGroupCommitStartedEvent.fromSummaryProtoStream(proto);
          bufferedSummaryEvents.add(vertexGroupCommitStartedEvent);
          vertexGroupCommitStatus.put(
              vertexGroupCommitStartedEvent.getVertexGroupName(), false);
          break;
        case VERTEX_GROUP_COMMIT_FINISHED:
          VertexGroupCommitFinishedEvent vertexGroupCommitFinishedEvent =
              new VertexGroupCommitFinishedEvent();
          vertexGroupCommitFinishedEvent.fromSummaryProtoStream(proto);
          bufferedSummaryEvents.add(vertexGroupCommitFinishedEvent);
          vertexGroupCommitStatus.put(
              vertexGroupCommitFinishedEvent.getVertexGroupName(), true);
          break;
      }
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("dagId=").append(dagId);
      sb.append(", dagCompleted=").append(completed);
      if (!vertexCommitStatus.isEmpty()) {
        sb.append(", vertexCommitStatuses=[");
        for (Entry<TezVertexID, Boolean> entry : vertexCommitStatus.entrySet()) {
          sb.append("{ vertexId=").append(entry.getKey())
              .append(", committed=").append(entry.getValue()).append("}, ");
        }
        sb.append("]");
      }
      if (!vertexGroupCommitStatus.isEmpty()) {
        sb.append(", vertexGroupCommitStatuses=[");
        for (Entry<String, Boolean> entry : vertexGroupCommitStatus.entrySet()) {
          sb.append("{ vertexGroup=").append(entry.getKey())
              .append(", committed=").append(entry.getValue()).append("}, ");
        }
        sb.append("]");
      }
      return sb.toString();
    }
  }

  private String isDAGRecoverable(DAGSummaryData data) {
    if (!data.dagCommitCompleted) {
      return "DAG Commit was in progress, not recoverable"
          + ", dagId=" + data.dagId;
    }
    if (!data.vertexCommitStatus.isEmpty()) {
      for (Entry<TezVertexID, Boolean> entry : data.vertexCommitStatus.entrySet()) {
        if (!(entry.getValue().booleanValue())) {
          return "Vertex Commit was in progress, not recoverable"
              + ", dagId=" + data.dagId
              + ", vertexId=" + entry.getKey();
        }
      }
    }
    if (!data.vertexGroupCommitStatus.isEmpty()) {
      for (Entry<String, Boolean> entry : data.vertexGroupCommitStatus.entrySet()) {
        if (!(entry.getValue().booleanValue())) {
          return "Vertex Group Commit was in progress, not recoverable"
              + ", dagId=" + data.dagId
              + ", vertexGroup=" + entry.getKey();
        }
      }
    }
    return null;
  }

  public RecoveredDAGData parseRecoveryData() throws IOException {
    Path previousAttemptRecoveryDataDir = getPreviousAttemptRecoveryDataDir();
    LOG.info("Using " + previousAttemptRecoveryDataDir.toString()
        + " for recovering data from previous attempt");
    if (!recoveryFS.exists(previousAttemptRecoveryDataDir)) {
      LOG.info("Nothing to recover as previous attempt data does not exist"
          + ", previousAttemptDir=" + previousAttemptRecoveryDataDir.toString());
      createDataRecoveredFlagFile();
      return null;
    }

    Path summaryPath = getSummaryPath(previousAttemptRecoveryDataDir);
    FSDataInputStream summaryStream = getSummaryStream(
        summaryPath);
    if (summaryStream == null) {
      LOG.info("Nothing to recover as summary file does not exist"
          + ", previousAttemptDir=" + previousAttemptRecoveryDataDir.toString()
          + ", summaryPath=" + summaryPath.toString());
      createDataRecoveredFlagFile();
      return null;
    }

    Path newSummaryPath = getSummaryPath(currentAttemptRecoveryDataDir);
    FSDataOutputStream newSummaryStream =
        getSummaryOutputStream(newSummaryPath);

    FileStatus summaryFileStatus = recoveryFS.getFileStatus(summaryPath);
    LOG.info("Parsing summary file"
        + ", path=" + summaryPath.toString()
        + ", len=" + summaryFileStatus.getLen()
        + ", lastModTime=" + summaryFileStatus.getModificationTime());

    int dagCounter = 0;

    Map<TezDAGID, DAGSummaryData> dagSummaryDataMap =
        new HashMap<TezDAGID, DAGSummaryData>();
    while (true) {
      RecoveryProtos.SummaryEventProto proto;
      try {
        proto = RecoveryProtos.SummaryEventProto.parseDelimitedFrom(summaryStream);
        if (proto == null) {
          LOG.info("Reached end of summary stream");
          break;
        }
      } catch (EOFException eof) {
        LOG.info("Reached end of summary stream");
        break;
      }
      HistoryEventType eventType =
          HistoryEventType.values()[proto.getEventType()];
      if (LOG.isDebugEnabled()) {
        LOG.debug("[RECOVERY SUMMARY]"
            + " dagId=" + proto.getDagId()
            + ", timestamp=" + proto.getTimestamp()
            + ", event=" + eventType);
      }
      TezDAGID dagId = TezDAGID.fromString(proto.getDagId());
      if (dagCounter < dagId.getId()) {
        dagCounter = dagId.getId();
      }
      if (!dagSummaryDataMap.containsKey(dagId)) {
        dagSummaryDataMap.put(dagId, new DAGSummaryData(dagId));
      }
      dagSummaryDataMap.get(dagId).handleSummaryEvent(proto);
      proto.writeDelimitedTo(newSummaryStream);
    }
    summaryStream.close();
    newSummaryStream.hsync();
    newSummaryStream.close();

    // Set counter for next set of DAGs & update dagNames Set in DAGAppMaster
    dagAppMaster.setDAGCounter(dagCounter);
    for (DAGSummaryData dagSummaryData: dagSummaryDataMap.values()){
      dagAppMaster.dagNames.add(dagSummaryData.dagName);
    }

    DAGSummaryData lastInProgressDAGData =
        getLastCompletedOrInProgressDAG(dagSummaryDataMap);
    if (lastInProgressDAGData == null) {
      LOG.info("Nothing to recover as no uncompleted/completed DAGs found");
      return null;
    }
    TezDAGID lastInProgressDAG = lastInProgressDAGData.dagId;
    if (lastInProgressDAG == null) {
      LOG.info("Nothing to recover as no uncompleted/completed DAGs found");
      return null;
    }

    LOG.info("Checking if DAG is in recoverable state"
        + ", dagId=" + lastInProgressDAGData.dagId);

    final RecoveredDAGData recoveredDAGData = new RecoveredDAGData();
    if (lastInProgressDAGData.completed) {
      recoveredDAGData.isCompleted = true;
      recoveredDAGData.dagState = lastInProgressDAGData.dagState;
    }

    String nonRecoverableReason = isDAGRecoverable(lastInProgressDAGData);
    if (nonRecoverableReason != null) {
      LOG.warn("Found last inProgress DAG but not recoverable: "
          + lastInProgressDAGData);
      recoveredDAGData.nonRecoverable = true;
      recoveredDAGData.reason = nonRecoverableReason;
    }

    LOG.info("Trying to recover dag from recovery file"
        + ", dagId=" + lastInProgressDAG.toString()
        + ", dataDir=" + previousAttemptRecoveryDataDir
        + ", intoCurrentDir=" + currentAttemptRecoveryDataDir);

    FSDataInputStream dagRecoveryStream = getDAGRecoveryStream(
        previousAttemptRecoveryDataDir, lastInProgressDAG);
    if (dagRecoveryStream == null) {
      // Could not find data to recover
      // Error out
      throw new IOException("Could not find recovery data for last in progress DAG"
          + ", dagId=" + lastInProgressDAG);
    }

    LOG.info("Copying DAG data into Current Attempt directory"
        + ", filePath=" + getDAGRecoveryFilePath(currentAttemptRecoveryDataDir,
        lastInProgressDAG));
    FSDataOutputStream newDAGRecoveryStream =
        getDAGRecoveryOutputStream(currentAttemptRecoveryDataDir, lastInProgressDAG);

    boolean skipAllOtherEvents = false;
    while (true) {
      HistoryEvent event;
      try {
        event = getNextEvent(dagRecoveryStream);
        if (event == null) {
          LOG.info("Reached end of dag recovery stream");
          break;
        }
      } catch (EOFException eof) {
        LOG.info("Reached end of dag recovery stream");
        break;
      } catch (IOException ioe) {
        LOG.warn("Corrupt data found when trying to read next event", ioe);
        break;
      }
      if (event == null || skipAllOtherEvents) {
        // reached end of data
        break;
      }
      HistoryEventType eventType = event.getEventType();
      switch (eventType) {
        case DAG_SUBMITTED:
        {
          DAGSubmittedEvent submittedEvent = (DAGSubmittedEvent) event;
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          recoveredDAGData.recoveredDAG = dagAppMaster.createDAG(submittedEvent.getDAGPlan(),
              lastInProgressDAG);
          recoveredDAGData.cumulativeAdditionalResources = submittedEvent
            .getCumulativeAdditionalLocalResources();
          recoveredDAGData.recoveredDagID = recoveredDAGData.recoveredDAG.getID();
          dagAppMaster.setCurrentDAG(recoveredDAGData.recoveredDAG);
          if (recoveredDAGData.nonRecoverable) {
            skipAllOtherEvents = true;
          }
          break;
        }
        case DAG_INITIALIZED:
        {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          recoveredDAGData.recoveredDAG.restoreFromEvent(event);
          break;
        }
        case DAG_STARTED:
        {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          recoveredDAGData.recoveredDAG.restoreFromEvent(event);
          break;
        }
        case DAG_COMMIT_STARTED:
        {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          recoveredDAGData.recoveredDAG.restoreFromEvent(event);
          break;
        }
        case VERTEX_GROUP_COMMIT_STARTED:
        {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          recoveredDAGData.recoveredDAG.restoreFromEvent(event);
          break;
        }
        case VERTEX_GROUP_COMMIT_FINISHED:
        {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          recoveredDAGData.recoveredDAG.restoreFromEvent(event);
          break;
        }
        case DAG_FINISHED:
        {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          // If this is seen, nothing to recover
          assert recoveredDAGData.recoveredDAG != null;
          recoveredDAGData.recoveredDAG.restoreFromEvent(event);
          recoveredDAGData.isCompleted = true;
          recoveredDAGData.dagState =
              ((DAGFinishedEvent) event).getState();
          skipAllOtherEvents = true;
        }
        case CONTAINER_LAUNCHED:
        {
          // Nothing to do for now
          break;
        }
        case VERTEX_INITIALIZED:
        {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          VertexInitializedEvent vEvent = (VertexInitializedEvent) event;
          Vertex v = recoveredDAGData.recoveredDAG.getVertex(vEvent.getVertexID());
          v.restoreFromEvent(vEvent);
          break;
        }
        case VERTEX_STARTED:
        {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          VertexStartedEvent vEvent = (VertexStartedEvent) event;
          Vertex v = recoveredDAGData.recoveredDAG.getVertex(vEvent.getVertexID());
          v.restoreFromEvent(vEvent);
          break;
        }
        case VERTEX_PARALLELISM_UPDATED:
        {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          VertexParallelismUpdatedEvent vEvent = (VertexParallelismUpdatedEvent) event;
          Vertex v = recoveredDAGData.recoveredDAG.getVertex(vEvent.getVertexID());
          v.restoreFromEvent(vEvent);
          break;
        }
        case VERTEX_COMMIT_STARTED:
        {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          VertexCommitStartedEvent vEvent = (VertexCommitStartedEvent) event;
          Vertex v = recoveredDAGData.recoveredDAG.getVertex(vEvent.getVertexID());
          v.restoreFromEvent(vEvent);
          break;
        }
        case VERTEX_FINISHED:
        {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          VertexFinishedEvent vEvent = (VertexFinishedEvent) event;
          Vertex v = recoveredDAGData.recoveredDAG.getVertex(vEvent.getVertexID());
          v.restoreFromEvent(vEvent);
          break;
        }
        case TASK_STARTED:
        {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          TaskStartedEvent tEvent = (TaskStartedEvent) event;
          Task task = recoveredDAGData.recoveredDAG.getVertex(
              tEvent.getTaskID().getVertexID()).getTask(tEvent.getTaskID());
          task.restoreFromEvent(tEvent);
          break;
        }
        case TASK_FINISHED:
        {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          TaskFinishedEvent tEvent = (TaskFinishedEvent) event;
          Task task = recoveredDAGData.recoveredDAG.getVertex(
              tEvent.getTaskID().getVertexID()).getTask(tEvent.getTaskID());
          task.restoreFromEvent(tEvent);
          break;
        }
        case TASK_ATTEMPT_STARTED:
        {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          TaskAttemptStartedEvent tEvent = (TaskAttemptStartedEvent) event;
          Task task =
              recoveredDAGData.recoveredDAG.getVertex(
                  tEvent.getTaskAttemptID().getTaskID().getVertexID())
                      .getTask(tEvent.getTaskAttemptID().getTaskID());
          task.restoreFromEvent(tEvent);
          break;
        }
        case TASK_ATTEMPT_FINISHED:
        {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          TaskAttemptFinishedEvent tEvent = (TaskAttemptFinishedEvent) event;
          Task task =
              recoveredDAGData.recoveredDAG.getVertex(
                  tEvent.getTaskAttemptID().getTaskID().getVertexID())
                  .getTask(tEvent.getTaskAttemptID().getTaskID());
          task.restoreFromEvent(tEvent);
          break;
        }
        case VERTEX_DATA_MOVEMENT_EVENTS_GENERATED:
        {
          LOG.info("Recovering from event"
              + ", eventType=" + eventType
              + ", event=" + event.toString());
          assert recoveredDAGData.recoveredDAG != null;
          VertexDataMovementEventsGeneratedEvent vEvent =
              (VertexDataMovementEventsGeneratedEvent) event;
          Vertex v = recoveredDAGData.recoveredDAG.getVertex(vEvent.getVertexID());
          v.restoreFromEvent(vEvent);
          break;
        }
        default:
          throw new RuntimeException("Invalid data found, unknown event type "
              + eventType);
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("[DAG RECOVERY]"
            + " dagId=" + lastInProgressDAG
            + ", eventType=" + eventType
            + ", event=" + event.toString());
      }
      newDAGRecoveryStream.writeInt(eventType.ordinal());
      event.toProtoStream(newDAGRecoveryStream);
    }
    dagRecoveryStream.close();
    newDAGRecoveryStream.hsync();
    newDAGRecoveryStream.close();

    if (!recoveredDAGData.isCompleted
        && !recoveredDAGData.nonRecoverable) {
      if (lastInProgressDAGData.bufferedSummaryEvents != null
        && !lastInProgressDAGData.bufferedSummaryEvents.isEmpty()) {
        for (HistoryEvent bufferedEvent : lastInProgressDAGData.bufferedSummaryEvents) {
          assert recoveredDAGData.recoveredDAG != null;
          switch (bufferedEvent.getEventType()) {
            case VERTEX_GROUP_COMMIT_STARTED:
              recoveredDAGData.recoveredDAG.restoreFromEvent(bufferedEvent);
              break;
            case VERTEX_GROUP_COMMIT_FINISHED:
              recoveredDAGData.recoveredDAG.restoreFromEvent(bufferedEvent);
              break;
            case VERTEX_FINISHED:
              VertexFinishedEvent vertexFinishedEvent =
                  (VertexFinishedEvent) bufferedEvent;
              Vertex vertex = recoveredDAGData.recoveredDAG.getVertex(
                  vertexFinishedEvent.getVertexID());
              if (vertex == null) {
                recoveredDAGData.nonRecoverable = true;
                recoveredDAGData.reason = "All state could not be recovered"
                    + ", vertex completed but events not flushed"
                    + ", vertexId=" + vertexFinishedEvent.getVertexID();
              } else {
                vertex.restoreFromEvent(vertexFinishedEvent);
              }
              break;
            default:
              throw new RuntimeException("Invalid data found in buffered summary events"
                  + ", unknown event type "
                  + bufferedEvent.getEventType());
          }
        }
      }
    }

    LOG.info("Finished copying data from previous attempt into current attempt");
    createDataRecoveredFlagFile();

    return recoveredDAGData;
  }

  private void createDataRecoveredFlagFile() throws IOException {
    Path dataCopiedFlagPath = new Path(currentAttemptRecoveryDataDir,
        dataRecoveredFileFlag);
    LOG.info("Trying to create data recovered flag file"
        + ", filePath=" + dataCopiedFlagPath.toString());
    recoveryFS.mkdirs(dataCopiedFlagPath);
  }

}
TOP

Related Classes of org.apache.tez.dag.app.RecoveryParser$RecoveredDAGData

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.