Package org.apache.hadoop.hdfs.server.namenode.bookkeeper

Source Code of org.apache.hadoop.hdfs.server.namenode.bookkeeper.BookKeeperJournalManager$FencingLedgerHandleProvider

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.namenode.bookkeeper;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
import org.apache.bookkeeper.client.BKException;
import org.apache.bookkeeper.client.BookKeeper;
import org.apache.bookkeeper.client.LedgerHandle;
import org.apache.bookkeeper.conf.ClientConfiguration;
import org.apache.bookkeeper.util.ZkUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.Transition;
import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.namenode.EditLogInputStream;
import org.apache.hadoop.hdfs.server.namenode.EditLogOutputStream;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader;
import org.apache.hadoop.hdfs.server.namenode.JournalManager;
import org.apache.hadoop.hdfs.server.namenode.RemoteStorageState;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.BookKeeperJournalMetadataManager;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.CurrentInProgressMetadata;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.EditLogLedgerMetadata;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.MaxTxId;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.Versioned;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.proto.FormatInfoWritable;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.proto.WritableUtil;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.zk.BasicZooKeeper;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.zk.ConnectionWatcher;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.zk.RecoveringZooKeeper;
import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import org.apache.hadoop.hdfs.util.InjectionEvent;
import org.apache.hadoop.util.InjectionHandler;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.Code;
import org.apache.zookeeper.ZooDefs.Ids;
import org.apache.zookeeper.ZooKeeper;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

import static org.apache.hadoop.hdfs.server.namenode.bookkeeper.BookKeeperJournalConfigKeys.*;
import static org.apache.hadoop.hdfs.server.namenode.bookkeeper.zk.ZkUtil.*;
import static org.apache.zookeeper.AsyncCallback.*;

/**
* BookKeeper-based JournalManager implementation. This is inspired by
* Apache's BookKeeperJournalManager, with several core differences:
* interaction with ZooKeeper goes through {@link RecoveringZooKeeper},
* custom {@link BookKeeperEditLogInputStream} implementation is used that
* permits tailing in-progress edits and re-positioning within an ledger-based
* output stream, and a custom {@link BookKeeperEditLogOutputStream} is used
* that uses double buffer as used by the standard file journal manager
* implementation.
*/
public class BookKeeperJournalManager implements JournalManager, LedgerHandleProvider {

  private static final Log LOG =
      LogFactory.getLog(BookKeeperJournalManager.class);

  // Version of the protocol used for serializing and de-serializing data in
  // znodes (i.e., the Writables). It is written together with the format
  // information and compared against the stored value in checkEnv().

  static final int PROTO_VERSION = -1;

  private final Configuration conf; // Configuration

  private final int quorumSize; // BookKeeper quorum size
  private final int ensembleSize; // BookKeeper cluster size
  private final BookKeeper bookKeeperClient; // BookKeeper client
  // ZooKeeper wrapper built in the constructor with the configured maximum
  // number of retries and retry interval
  private final RecoveringZooKeeper zk;

  private final String digestPw; // BookKeeper digest password
  @VisibleForTesting
  protected final String zkParentPath; // Parent ZNode
  @VisibleForTesting
  protected final String formatInfoPath; // ZNode holding format/namespace information

  // Handles ledger metadata
  final BookKeeperJournalMetadataManager metadataManager;
  private final MaxTxId maxTxId;  // stores max txid

  // Metadata stored under the "CurrentInProgress" child of the parent ZNode;
  // re-initialized on every checkEnv() call
  private final CurrentInProgressMetadata currentInProgressMetadata;

  // Set to true by checkEnv() once the format information has been read from
  // ZooKeeper and its protocol version has been verified
  private boolean initialized = false;
  private LedgerHandle currentInProgressLedger = null; // Current ledger

  // Metadata ZNode path of the segment opened by startLogSegment(); reset to
  // null by finalizeLogSegment() and consulted by recoverUnfinalizedSegments()
  @VisibleForTesting
  volatile String currentInProgressPath;

  // Metrics object handed to the output streams created by startLogSegment()
  private volatile NameNodeMetrics metrics = null;

  // Highest transaction id observed by findMaxTransaction() so far
  private long maxSeenTxId = -1;

  // Per-thread FormatInfoWritable instance that is re-used for reading and
  // writing the format information ZNode
  private static final ThreadLocal<FormatInfoWritable>
      localFormatInfoWritable = new ThreadLocal<FormatInfoWritable>() {
    @Override
    protected FormatInfoWritable initialValue() {
      return new FormatInfoWritable();
    }
  };

  public BookKeeperJournalManager(Configuration conf, URI uri,
      NamespaceInfo nsInfo, NameNodeMetrics metrics)
      throws IOException {
    this.conf = conf;
    this.metrics = metrics;
    quorumSize = conf.getInt(BKJM_BOOKKEEPER_QUORUM_SIZE,
        BKJM_BOOKKEEPER_QUORUM_SIZE_DEFAULT);
    ensembleSize = conf.getInt(BKJM_BOOKKEEPER_ENSEMBLE_SIZE,
        BKJM_BOOKKEEPER_ENSEMBLE_SIZE_DEFAULT);
    digestPw = conf.get(BKJM_BOOKKEEPER_DIGEST_PW,
        BKJM_BOOKKEEPER_DIGEST_PW_DEFAULT);
    String zkConnect = uri.getAuthority().replace(";", ",");
    zkParentPath = uri.getPath();
    String ledgersAvailablePath = conf.get(
        BKJM_ZK_LEDGERS_AVAILABLE_PATH,
        BKJM_ZK_LEDGERS_AVAILABLE_PATH_DEFAULT);
    formatInfoPath = joinPath(zkParentPath, "version");
    String currentInProgressPath = joinPath(zkParentPath, "CurrentInProgress");
    String maxTxIdPath = joinPath(zkParentPath, "maxtxid");
    int zkSessionTimeoutMs = conf.getInt(BKJM_ZK_SESSION_TIMEOUT,
        BKJM_ZK_SESSION_TIMEOUT_DEFAULT);
    int zkMaxRetries = conf.getInt(BKJM_ZK_MAX_RETRIES,
       BKJM_ZK_MAX_RETRIES_DEFAULT);
    int zkRetryIntervalMs = conf.getInt(BKJM_ZK_RETRY_INTERVAL,
        BKJM_ZK_RETRY_INTERVAL_DEFAULT);
    CountDownLatch connectLatch = new CountDownLatch(1);
    ConnectionWatcher connectionWatcher = new ConnectionWatcher(connectLatch);
    ZooKeeper zooKeeper = new ZooKeeper(zkConnect, zkSessionTimeoutMs,
        connectionWatcher);
    // Use twice session timeout as the connection timeout
    int zkConnectTimeoutMs = zkSessionTimeoutMs * 2;

    if (!connectionWatcher.await(zkConnectTimeoutMs)) {
      throw new IOException("Timed out waiting to connect to " + zkConnect
          + " after " + (zkSessionTimeoutMs * 2) + " ms.");
    }
    prepareBookKeeperEnv(ledgersAvailablePath, zooKeeper);

    try {
      ClientConfiguration clientConf = new ClientConfiguration();
      clientConf.setClientTcpNoDelay(conf.getBoolean(
          BKJM_BOOKKEEPER_CLIENT_TCP_NODELAY,
          BKJM_BOOKKEEPER_CLIENT_TCP_NO_DELAY_DEFAULT));
      clientConf.setThrottleValue(conf.getInt(BKJM_BOOKKEEPER_CLIENT_THROTTLE,
          BKJM_BOOKKEEPER_CLIENT_THROTTLE_DEFAULT));
      bookKeeperClient = new BookKeeper(clientConf, zooKeeper);
    } catch (KeeperException e) {
      keeperException("Unrecoverable ZooKeeper creating BookKeeper client",
          e);
      throw new IllegalStateException(e); // never reached
    } catch (InterruptedException e) {
      interruptedException("Interrupted creating a BookKeeper client", e);
      throw new IllegalStateException(e); // never reached
    }
    zk = new RecoveringZooKeeper(new BasicZooKeeper(zooKeeper), zkMaxRetries,
        zkRetryIntervalMs);
    metadataManager = new BookKeeperJournalMetadataManager(zk, zkParentPath);
    maxTxId = new MaxTxId(zk, maxTxIdPath);
    currentInProgressMetadata = new CurrentInProgressMetadata(zk,
        currentInProgressPath);
    createZkMetadataIfNotExists(nsInfo);
    metadataManager.init();
  }

  public static void bkException(String msg, BKException e) throws IOException {
    LOG.error(msg, e);
    throw new IOException(msg, e);
  }

  /**
   * Create parent ZNode under which available BookKeeper bookie servers will
   * register themselves. Will create parent ZNodes for that path as well.
   * @see ZkUtils#createFullPathOptimistic(ZooKeeper, String, byte[], List, CreateMode, StringCallback, Object)
   * @param availablePath Full ZooKeeper path for bookies to register
   *                      themselves.
   * @param zooKeeper Fully instantiated ZooKeeper instance.
   * @throws IOException If we are unable to successfully create the path
   *                     during the time specified as the ZooKeeper session
   *                     timeout.
   */
  @VisibleForTesting
  public static void prepareBookKeeperEnv(final String availablePath,
      ZooKeeper zooKeeper) throws IOException {
    final CountDownLatch availablePathLatch =  new CountDownLatch(1);
    StringCallback cb = new StringCallback() {
      @Override
      public void processResult(int rc, String path, Object ctx, String name) {
        if (Code.OK.intValue() == rc || Code.NODEEXISTS.intValue() == rc) {
          availablePathLatch.countDown();
          LOG.info("Successfully created bookie available path:" +
              availablePath);
        } else {
          Code code = Code.get(rc);
          LOG.error("Failed to create available bookie path (" +
              availablePath + ")", KeeperException.create(code, path));
        }
      }
    };
    ZkUtils.createFullPathOptimistic(zooKeeper, availablePath, new byte[0],
        Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT, cb, null);
    try {
      int timeoutMs = zooKeeper.getSessionTimeout();
      if (!availablePathLatch.await(timeoutMs, TimeUnit.MILLISECONDS)) {
        throw new IOException("Couldn't create the bookie available path : " +
            availablePath + ", timed out after " + timeoutMs + " ms.");
      }
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new IOException("Interrupted when creating the bookie available " +
          "path: " + availablePath, e);
    }
  }

  /**
   * If environment information has yet not been read during the object's life
   * do so and verify that it has been written the expected protocol version.
   * Additionally, the call always refreshes the object's current
   * {@link CurrentInProgressMetadata} information.
   */
  synchronized private void checkEnv() throws IOException {
    if (!initialized) {
      FormatInfoWritable writable = localFormatInfoWritable.get();
      if (metadataManager.readWritableFromZk(formatInfoPath, writable, null) == null) {
        LOG.error("Environment not initialized (format() not called?)");
        throw new IOException(
            "Environment not initialized (format() not called?");
      }
      if (writable.getProtoVersion() != PROTO_VERSION) {
        throw new IllegalStateException("Wrong protocol version! Expected " +
            BKJM_BOOKKEEPER_DIGEST_PW + ", but read " +
            writable.getProtoVersion());
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("Namespace info read: " + writable.toColonSeparatedString());
      }
    }
    currentInProgressMetadata.init();
    initialized = true;
  }

  @VisibleForTesting
  public LedgerHandle openForReading(long ledgerId) throws IOException {
    try {
      return bookKeeperClient.openLedgerNoRecovery(
          ledgerId, BookKeeper.DigestType.MAC, digestPw.getBytes());
    } catch (InterruptedException e) {
      interruptedException("Interrupted while opening ledger id " + ledgerId +
          " for reading", e);
    } catch (BKException e) {
      bkException("BookKeeper error opening ledger id " + ledgerId +
          " for reading", e);
    }
    return null; // Should not be reached
  }

  @Override
  public void transitionJournal(StorageInfo si, Transition transition,
      StartupOption startOpt) throws IOException {
    if (transition == Transition.FORMAT) {
      deleteMetadataAndLedgers();
      createZkMetadataIfNotExists(si);
      metadataManager.init();
    } else {
      throw new UnsupportedOperationException();
    }
  }

  /**
   * If ZooKeeper metadata is not empty, forcefully delete the metadata
   * and make a best effort attempt at deleting the ledgers. Used by
   * {@link #formatJournal(StorageInfo)}
   * @throws IOException If there is an error talking to BookKeeper or
   *                     ZooKeeper
   */
  private void deleteMetadataAndLedgers() throws IOException {
    try {
      if (hasSomeJournalData()) {
        if (zkPathExists(metadataManager.getLedgerParentPath())) {
          for (EditLogLedgerMetadata ledger : metadataManager.listLedgers(true)) {
            try {
              // Try to delete the individual ledger from BookKeeper
              bookKeeperClient.deleteLedger(ledger.getLedgerId());
            } catch (BKException e) {
              // It is fine if we are unable to delete the ledger, as it will
              // not be read and can then be deleted manually.
              LOG.warn("Unable to delete ledger " + ledger + " from BookKeeper",
                  e);
            } catch (InterruptedException e) {
              interruptedException("Interrupted deleting ledger " + ledger, e);
            }
          }
        }
        deleteRecursively(zk, zkParentPath);
      }
    } catch (IOException e) {
      LOG.error("Error clearing out metadata under " + zkParentPath, e);
      throw e;
    }
  }

  /**
   * If there is no metadata present in ZooKeeper, create and populate the
   * metadata with the right format information
   * @param si The format information to set
   * @throws IOException If there is an error writing to ZooKeeper
   */
  private void createZkMetadataIfNotExists(StorageInfo si) throws IOException {
    try {
      if (!hasSomeJournalData()) {
        try {
          // First create the parent path
          zk.create(zkParentPath, new byte[] { '0' },
              Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);

          // Write format/namespace information to ZooKeeper
          FormatInfoWritable writable = localFormatInfoWritable.get();
          writable.set(PROTO_VERSION, si);
          byte[] data = WritableUtil.writableToByteArray(writable);
          zk.create(formatInfoPath, data, Ids.OPEN_ACL_UNSAFE,
              CreateMode.PERSISTENT);
        } catch (KeeperException e) {
          keeperException("Unrecoverable ZooKeeper error initializing " +
              zkParentPath, e);
        } catch (InterruptedException e) {
          interruptedException("Interrupted initializing " + zkParentPath +
              " in ZooKeeper", e);
        }
      }
    } catch (IOException e) {
      LOG.error("Unable to initialize metadata", e);
      throw e;
    }
  }

  /**
   * Check if a path exists in ZooKeeper
   * @param path The ZNode path to check
   * @return True if path exists, false if otherwise
   * @throws IOException If there is an error talking to ZooKeeper
   */
  private boolean zkPathExists(String path) throws IOException {
    try {
      return zk.exists(path, false) != null;
    } catch (KeeperException e) {
      keeperException("Unrecoverable ZooKeeper error checking if " +
          path + " exists", e);
    } catch (InterruptedException e) {
      interruptedException("Interrupted checking if ZooKeeper path " +
          path + " exists", e);
    }
    return false; // Should never be reached
  }

  /**
   * Starts a new edit log segment at the given transaction id: creates a new
   * BookKeeper ledger, records its in-progress metadata in ZooKeeper and
   * returns an output stream that writes to the ledger.
   *
   * @param txId First transaction id of the new segment; must be greater
   *             than the max txid currently stored
   * @return Output stream for the new segment, with the ledger header
   *         already written
   * @throws IOException If {@code txId} has already been seen, if another
   *                     segment is recorded as in-progress and its metadata
   *                     still exists, or if there is an error talking to
   *                     BookKeeper or ZooKeeper
   */
  @Override
  public EditLogOutputStream startLogSegment(long txId) throws IOException {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Trying to start a log segment at txId " + txId);
    }

    checkEnv();

    try {
      // Refuse to go backwards: the new segment must start after everything
      // that has been recorded so far
      long currMaxTxId = maxTxId.get();
      if (txId <= currMaxTxId) {
        throw new IOException("Already saw up to txId " + currMaxTxId + "!");
      }

      // Refuse to start a segment while the recorded in-progress segment
      // still has its metadata in ZooKeeper
      String existingInProgress = currentInProgressMetadata.read();
      if (existingInProgress != null &&
          metadataManager.ledgerExists(existingInProgress)) {
        throw new IOException(existingInProgress + " already exists, cannot "
            + " start a log segment that is already in progress!");
      }
    } catch (IOException e) {
      LOG.error("Unable to start log segment for txId " + txId, e);
      throw e;
    }

    try {
      // There was an error handling on the last stream, so close it
      if (currentInProgressLedger != null) {
        currentInProgressLedger.close();
      }
      currentInProgressLedger = bookKeeperClient.createLedger(ensembleSize,
          quorumSize, BookKeeper.DigestType.MAC, digestPw.getBytes());
    } catch (BKException e) {
      bkException("BookKeeper error creating ledger for txId " + txId, e);
    } catch (InterruptedException e) {
      interruptedException("Interrupted creating ledger for txId " + txId, e);
    }

    // Create the metadata associated with the edit log segment starting at
    // txId in ZooKeeper (a last txid of -1 marks the segment as in-progress)
    EditLogLedgerMetadata ledgerMetadata = new EditLogLedgerMetadata(
        FSConstants.LAYOUT_VERSION, currentInProgressLedger.getId(), txId, -1);
    String ledgerFullPath =
        metadataManager.fullyQualifiedPathForLedger(ledgerMetadata);
    metadataManager.writeEditLogLedgerMetadata(ledgerFullPath, ledgerMetadata);
    maxTxId.store(txId);
    currentInProgressMetadata.update(ledgerFullPath);

    // Used by recoverUnfinalizedSegments()
    currentInProgressPath = ledgerFullPath;

    BookKeeperEditLogOutputStream out = new BookKeeperEditLogOutputStream(
        currentInProgressLedger, zkParentPath, metrics);
    out.create(); // Write the ledger header and flush it to BookKeeper

    // Injection point (InjectionEvent.BKJM_STARTLOGSEGMENT)
    InjectionHandler.processEvent(InjectionEvent.BKJM_STARTLOGSEGMENT,
        ledgerMetadata);
    return out;
  }

  @Override
  public void finalizeLogSegment(long firstTxId, long lastTxId)
      throws IOException {
    checkEnv();

    try {
      // First, find an in-progress ledger starting at firstTxId
      Versioned<EditLogLedgerMetadata> inProgressMetaAndVersion =
          metadataManager.findInProgressLedger(firstTxId);

      if (inProgressMetaAndVersion == null) {
        throw new IOException(
            "Cannot find metadata for an in-progress ledger with first txId "
                + firstTxId);
      }

      EditLogLedgerMetadata inProgressMeta = inProgressMetaAndVersion.getEntry();

      if (currentInProgressLedger != null) {
        long inProgressLedgerId = currentInProgressLedger.getId();

        if (inProgressMeta.getLedgerId() == inProgressLedgerId) {
          // If the segment is already // If the segment is currently
          // in-progress, then finalize the ledger (this ensures every entry
          // in the ledger committed to the BookKeeper quorum)
          try {
            currentInProgressLedger.close();
          } catch (BKException e) {
            bkException("Unexpected BookKeeper error closing ledger id " +
                inProgressLedgerId, e);
          } catch (InterruptedException e) {
            interruptedException("Interrupted closing ledger id " +
                inProgressLedgerId, e);
          }
          currentInProgressPath = null;
          currentInProgressLedger = null;
        } else { // We can not finalize a ledger that is not in-progress
          throw new IOException("Current in-progress ledger has ledger id (" +
              inProgressLedgerId + ") different from expected ledger id " +
              inProgressMeta.getLedgerId());
        }
      }

      // Set lastTxId in the metadata and persist it to ZooKeeper
      EditLogLedgerMetadata finalizedMeta =
          inProgressMeta.finalizeWithLastTxId(lastTxId);
      String finalizedPath =
          metadataManager.fullyQualifiedPathForLedger(finalizedMeta);
      if (LOG.isDebugEnabled()) {
        LOG.debug("Attempting to finalize metadata " + finalizedMeta +
            " to ZNode " + finalizedPath);
      }
      if (!metadataManager.writeEditLogLedgerMetadata(finalizedPath, finalizedMeta)
          && !metadataManager.verifyEditLogLedgerMetadata(inProgressMeta, finalizedPath)) {
        throw new IOException("Node " + finalizedPath +
            " already exists, but data doesn't match " + finalizedMeta);
      }
      maxTxId.store(lastTxId);


      // Find the ZNode path for the metadata associated with the in-progress
      // version of the ledger
      String lastInProgressPath =
          metadataManager.fullyQualifiedPathForLedger(inProgressMeta);
      String inProgressPathFromCiMeta = currentInProgressMetadata.read();
      if (lastInProgressPath.equals(inProgressPathFromCiMeta)) {
        // If the ZNode path matches the ZNode path for the current in-progress
        // metadata, then clear the current in-progress metadata
        currentInProgressMetadata.clear();
      }

      // Delete the in-progress metadata iff no one else has updated it in
      // the mean while
      if (!metadataManager.deleteLedgerMetadata(inProgressMeta,
          inProgressMetaAndVersion.getVersion())) {
        throw new IOException(
            "Unable to delete in-progress znode " + lastInProgressPath +
                " as it no longer exists (Deleted by another process?)");
      }
    } catch (IOException e) {
      LOG.error("Unable to finalized metadata for segment with firstTxId " +
          firstTxId + ", lastTxId " + lastTxId, e);
      throw e;
    }
  }

  /**
   * An implementation of {@link LedgerHandleProvider} that fences the
   * ledger we are reading from, allowing the ledger to be recovered by
   * BookKeeper as we validate it.
   *
   * @see BookKeeperEditLogInputStream#validateEditLog(LedgerHandleProvider, EditLogLedgerMetadata)
   */
  class FencingLedgerHandleProvider implements LedgerHandleProvider {

    @Override
    public LedgerHandle openForReading(long ledgerId) throws IOException {
      try {
        LOG.info("Opening ledger id " + ledgerId + " for recovery...");

        LedgerHandle lh = bookKeeperClient.openLedger(ledgerId,
            BookKeeper.DigestType.MAC, digestPw.getBytes());

        if (lh.getId() != ledgerId) { // Verify that correct ledger is opened
          throw new IllegalStateException("Ledger id " + lh.getId() +
              " does not match requested ledger id " + ledgerId);
        }

        LOG.info("Opened ledger id " + ledgerId + " for recovery!");
        return lh;
      } catch (BKException e) {
        bkException("BookKeeper error opening ledger id " + ledgerId +
            " for recovery", e);
      } catch (InterruptedException e) {
        interruptedException("Interrupted opening ledger id " + ledgerId +
            "for recovery", e);
      }
      return null;
    }

  }

  @VisibleForTesting
  long validateAndGetEndTxId(EditLogLedgerMetadata ledger) throws IOException {
    return validateAndGetEndTxId(ledger, false);
  }

  long validateAndGetEndTxId(EditLogLedgerMetadata ledger, boolean fence)
      throws IOException {
    FSEditLogLoader.EditLogValidation val;
    if (!fence) {
      val = BookKeeperEditLogInputStream.validateEditLog(this, ledger);
    } else {
      val = BookKeeperEditLogInputStream.validateEditLog(
          new FencingLedgerHandleProvider(), ledger);
    }
    InjectionHandler.processEvent(InjectionEvent.BKJM_VALIDATELOGSEGMENT,
        val);
    if (val.getNumTransactions() == 0) {
      return HdfsConstants.INVALID_TXID; // Ledger is corrupt
    }
    return val.getEndTxId();
  }

  private List<EditLogLedgerMetadata> getLedgers(long fromTxId) throws IOException {
    Collection<EditLogLedgerMetadata> allLedgers =
        metadataManager.listLedgers(true);
    List<EditLogLedgerMetadata> ledgers = new ArrayList<EditLogLedgerMetadata>();
    for (EditLogLedgerMetadata ledger : allLedgers) {
      if (ledger.getLastTxId() != -1 &&
          fromTxId > ledger.getFirstTxId() &&
          fromTxId <= ledger.getLastTxId()) {
        throw new IOException("Asked for fromTxId " + fromTxId +
            " which is in the middle of " + ledger);
      }
      if (fromTxId <= ledger.getFirstTxId()) {
        ledgers.add(ledger);
      }
    }

    return ledgers;
  }

  private long findMaxTransaction() throws IOException {
    List<EditLogLedgerMetadata> ledgers = getLedgers(0);
    synchronized (this) {
      for (EditLogLedgerMetadata ledgerMetadata : ledgers) {
        if (ledgerMetadata.getLastTxId() == -1) {
          maxSeenTxId = Math.max(ledgerMetadata.getFirstTxId(), maxSeenTxId);
        }
        maxSeenTxId = Math.max(ledgerMetadata.getLastTxId(), maxSeenTxId);
      }
    }
    return maxSeenTxId;
  }

  /**
   * For edit log segment that contains transactions with ids earlier than the
   * earliest txid to be retained, remove the ZooKeeper-based metadata and
   * BookKeeper ledgers associated with these segments.
   *
   * @param minTxIdToKeep the earliest txid that must be retained after purging
   *                      old logs
   * @throws IOException If there is an error talking to BookKeeper or
   *                     ZooKeeper
   */
  @Override
  public void purgeLogsOlderThan(long minTxIdToKeep) throws IOException {
    checkEnv();

    Collection<EditLogLedgerMetadata> ledgers =
        metadataManager.listLedgers(false); // Don't list in-progress ledgers

    for (EditLogLedgerMetadata ledger : ledgers) {
      if (ledger.getFirstTxId() < minTxIdToKeep  &&
          ledger.getLastTxId() < minTxIdToKeep) {
        LOG.info("Purging edit log segment: " + ledger);

        // Try to delete the associated ZooKeeper metadata
        if (!metadataManager.deleteLedgerMetadata(ledger, -1)) {
          // It's possible that another process has already deleted the
          // metadata
          LOG.warn(ledger + " has already been purged!");
        } else {
          try {
            // Remove the ledger from BookKeeper itself to reclaim diskspace.
            bookKeeperClient.deleteLedger(ledger.getLedgerId());
          } catch (BKException e) {
            bkException("Unrecoverable error deleting " + ledger +
                " from BookKeeper", e);
          } catch (InterruptedException e) {
            interruptedException("Interrupted deleting " + ledger +
                " from BookKeeper", e);
          }
        }
      }
    }
  }
 
  /**
   * Intentionally empty in this implementation: both arguments are ignored.
   *
   * @param txid Ignored
   * @param force Ignored
   */
  @Override
  public void setCommittedTxId(long txid, boolean force) {
  }

  /**
   * Recovers every un-finalized segment other than the one this instance is
   * currently writing. Zero-length ledgers are deleted together with their
   * metadata, ledgers that fail validation are moved aside as corrupt, and
   * all others are fenced, validated and finalized with the last transaction
   * id found in them.
   *
   * @throws IOException If there is an error talking to BookKeeper or
   *                     ZooKeeper
   */
  @Override
  synchronized public void recoverUnfinalizedSegments() throws IOException {
    checkEnv();

    Collection<EditLogLedgerMetadata> allLedgers =
        metadataManager.listLedgers(true);

    for (EditLogLedgerMetadata ledger : allLedgers) {
      if (ledger.getLastTxId() != -1) {
        continue; // Only un-finalized segments may be recovered
      }

      String ledgerPath = metadataManager.fullyQualifiedPathForLedger(ledger);
      if (currentInProgressPath != null &&
          ledgerPath.equals(currentInProgressPath)) {
        // Do not recover the current in-progress segment
        continue;
      }

      // First open the ledger without fencing in order to check the length
      // of the ledger (to check for any zero-length ledgers that may have
      // been the result of a crash).
      LedgerHandle ledgerHandle = openForReading(ledger.getLedgerId());
      try {
        if (ledgerHandle.getLength() == 0) {
          handleZeroLengthLedger(ledger); // Delete any zero-length ledgers
          continue;
        }
      } finally {
        // Runs on the "continue" path above as well, so the handle used for
        // the length check is always closed
        try {
          ledgerHandle.close();
        } catch (BKException e) {
          bkException("BookKeeper error closing ledger id " +
              ledger.getLedgerId(), e);
        } catch (InterruptedException e) {
          interruptedException("Interrupted closing ledger id " +
          ledger.getLedgerId(), e);
        }
      }

      // Fence the ledger and validate it as it's being recovered by BookKeeper
      long endTxId = validateAndGetEndTxId(ledger, true);

      findMaxTransaction(); // Update maxTxId seen so far by this instance

      if (endTxId == HdfsConstants.INVALID_TXID) {
        // Validation found no transactions in the ledger
        LOG.warn(ledger + "(" + ledgerPath + ")" + " cannot be recovered!");
        metadataManager.moveAsideCorruptLedger(ledger);
        continue;
      }

      // Now finalize the ledger
      finalizeLogSegment(ledger.getFirstTxId(), endTxId);
    }
  }

  private void handleZeroLengthLedger(EditLogLedgerMetadata ledger)
      throws IOException {
    LOG.warn("In-progress edit log segment " + ledger + " refers to an " +
       "empty edit log segment. This occurs when NameNode crashes after " +
       "opening a segment, but before writing OP_START_LOG_SEGMENT. Will " +
       "delete the ledger and the metadata.");
    if (maxTxId.get() == ledger.getFirstTxId()) {
      LOG.warn("maxTxId is set to " + ledger.getFirstTxId() + " which is " +
          "belongs to an empty ledger. Resetting to previous maxTxId.");
      maxTxId.set(maxTxId.get() - 1);
    }
    metadataManager.deleteLedgerMetadata(ledger, -1);
    try {
      bookKeeperClient.deleteLedger(ledger.getLedgerId());
    } catch (BKException e) {
      bkException("BookKeeper error deleting empty ledger id " +
          ledger.getLedgerId(), e);
    } catch (InterruptedException e) {
      interruptedException(
          "Interrupted deleting empty ledger id " +
              ledger.getLedgerId(), e);
    }
  }

  @Override
  public RemoteEditLogManifest getEditLogManifest(long fromTxId)
      throws IOException {
    Collection<EditLogLedgerMetadata> ledgers =
        metadataManager.listLedgers(true);
    LOG.info("Ledgers to include in manifest: " + ledgers);

    List<RemoteEditLog> ret = Lists.newArrayListWithCapacity(ledgers.size());

    for (EditLogLedgerMetadata ledger : ledgers) {
      long endTxId = ledger.getLastTxId();
      boolean isInProgress = endTxId == -1;
      if (isInProgress) {
        endTxId = validateAndGetEndTxId(ledger);
      }

      if (endTxId == HdfsConstants.INVALID_TXID) {
        continue;
      }

      if (ledger.getFirstTxId() >= fromTxId) {
        ret.add(new RemoteEditLog(ledger.getFirstTxId(),
            endTxId,
            isInProgress));
      } else if ((fromTxId > ledger.getFirstTxId()) &&
                 (fromTxId <= endTxId)) {
        throw new IOException("Asked for firstTxId " + fromTxId +
            " which is in the middle of ledger " + ledger);
      }
    }

    Collections.sort(ret);
    return new RemoteEditLogManifest(ret, false);
  }

  private void closeBk() throws IOException {
    try {
      bookKeeperClient.close();
    } catch (BKException e) {
      bkException("Error closing BookKeeper client", e);
    } catch (InterruptedException e) {
      interruptedException("Interrupted closing BookKeeper client ", e);
    }
  }

  /**
   * Closes the ZooKeeper client wrapper.
   *
   * @throws IOException If closing the client is interrupted
   */
  private void closeZk() throws IOException {
    try {
      zk.close();
    } catch (InterruptedException e) {
      interruptedException("Interrupted closing ZooKeeper client", e);
    }
  }

  @Override
  public void close() throws IOException {
    try {
      closeBk();
    } finally {
      if (!Thread.currentThread().isInterrupted()) {
        closeZk();
      }
    }
  }

  @Override
  public void selectInputStreams(Collection<EditLogInputStream> streams,
      long fromTxId,
      boolean inProgressOk,
      boolean validateInProgressSegments) throws IOException {
    Collection<EditLogLedgerMetadata> allLedgers = getLedgers(fromTxId);
    if (LOG.isDebugEnabled()) {
      LOG.debug(this + ": selecting input streams starting at " + fromTxId +
          (inProgressOk ? " (inProgress ok) " : "(excluding inProgress) " ) +
          "from among " + allLedgers.size() + " candidate ledger(s).");
    }
    addStreamsToCollectionFromLedgers(allLedgers, streams, fromTxId,
        inProgressOk, validateInProgressSegments);
  }

  void addStreamsToCollectionFromLedgers(
      Collection<EditLogLedgerMetadata> allLedgers,
      Collection<EditLogInputStream> streams, long fromTxId,
      boolean inProgressOk, boolean validateInProgressSegments) throws IOException {
    for (EditLogLedgerMetadata ledger : allLedgers) {
      long endTxId = ledger.getLastTxId();
      if (endTxId == -1) {
        if (!inProgressOk) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Passing over " + ledger + " because it is in progress " +
                " and we are ignoring in-progress logs.");
            continue;
          }
        }
        if (validateInProgressSegments) {
          try {
            endTxId = validateAndGetEndTxId(ledger);
          } catch (IOException e)  {
            LOG.error("Got an IOException while trying to validate header of "
                + ledger + ". Skipping.", e);
            continue;
          }
        } else {
          LOG.info("Skipping validation of edit segment: " + ledger);
        }
      }
      if (endTxId != HdfsConstants.INVALID_TXID && endTxId < fromTxId) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Passing over " + ledger + " because it ends at " +
              endTxId + ", but we only care about transaction as new as " +
              fromTxId);
        }
        continue;
      }
      BookKeeperEditLogInputStream bkelis = new BookKeeperEditLogInputStream(
          this, ledger.getLedgerId(), 0, ledger.getFirstTxId(), endTxId,
          ledger.getLastTxId() == -1);
      bkelis.setJournalManager(this);
      streams.add(bkelis);
    }
  }

  /**
   * Reports whether this journal's parent ZNode exists in ZooKeeper.
   *
   * @return True if the parent ZNode exists
   * @throws IOException If there is an error talking to ZooKeeper
   */
  @Override
  public boolean hasSomeJournalData() throws IOException {
    return zkPathExists(zkParentPath);
  }
 
  /**
   * Always reports that no image data is present.
   *
   * @return Always false
   */
  @Override
  public boolean hasSomeImageData() throws IOException {
    return false;
  }

  /**
   * Returns a fixed label for this journal.
   *
   * @return The constant string "BKJM journal"
   */
  @Override
  public String toHTMLString() {
    return "BKJM journal";
  }

  /**
   * Always reports that this journal manager has no image storage.
   *
   * @return Always false
   */
  @Override
  public boolean hasImageStorage() {
    return false;
  }

  /**
   * Not implemented yet.
   *
   * @return Always null
   */
  @Override
  public RemoteStorageState analyzeJournalStorage() {
    // TODO
    return null;
  }
}
TOP

Related Classes of org.apache.hadoop.hdfs.server.namenode.bookkeeper.BookKeeperJournalManager$FencingLedgerHandleProvider

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.