
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;

import java.io.IOException;
import java.net.InetSocketAddress;

import java.net.URL;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.impl.Log4JLogger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.qjournal.MiniJournalCluster;
import org.apache.hadoop.hdfs.qjournal.protocol.JournalConfigKeys;
import org.apache.hadoop.hdfs.qjournal.server.Journal;
import org.apache.hadoop.hdfs.qjournal.server.JournalNode;
import org.apache.hadoop.hdfs.server.namenode.AvatarNode;
import org.apache.hadoop.hdfs.server.namenode.FSEditLog;
import org.apache.hadoop.hdfs.server.namenode.FSImageTestUtil.CheckpointTrigger;
import org.apache.hadoop.hdfs.util.InjectionEvent;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.InjectionEventI;
import org.apache.hadoop.util.InjectionHandler;
import org.apache.log4j.Level;

import static org.mockito.Matchers.anyInt;
import static org.mockito.Mockito.doNothing;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;

import org.junit.After;
import org.junit.AfterClass;
import static org.junit.Assert.*;
import org.junit.BeforeClass;
import org.junit.Test;

public class TestAvatarQJMFailures {

  {
    ((Log4JLogger)Journal.LOG).getLogger().setLevel(Level.ALL);
  }
 
  private static final Log LOG = LogFactory.getLog(TestAvatarQJMFailures.class);
 
  private static final int WRITE_TXID_TIMEOUT = 1000;
 
  private Configuration conf;
  private MiniAvatarCluster cluster;
  private FileSystem fs;
  private MiniJournalCluster journalCluster;
 
  private TestAvatarQJMFailuresHandler handler;

  @BeforeClass
  public static void setUpBeforeClass() throws Exception {
    MiniAvatarCluster.createAndStartZooKeeper();
  }

  @AfterClass
  public static void tearDownAfterClass() throws Exception {
    MiniAvatarCluster.shutDownZooKeeper();
  }

  public void setUp(String name) throws Exception {
    setUp(new Configuration(), null, name);
  }

  public void setUp(Configuration confg, MiniJournalCluster jCluster,
      String name)
      throws Exception {
    LOG.info("START TEST : " + name);
   
    handler = new TestAvatarQJMFailuresHandler();
    InjectionHandler.set(handler);

    FSEditLog.setRuntimeForTesting(Runtime.getRuntime());
    conf = confg;
    if (jCluster == null) {
      cluster = new MiniAvatarCluster.Builder(conf).numDataNodes(1)
          .enableQJM(true).build();
    } else {
      cluster = new MiniAvatarCluster.Builder(conf).numDataNodes(1)
          .enableQJM(true).setJournalCluster(jCluster).build();
    }
    fs = cluster.getFileSystem();
    journalCluster = cluster.getJournalCluster();
   
  }

  @After
  public void tearDown() throws Exception {
    if (fs != null)
      fs.close();
    if (cluster != null)
      cluster.shutDown();
    InjectionHandler.clear();
  }

  public static class TestAvatarQJMFailuresHandler extends InjectionHandler {
   
    int currentJournalHttpPort = -1;
    volatile boolean simulateCrash = false;
    volatile boolean instantiatedIngest = false;
    final CheckpointTrigger checkpointTrigger = new CheckpointTrigger();
    volatile boolean simulateJNFailureForWrite = false;
    volatile int journalFailures = 0;
    volatile int currentJournalRPCPort = -1;
    volatile boolean simulateJournalHang = false;

    @Override
    protected void _processEvent(InjectionEventI event, Object... args) {
      if (event == InjectionEvent.QJM_URLLOGEDITLOGSTREAM_NEXTOP) {
        currentJournalHttpPort = ((URL) args[0]).getPort();
      } else if (event == InjectionEvent.INGEST_BEFORE_LOAD_EDIT) {
        instantiatedIngest = true;
      }
      checkpointTrigger.checkpointDone(event, args);
    }

    @Override
    protected void _processEventIO(InjectionEventI event, Object... args)
        throws IOException {
      if (event == InjectionEvent.QJM_JOURNALNODE_JOURNAL) {
        int port = ((InetSocketAddress) args[1]).getPort();
        if (port == currentJournalRPCPort) {
          if (simulateJNFailureForWrite) {
            journalFailures++;
            throw new IOException("Simulating journal failure");
          } else if (simulateJournalHang) {
            journalFailures++;
            // Hang long enough for the timeout.
            DFSTestUtil.waitNMilliSecond(WRITE_TXID_TIMEOUT * 5);
          }
        }
      }
    }

    @Override
    protected boolean _trueCondition(InjectionEventI event, Object... args) {
      if (event == InjectionEvent.FSNAMESYSTEM_CLOSE_DIRECTORY) {
        return !simulateCrash;
      }
      return true;
    }

    @Override
    protected boolean _falseCondition(InjectionEventI event, Object... args) {
      return checkpointTrigger.triggerCheckpoint(event);
    }
   
    void doCheckpoint() throws IOException {
      checkpointTrigger.doCheckpoint();
    }
  }

  /**
   * Tests that if a single journal node hangs, we can still proceed normally
   * with all operations.
   */
  @Test
  public void testJournalNodeHang() throws Exception {
    // Set a timeout of 1 second.
    Configuration conf = new Configuration();
    conf.setInt(JournalConfigKeys.DFS_QJOURNAL_WRITE_TXNS_TIMEOUT_KEY,
        WRITE_TXID_TIMEOUT);
    setUp(conf, null, "testJournalNodeHang");

    // Get the current journal port.
    JournalNode jn = getStandbyTailingJN();

    DFSTestUtil.createFile(fs, new Path("/test"), 1024, (short) 1, 0);
    handler.simulateJournalHang = true;

    // Ensure we can still write.
    DFSTestUtil.createFile(fs, new Path("/test1"), 1024, (short) 1, 0);

    // Ensure we can still do checkpoints.
    handler.doCheckpoint();
    DFSTestUtil.createFile(fs, new Path("/test2"), 1024, (short) 1, 0);

    // Verify the JN is lagging.
    assertFalse(cluster.getPrimaryAvatar(0).avatar.getLastWrittenTxId() == jn
        .getJournal((MiniAvatarCluster.JID + "/zero").getBytes())
        .getHighestWrittenTxId());

    // Ensure we can failover.
    cluster.failOver();
    cluster.restartStandby();

    // Ensure we can write after failover.
    DFSTestUtil.createFile(fs, new Path("/test3"), 1024, (short) 1, 0);

    // Ensure we can still do checkpoints.
    handler.doCheckpoint();

    // Ensure that we did end up hanging a journal node.
    DFSTestUtil.createFile(fs, new Path("/test4"), 1024, (short) 1, 0);
    assertTrue(handler.journalFailures > 0);
    assertFalse(cluster.getPrimaryAvatar(0).avatar.getLastWrittenTxId() == jn
        .getJournal((MiniAvatarCluster.JID + "/one").getBytes())
        .getHighestWrittenTxId());

    handler.simulateJournalHang = false;
  }


  private JournalNode getStandbyTailingJN() {
    assertTrue(handler.currentJournalHttpPort != -1);
    // Find the journal node the Standby is tailing from.
    JournalNode jn = null;
    for (JournalNode j : journalCluster.getJournalNodes()) {
      if (j.getBoundHttpAddress().getPort() == handler.currentJournalHttpPort) {
        jn = j;
        break;
      }
    }
    handler.currentJournalRPCPort = jn.getBoundIpcAddress().getPort();
    return jn;
  }

  /**
   * Tests that if the JournalNode from which the standby is tailing fails, the
   * Standby can still continue tailing from another JournalNode without issues.
   */
  @Test
  public void testStandbyTailingJNFailure() throws Exception {
    setUp("testStandbyTailingJNFailure");

    DFSTestUtil.createFile(fs, new Path("/test"), 1024, (short) 1, 0);
    AvatarNode standby = cluster.getStandbyAvatar(0).avatar;
    handler.doCheckpoint();

    // Find the journal node the Standby is tailing from.
    JournalNode jn = getStandbyTailingJN();
    assertNotNull(jn);

    jn.stopAndJoin(0);
    handler.instantiatedIngest = false;

    LOG.info("Stopped JournalNode with port "
        + jn.getBoundHttpAddress().getPort());
    LOG.info("current journal port : " + handler.currentJournalHttpPort);

    DFSTestUtil.createFile(fs, new Path("/test1"), 1024, (short) 1, 0);

    // Wait for ingest.
    long start = System.currentTimeMillis();
    while (!handler.instantiatedIngest
        && System.currentTimeMillis() - start < 10000) {
      DFSTestUtil.waitSecond();
    }

    assertTrue(handler.instantiatedIngest);

    LOG.info("Ingest failures : "
        + standby.getAvatarNodeMetrics().numIngestFailures.get());
    assertTrue(standby.getAvatarNodeMetrics().numIngestFailures
        .get() <= 1);
  }

  /**
   * This tests that if the journal node that the Standby is tailing from fails
   * only from the perspective of the primary avatar, the Standby should still
   * be able to switch to other journal nodes and continue ingesting.
   */
  @Test
  public void testStandbyTailingJNFailureForWrite() throws Exception {
    setUp("testStandbyTailingJNFailureForWrite");

    // Create some files and ensure we know which JN the Standby is tailing
    // from.
    DFSTestUtil.createFile(fs, new Path("/test"), 1024, (short) 1, 0);
    getStandbyTailingJN();

    // Now ensure the Primary excludes the JN the standby is tailing from.
    handler.simulateJNFailureForWrite = true;
    DFSTestUtil.createFile(fs, new Path("/test1"), 1024, (short) 1, 0);
    assertTrue(handler.journalFailures > 0);

    // Now ensure we can do a failover safely.
    cluster.failOver();
  }

  /**
   * Tests that we can still continue running if the journal cluster fails.
   */
  @Test
  public void testJournalClusterFailure() throws Exception {
    setUp("testJournalClusterFailure");
    DFSTestUtil.createFile(fs, new Path("/test"), 1024, (short) 1, 0);
    // Ensure standby is down, otherwise it would cause a System.exit() because
    // of failed checkpointing.
    cluster.killStandby();
    journalCluster.shutdown();
    DFSTestUtil.createFile(fs, new Path("/test1"), 1024, (short) 1, 0);
  }

  /**
   * Tests that if the journal cluster is marked as required, we cannot
   * tolerate its failure.
   */
  @Test
  public void testJournalClusterFailureWhenRequired() throws Exception {
    Configuration conf = new Configuration();
    journalCluster = new MiniJournalCluster.Builder(conf).numJournalNodes(
        3).build();
    String journalURI = journalCluster.getQuorumJournalURI(
        MiniAvatarCluster.JID).toString();
    conf.set("dfs.name.edits.dir.required", journalURI + "/zero," + journalURI
        + "/one");

    setUp(conf, journalCluster, "testJournalClusterFailureWhenRequired");

    // spy the runtime
    Runtime runtime = Runtime.getRuntime();
    runtime = spy(runtime);
    doNothing().when(runtime).exit(anyInt());
    FSEditLog.setRuntimeForTesting(runtime);


    // Kill standby to ensure only 1 runtime.exit();
    cluster.killStandby();
    journalCluster.shutdown();
    fs.create(new Path("/test1"));

    // verify failure.
    verify(runtime, times(1)).exit(anyInt());
  }

  /**
   * Tests that if the primary crashes multiple times and restarts resulting in
   * multiple logs with no END_LOG_SEGMENT, then the Standby can still
   * successfully ingest these logs.
   */
  @Test
  public void testMultiplePrimaryCrash() throws Exception {
    setUp("testMultiplePrimaryCrash");
    handler.simulateCrash = true;

    for (int i = 0; i < 3; i++) {
      // Create a few unfinalized segments.
      DFSTestUtil.createFile(fs, new Path("/test" + i), 1024, (short) 1, 0);
      cluster.killPrimary();
      cluster.getNameNode(0).unlockStorageDirectory("zero");
      cluster.getNameNode(0).unlockStorageDirectory("one");
      cluster.restartAvatarNodes();
    }

    DFSTestUtil.createFile(fs, new Path("/finaltest"), 1024, (short) 1, 0);

    long lastTxid = cluster.getPrimaryAvatar(0).avatar.getLastWrittenTxId();

    // Force failover since Primary is in simulate crash mode.
    cluster.failOver(true);

    // Standby will have one more txid after failover due to START_LOG_SEGMENT.
    assertEquals(lastTxid + 1,
        cluster.getPrimaryAvatar(0).avatar.getLastWrittenTxId());
  }
}