/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.resourcemanager;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.ha.HAServiceProtocol;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
import org.apache.hadoop.ha.HealthCheckFailedException;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.conf.HAUtil;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.EventHandler;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
public class TestRMHA {
private Log LOG = LogFactory.getLog(TestRMHA.class);
private final Configuration configuration = new YarnConfiguration();
private MockRM rm = null;
private static final String STATE_ERR =
"ResourceManager is in wrong HA state";
private static final String RM1_ADDRESS = "0.0.0.0:0";
private static final String RM1_NODE_ID = "rm1";
private static final String RM2_ADDRESS = "1.1.1.1:1";
private static final String RM2_NODE_ID = "rm2";
@Before
public void setUp() throws Exception {
configuration.setBoolean(YarnConfiguration.RM_HA_ENABLED, true);
configuration.set(YarnConfiguration.RM_HA_IDS, RM1_NODE_ID + "," + RM2_NODE_ID);
for (String confKey : YarnConfiguration.RM_SERVICES_ADDRESS_CONF_KEYS) {
configuration.set(HAUtil.addSuffix(confKey, RM1_NODE_ID), RM1_ADDRESS);
configuration.set(HAUtil.addSuffix(confKey, RM2_NODE_ID), RM2_ADDRESS);
}
configuration.set(YarnConfiguration.RM_HA_ID, RM1_NODE_ID);
}
private void checkMonitorHealth() throws IOException {
try {
rm.adminService.monitorHealth();
} catch (HealthCheckFailedException e) {
fail("The RM is in bad health: it is Active, but the active services " +
"are not running");
}
}
private void checkStandbyRMFunctionality() throws IOException {
assertEquals(STATE_ERR, HAServiceState.STANDBY,
rm.adminService.getServiceStatus().getState());
assertFalse("Active RM services are started",
rm.areActiveServicesRunning());
assertTrue("RM is not ready to become active",
rm.adminService.getServiceStatus().isReadyToBecomeActive());
}
private void checkActiveRMFunctionality() throws IOException {
assertEquals(STATE_ERR, HAServiceState.ACTIVE,
rm.adminService.getServiceStatus().getState());
assertTrue("Active RM services aren't started",
rm.areActiveServicesRunning());
assertTrue("RM is not ready to become active",
rm.adminService.getServiceStatus().isReadyToBecomeActive());
try {
rm.getNewAppId();
rm.registerNode("127.0.0.1:0", 2048);
rm.submitApp(1024);
} catch (Exception e) {
fail("Unable to perform Active RM functions");
LOG.error("ActiveRM check failed", e);
}
}
/**
* Test to verify the following RM HA transitions to the following states.
* 1. Standby: Should be a no-op
* 2. Active: Active services should start
* 3. Active: Should be a no-op.
* While active, submit a couple of jobs
* 4. Standby: Active services should stop
* 5. Active: Active services should start
* 6. Stop the RM: All services should stop and RM should not be ready to
* become Active
*/
@Test (timeout = 30000)
public void testStartAndTransitions() throws IOException {
Configuration conf = new YarnConfiguration(configuration);
rm = new MockRM(conf);
rm.init(conf);
StateChangeRequestInfo requestInfo = new StateChangeRequestInfo(
HAServiceProtocol.RequestSource.REQUEST_BY_USER);
assertEquals(STATE_ERR, HAServiceState.INITIALIZING,
rm.adminService.getServiceStatus().getState());
assertFalse("RM is ready to become active before being started",
rm.adminService.getServiceStatus().isReadyToBecomeActive());
checkMonitorHealth();
rm.start();
checkMonitorHealth();
checkStandbyRMFunctionality();
// 1. Transition to Standby - must be a no-op
rm.adminService.transitionToStandby(requestInfo);
checkMonitorHealth();
checkStandbyRMFunctionality();
// 2. Transition to active
rm.adminService.transitionToActive(requestInfo);
checkMonitorHealth();
checkActiveRMFunctionality();
// 3. Transition to active - no-op
rm.adminService.transitionToActive(requestInfo);
checkMonitorHealth();
checkActiveRMFunctionality();
// 4. Transition to standby
rm.adminService.transitionToStandby(requestInfo);
checkMonitorHealth();
checkStandbyRMFunctionality();
// 5. Transition to active to check Active->Standby->Active works
rm.adminService.transitionToActive(requestInfo);
checkMonitorHealth();
checkActiveRMFunctionality();
// 6. Stop the RM. All services should stop and RM should not be ready to
// become active
rm.stop();
assertEquals(STATE_ERR, HAServiceState.STOPPING,
rm.adminService.getServiceStatus().getState());
assertFalse("RM is ready to become active even after it is stopped",
rm.adminService.getServiceStatus().isReadyToBecomeActive());
assertFalse("Active RM services are started",
rm.areActiveServicesRunning());
checkMonitorHealth();
}
@Test
public void testTransitionsWhenAutomaticFailoverEnabled() throws IOException {
final String ERR_UNFORCED_REQUEST = "User request succeeded even when " +
"automatic failover is enabled";
Configuration conf = new YarnConfiguration(configuration);
conf.setBoolean(YarnConfiguration.AUTO_FAILOVER_ENABLED, true);
rm = new MockRM(conf);
rm.init(conf);
rm.start();
StateChangeRequestInfo requestInfo = new StateChangeRequestInfo(
HAServiceProtocol.RequestSource.REQUEST_BY_USER);
// Transition to standby
try {
rm.adminService.transitionToStandby(requestInfo);
fail(ERR_UNFORCED_REQUEST);
} catch (AccessControlException e) {
// expected
}
checkMonitorHealth();
checkStandbyRMFunctionality();
// Transition to active
try {
rm.adminService.transitionToActive(requestInfo);
fail(ERR_UNFORCED_REQUEST);
} catch (AccessControlException e) {
// expected
}
checkMonitorHealth();
checkStandbyRMFunctionality();
final String ERR_FORCED_REQUEST = "Forced request by user should work " +
"even if automatic failover is enabled";
requestInfo = new StateChangeRequestInfo(
HAServiceProtocol.RequestSource.REQUEST_BY_USER_FORCED);
// Transition to standby
try {
rm.adminService.transitionToStandby(requestInfo);
} catch (AccessControlException e) {
fail(ERR_FORCED_REQUEST);
}
checkMonitorHealth();
checkStandbyRMFunctionality();
// Transition to active
try {
rm.adminService.transitionToActive(requestInfo);
} catch (AccessControlException e) {
fail(ERR_FORCED_REQUEST);
}
checkMonitorHealth();
checkActiveRMFunctionality();
}
@Test
public void testRMDispatcherForHA() throws IOException {
String errorMessageForEventHandler =
"Expect to get the same number of handlers";
String errorMessageForService = "Expect to get the same number of services";
Configuration conf = new YarnConfiguration(configuration);
rm = new MockRM(conf) {
@Override
protected Dispatcher createDispatcher() {
return new MyCountingDispatcher();
}
};
rm.init(conf);
int expectedEventHandlerCount =
((MyCountingDispatcher) rm.getRMContext().getDispatcher())
.getEventHandlerCount();
int expectedServiceCount = rm.getServices().size();
assertTrue(expectedEventHandlerCount != 0);
StateChangeRequestInfo requestInfo = new StateChangeRequestInfo(
HAServiceProtocol.RequestSource.REQUEST_BY_USER);
assertEquals(STATE_ERR, HAServiceState.INITIALIZING,
rm.adminService.getServiceStatus().getState());
assertFalse("RM is ready to become active before being started",
rm.adminService.getServiceStatus().isReadyToBecomeActive());
rm.start();
//call transitions to standby and active a couple of times
rm.adminService.transitionToStandby(requestInfo);
rm.adminService.transitionToActive(requestInfo);
rm.adminService.transitionToStandby(requestInfo);
rm.adminService.transitionToActive(requestInfo);
rm.adminService.transitionToStandby(requestInfo);
rm.adminService.transitionToActive(requestInfo);
assertEquals(errorMessageForEventHandler, expectedEventHandlerCount,
((MyCountingDispatcher) rm.getRMContext().getDispatcher())
.getEventHandlerCount());
assertEquals(errorMessageForService, expectedServiceCount,
rm.getServices().size());
rm.adminService.transitionToStandby(requestInfo);
assertEquals(errorMessageForEventHandler, expectedEventHandlerCount,
((MyCountingDispatcher) rm.getRMContext().getDispatcher())
.getEventHandlerCount());
assertEquals(errorMessageForService, expectedServiceCount,
rm.getServices().size());
rm.stop();
}
@SuppressWarnings("rawtypes")
class MyCountingDispatcher extends AbstractService implements Dispatcher {
private int eventHandlerCount;
public MyCountingDispatcher() {
super("MyCountingDispatcher");
this.eventHandlerCount = 0;
}
@Override
public EventHandler getEventHandler() {
return null;
}
@Override
public void register(Class<? extends Enum> eventType, EventHandler handler) {
this.eventHandlerCount ++;
}
public int getEventHandlerCount() {
return this.eventHandlerCount;
}
}
}