
Source Code of com.cloud.cluster.ClusterManagerImpl

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
package com.cloud.cluster;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.ConnectException;
import java.net.InetSocketAddress;
import java.nio.channels.SocketChannel;
import java.rmi.RemoteException;
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.SQLRecoverableException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import javax.ejb.Local;
import javax.naming.ConfigurationException;

import org.apache.log4j.Logger;

import com.cloud.agent.AgentManager;
import com.cloud.agent.AgentManager.OnError;
import com.cloud.agent.api.Answer;
import com.cloud.agent.api.ChangeAgentAnswer;
import com.cloud.agent.api.ChangeAgentCommand;
import com.cloud.agent.api.Command;
import com.cloud.agent.api.PropagateResourceEventCommand;
import com.cloud.agent.api.TransferAgentCommand;
import com.cloud.agent.manager.Commands;
import com.cloud.cluster.agentlb.dao.HostTransferMapDao;
import com.cloud.cluster.dao.ManagementServerHostDao;
import com.cloud.cluster.dao.ManagementServerHostPeerDao;
import com.cloud.configuration.Config;
import com.cloud.configuration.dao.ConfigurationDao;
import com.cloud.exception.AgentUnavailableException;
import com.cloud.exception.OperationTimedoutException;
import com.cloud.host.Host;
import com.cloud.host.HostVO;
import com.cloud.host.Status.Event;
import com.cloud.host.dao.HostDao;
import com.cloud.resource.ResourceManager;
import com.cloud.resource.ResourceState;
import com.cloud.serializer.GsonHelper;
import com.cloud.utils.DateUtil;
import com.cloud.utils.NumbersUtil;
import com.cloud.utils.Profiler;
import com.cloud.utils.PropertiesUtil;
import com.cloud.utils.component.Adapters;
import com.cloud.utils.component.ComponentLocator;
import com.cloud.utils.component.Inject;
import com.cloud.utils.concurrency.NamedThreadFactory;
import com.cloud.utils.db.ConnectionConcierge;
import com.cloud.utils.db.DB;
import com.cloud.utils.db.SearchCriteria.Op;
import com.cloud.utils.db.SearchCriteria2;
import com.cloud.utils.db.SearchCriteriaService;
import com.cloud.utils.db.Transaction;
import com.cloud.utils.events.SubscriptionMgr;
import com.cloud.utils.exception.CloudRuntimeException;
import com.cloud.utils.exception.ExceptionUtil;
import com.cloud.utils.mgmt.JmxUtil;
import com.cloud.utils.net.NetUtils;
import com.google.gson.Gson;

@Local(value = { ClusterManager.class })
public class ClusterManagerImpl implements ClusterManager {
    private static final Logger s_logger = Logger.getLogger(ClusterManagerImpl.class);

    private static final int EXECUTOR_SHUTDOWN_TIMEOUT = 1000; // 1 second
    private static final int DEFAULT_OUTGOING_WORKERS = 5;

    private final List<ClusterManagerListener> _listeners = new ArrayList<ClusterManagerListener>();
    private final Map<Long, ManagementServerHostVO> _activePeers = new HashMap<Long, ManagementServerHostVO>();
    private int _heartbeatInterval = ClusterManager.DEFAULT_HEARTBEAT_INTERVAL;
    private int _heartbeatThreshold = ClusterManager.DEFAULT_HEARTBEAT_THRESHOLD;

    private final Map<String, ClusterService> _clusterPeers;
    private final Gson _gson;

    @Inject
    private AgentManager _agentMgr;
    @Inject
    private ClusteredAgentRebalanceService _rebalanceService;
    @Inject
    private ResourceManager _resourceMgr;

    private final ScheduledExecutorService _heartbeatScheduler = Executors.newScheduledThreadPool(1, new NamedThreadFactory("Cluster-Heartbeat"));
    private final ExecutorService _notificationExecutor = Executors.newFixedThreadPool(1, new NamedThreadFactory("Cluster-Notification"));
    private final List<ClusterManagerMessage> _notificationMsgs = new ArrayList<ClusterManagerMessage>();
    private ConnectionConcierge _heartbeatConnection = null;

    private final ExecutorService _executor;

    private ClusterServiceAdapter _currentServiceAdapter;

    private ManagementServerHostDao _mshostDao;
    private ManagementServerHostPeerDao _mshostPeerDao;
    private HostDao _hostDao;
    private HostTransferMapDao _hostTransferDao;

    //
    // pay attention to _mshostId and _msid
    // _mshostId is the primary key of the management host table
    // _msid is the unique persistent identifier that the peer name is based upon
    //
    private Long _mshostId = null;
    protected long _msId = ManagementServerNode.getManagementServerId();
    protected long _runId = System.currentTimeMillis();

    private boolean _peerScanInited = false;

    private String _name;
    private String _clusterNodeIP = "127.0.0.1";
    private boolean _agentLBEnabled = false;
    private double _connectedAgentsThreshold = 0.7;
    private static boolean _agentLbHappened = false;
   
    private List<ClusterServicePdu> _clusterPduOutgoingQueue = new ArrayList<ClusterServicePdu>();
    private List<ClusterServicePdu> _clusterPduIncomingQueue = new ArrayList<ClusterServicePdu>();
    private Map<Long, ClusterServiceRequestPdu> _outgoingPdusWaitingForAck = new HashMap<Long, ClusterServiceRequestPdu>();
   
    public ClusterManagerImpl() {
        _clusterPeers = new HashMap<String, ClusterService>();

        _gson = GsonHelper.getGson();

        // executor to perform remote-calls in another thread context, to avoid potential
        // recursive remote calls between nodes
        //
        _executor = Executors.newCachedThreadPool(new NamedThreadFactory("Cluster-Worker"));
    }
   
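    // Request PDUs that have been sent and are still waiting for a response are tracked here;
    // popRequestPdu() removes an entry when the matching ack arrives, and
    // cancelClusterRequestToPeer() wakes up callers blocked on PDUs destined for a dead peer.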
    private void registerRequestPdu(ClusterServiceRequestPdu pdu) {
        synchronized(_outgoingPdusWaitingForAck) {
            _outgoingPdusWaitingForAck.put(pdu.getSequenceId(), pdu);
        }
    }
   
    private ClusterServiceRequestPdu popRequestPdu(long ackSequenceId) {
        synchronized(_outgoingPdusWaitingForAck) {
            if(_outgoingPdusWaitingForAck.get(ackSequenceId) != null) {
                ClusterServiceRequestPdu pdu = _outgoingPdusWaitingForAck.get(ackSequenceId);
                _outgoingPdusWaitingForAck.remove(ackSequenceId);
                return pdu;
            }
        }
       
        return null;
    }
   
    private void cancelClusterRequestToPeer(String strPeer) {
        List<ClusterServiceRequestPdu> candidates = new ArrayList<ClusterServiceRequestPdu>();
        synchronized(_outgoingPdusWaitingForAck) {
            for(Map.Entry<Long, ClusterServiceRequestPdu> entry : _outgoingPdusWaitingForAck.entrySet()) {
                if(entry.getValue().getDestPeer().equalsIgnoreCase(strPeer))
                    candidates.add(entry.getValue());
            }

            for(ClusterServiceRequestPdu pdu : candidates) {
                _outgoingPdusWaitingForAck.remove(pdu.getSequenceId());
            }
        }
       
        for(ClusterServiceRequestPdu pdu : candidates) {
            s_logger.warn("Cancel cluster request PDU to peer: " + strPeer + ", pdu: " + _gson.toJson(pdu));
            synchronized(pdu) {
                pdu.notifyAll();
            }
        }
    }
   
    private void addOutgoingClusterPdu(ClusterServicePdu pdu) {
        synchronized(_clusterPduOutgoingQueue) {
            _clusterPduOutgoingQueue.add(pdu);
            _clusterPduOutgoingQueue.notifyAll();
        }
    }
   
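    // Blocking pop with a bounded wait: sleeps up to timeoutMs on the queue monitor and may
    // return null on timeout (or spurious wakeup), so callers are expected to loop.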
    private ClusterServicePdu popOutgoingClusterPdu(long timeoutMs) {
        synchronized(_clusterPduOutgoingQueue) {
            try {
                _clusterPduOutgoingQueue.wait(timeoutMs);
            } catch (InterruptedException e) {
            }

            if(_clusterPduOutgoingQueue.size() > 0) {
                ClusterServicePdu pdu = _clusterPduOutgoingQueue.get(0);
                _clusterPduOutgoingQueue.remove(0);
                return pdu;
            }
        }
        return null;
    }

    private void addIncomingClusterPdu(ClusterServicePdu pdu) {
        synchronized(_clusterPduIncomingQueue) {
            _clusterPduIncomingQueue.add(pdu);
            _clusterPduIncomingQueue.notifyAll();
        }
    }
   
    private ClusterServicePdu popIncomingClusterPdu(long timeoutMs) {
        synchronized(_clusterPduIncomingQueue) {
            try {
                _clusterPduIncomingQueue.wait(timeoutMs);
            } catch (InterruptedException e) {
            }

            if(_clusterPduIncomingQueue.size() > 0) {
                ClusterServicePdu pdu = _clusterPduIncomingQueue.get(0);
                _clusterPduIncomingQueue.remove(0);
                return pdu;
            }
        }
        return null;
    }
   
    private Runnable getClusterPduSendingTask() {
        return new Runnable() {
            public void run() {
                onSendingClusterPdu();
            }
        };
    }
   
    private Runnable getClusterPduNotificationTask() {
        return new Runnable() {
            public void run() {
                onNotifyingClusterPdu();
            }
        };
    }
   
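    // Outgoing worker loop: drains the outgoing PDU queue and delivers each PDU to the target
    // peer's ClusterService, retrying once (two attempts in total) and invalidating the cached
    // peer service stub on RemoteException.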
    private void onSendingClusterPdu() {
        while(true) {
            try {
                ClusterServicePdu pdu = popOutgoingClusterPdu(1000);
                if(pdu == null)
                    continue;
                 
                ClusterService peerService =  null;
                for(int i = 0; i < 2; i++) {
                    try {
                        peerService = getPeerService(pdu.getDestPeer());
                    } catch (RemoteException e) {
                        s_logger.error("Unable to get cluster service on peer : " + pdu.getDestPeer());
                    }

                    if(peerService != null) {
                        try {
                            if(s_logger.isDebugEnabled()) {
                                s_logger.debug("Cluster PDU " + getSelfPeerName() + " -> " + pdu.getDestPeer() + ". agent: " + pdu.getAgentId()
                                    + ", pdu seq: " + pdu.getSequenceId() + ", pdu ack seq: " + pdu.getAckSequenceId() + ", json: " + pdu.getJsonPackage());
                            }

                            long startTick = System.currentTimeMillis();
                            String strResult = peerService.execute(pdu);
                            if(s_logger.isDebugEnabled()) {
                                s_logger.debug("Cluster PDU " + getSelfPeerName() + " -> " + pdu.getDestPeer() + " completed. time: " +
                                    (System.currentTimeMillis() - startTick) + "ms. agent: " + pdu.getAgentId()
                                     + ", pdu seq: " + pdu.getSequenceId() + ", pdu ack seq: " + pdu.getAckSequenceId() + ", json: " + pdu.getJsonPackage());
                            }
                           
                            if("true".equals(strResult))
                                break;
                           
                        } catch (RemoteException e) {
                            invalidatePeerService(pdu.getDestPeer());
                            if(s_logger.isInfoEnabled()) {
                                s_logger.info("Exception on remote execution, peer: " + pdu.getDestPeer() + ", iteration: "
                                        + i + ", exception message :" + e.getMessage());
                            }
                        }
                    }
                }
            } catch(Throwable e) {
                s_logger.error("Unexcpeted exception: ", e);
            }
        }
    }
   
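    // Incoming dispatcher loop: hands each received PDU to the worker pool; response PDUs wake
    // up the blocked requester, while request PDUs are dispatched locally and answered with a
    // response PDU routed back through the outgoing queue.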
    private void onNotifyingClusterPdu() {
        while(true) {
            try {
                final ClusterServicePdu pdu = popIncomingClusterPdu(1000);
                if(pdu == null)
                    continue;

                _executor.execute(new Runnable() {
                    public void run() {
                        if(pdu.getPduType() == ClusterServicePdu.PDU_TYPE_RESPONSE) {
                            ClusterServiceRequestPdu requestPdu = popRequestPdu(pdu.getAckSequenceId());
                            if(requestPdu != null) {
                                requestPdu.setResponseResult(pdu.getJsonPackage());
                                synchronized(requestPdu) {
                                    requestPdu.notifyAll();
                                }
                            } else {
                                s_logger.warn("Original request has already been cancelled. pdu: " + _gson.toJson(pdu));
                            }
                        } else {
                            String result = dispatchClusterServicePdu(pdu);
                            if(result == null)
                                result = "";

                            if(pdu.getPduType() == ClusterServicePdu.PDU_TYPE_REQUEST) {
                                ClusterServicePdu responsePdu = new ClusterServicePdu();
                                responsePdu.setPduType(ClusterServicePdu.PDU_TYPE_RESPONSE);
                                responsePdu.setSourcePeer(pdu.getDestPeer());
                                responsePdu.setDestPeer(pdu.getSourcePeer());
                                responsePdu.setAckSequenceId(pdu.getSequenceId());
                                responsePdu.setJsonPackage(result);

                                addOutgoingClusterPdu(responsePdu);
                            }
                        }
                    }
                });
            } catch(Throwable e) {
                s_logger.error("Unexpected exception: ", e);
            }
        }
    }
   
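    // Decodes the JSON command package and executes it. ChangeAgentCommand and
    // TransferAgentCommand are intercepted and handled locally; all other commands are
    // forwarded to the owning agent via sendToAgent().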
    private String dispatchClusterServicePdu(ClusterServicePdu pdu) {

        if(s_logger.isDebugEnabled()) {
            s_logger.debug("Dispatch ->" + pdu.getAgentId() + ", json: " + pdu.getJsonPackage());
        }

        Command [] cmds = null;
        try {
            cmds = _gson.fromJson(pdu.getJsonPackage(), Command[].class);
        } catch(Throwable e) {
            assert(false);
            s_logger.error("Exception in gson decoding : ", e);
        }

        if (cmds == null) {
            return null;
        }

        if (cmds.length == 1 && cmds[0] instanceof ChangeAgentCommand) {  //intercepted
            ChangeAgentCommand cmd = (ChangeAgentCommand)cmds[0];

            if (s_logger.isDebugEnabled()) {
                s_logger.debug("Intercepting command for agent change: agent " + cmd.getAgentId() + " event: " + cmd.getEvent());
            }
            boolean result = false;
            try {
                result = executeAgentUserRequest(cmd.getAgentId(), cmd.getEvent());
                if (s_logger.isDebugEnabled()) {
                    s_logger.debug("Result is " + result);
                }

            } catch (AgentUnavailableException e) {
                s_logger.warn("Agent is unavailable", e);
                return null;
            }

            Answer[] answers = new Answer[1];
            answers[0] = new ChangeAgentAnswer(cmd, result);
            return _gson.toJson(answers);
        } else if (cmds.length == 1 && cmds[0] instanceof TransferAgentCommand) {
            TransferAgentCommand cmd = (TransferAgentCommand) cmds[0];

            if (s_logger.isDebugEnabled()) {
                s_logger.debug("Intercepting command for agent rebalancing: agent " + cmd.getAgentId() + " event: " + cmd.getEvent());
            }
            boolean result = false;
            try {
                result = rebalanceAgent(cmd.getAgentId(), cmd.getEvent(), cmd.getCurrentOwner(), cmd.getFutureOwner());
                if (s_logger.isDebugEnabled()) {
                    s_logger.debug("Result is " + result);
                }

            } catch (AgentUnavailableException e) {
                s_logger.warn("Agent is unavailable", e);
                return null;
            } catch (OperationTimedoutException e) {
                s_logger.warn("Operation timed out", e);
                return null;
            }
            Answer[] answers = new Answer[1];
            answers[0] = new Answer(cmd, result, null);
            return _gson.toJson(answers);
        }

        try {
            long startTick = System.currentTimeMillis();
            if(s_logger.isDebugEnabled()) {
                s_logger.debug("Dispatch -> " + pdu.getAgentId() + ", json: " + pdu.getJsonPackage());
            }

            Answer[] answers = sendToAgent(pdu.getAgentId(), cmds, pdu.isStopOnError());
            if(answers != null) {
                String jsonReturn =  _gson.toJson(answers);

                if(s_logger.isDebugEnabled()) {
                    s_logger.debug("Completed dispatching -> " + pdu.getAgentId() + ", json: " + pdu.getJsonPackage() +
                            " in " + (System.currentTimeMillis() - startTick) + " ms, return result: " + jsonReturn);
                }

                return jsonReturn;
            } else {
                if(s_logger.isDebugEnabled()) {
                    s_logger.debug("Completed dispatching -> " + pdu.getAgentId() + ", json: " + pdu.getJsonPackage() +
                            " in " + (System.currentTimeMillis() - startTick) + " ms, return null result");
                }
            }
        } catch(AgentUnavailableException e) {
            s_logger.warn("Agent is unavailable", e);
        } catch (OperationTimedoutException e) {
            s_logger.warn("Timed Out", e);
        }
       
        return null;
    }

    public void OnReceiveClusterServicePdu(ClusterServicePdu pdu) {
        addIncomingClusterPdu(pdu);
    }
   
    @Override
    public Answer[] sendToAgent(Long hostId, Command[] cmds, boolean stopOnError) throws AgentUnavailableException, OperationTimedoutException {
        Commands commands = new Commands(stopOnError ? OnError.Stop : OnError.Continue);
        for (Command cmd : cmds) {
            commands.addCommand(cmd);
        }
        return _agentMgr.send(hostId, commands);
    }

    @Override
    public boolean executeAgentUserRequest(long agentId, Event event) throws AgentUnavailableException {
        return _agentMgr.executeUserRequest(agentId, event);
    }

    @Override
    public Boolean propagateAgentEvent(long agentId, Event event) throws AgentUnavailableException {
        final String msPeer = getPeerName(agentId);
        if (msPeer == null) {
            return null;
        }

        if (s_logger.isDebugEnabled()) {
            s_logger.debug("Propagating agent change request event:" + event.toString() + " to agent:" + agentId);
        }
        Command[] cmds = new Command[1];
        cmds[0] = new ChangeAgentCommand(agentId, event);

        Answer[] answers = execute(msPeer, agentId, cmds, true);
        if (answers == null) {
            throw new AgentUnavailableException(agentId);
        }

        if (s_logger.isDebugEnabled()) {
            s_logger.debug("Result for agent change is " + answers[0].getResult());
        }

        return answers[0].getResult();
    }

    /**
     * called by DatabaseUpgradeChecker to see if there are other peers running.
     *
     * @param notVersion
     *            If version is passed in, the peers CANNOT be running at this version. If version is null, return true if any
     *            peer is running regardless of version.
     * @return true if there are peers running and false if not.
     */
    public static final boolean arePeersRunning(String notVersion) {
        return false; // TODO: Leaving this for Kelven to take care of.
    }

    @Override
    public void broadcast(long agentId, Command[] cmds) {
        Date cutTime = DateUtil.currentGMTTime();

        List<ManagementServerHostVO> peers = _mshostDao.getActiveList(new Date(cutTime.getTime() - _heartbeatThreshold));
        for (ManagementServerHostVO peer : peers) {
            String peerName = Long.toString(peer.getMsid());
            if (getSelfPeerName().equals(peerName)) {
                continue; // Skip myself.
            }
            try {
                if (s_logger.isDebugEnabled()) {
                    s_logger.debug("Forwarding " + cmds[0].toString() + " to " + peer.getMsid());
                }
                executeAsync(peerName, agentId, cmds, true);
            } catch (Exception e) {
                s_logger.warn("Caught exception while talkign to " + peer.getMsid());
            }
        }
    }

    @Override
    public void executeAsync(String strPeer, long agentId, Command [] cmds, boolean stopOnError) {
        ClusterServicePdu pdu = new ClusterServicePdu();
        pdu.setSourcePeer(getSelfPeerName());
        pdu.setDestPeer(strPeer);
        pdu.setAgentId(agentId);
        pdu.setJsonPackage(_gson.toJson(cmds, Command[].class));
        pdu.setStopOnError(stopOnError);
        addOutgoingClusterPdu(pdu);
    }

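    // Synchronous peer call: queues a request PDU, blocks on the PDU monitor until
    // onNotifyingClusterPdu() delivers the response (or the request is cancelled when the peer
    // leaves), then decodes the JSON result back into Answer[].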
    @Override
    public Answer[] execute(String strPeer, long agentId, Command [] cmds, boolean stopOnError) {
        if(s_logger.isDebugEnabled()) {
            s_logger.debug(getSelfPeerName() + " -> " + strPeer + "." + agentId + " " +
                    _gson.toJson(cmds, Command[].class));
        }
       
        ClusterServiceRequestPdu pdu = new ClusterServiceRequestPdu();
        pdu.setSourcePeer(getSelfPeerName());
        pdu.setDestPeer(strPeer);
        pdu.setAgentId(agentId);
        pdu.setJsonPackage(_gson.toJson(cmds, Command[].class));
        pdu.setStopOnError(stopOnError);
        registerRequestPdu(pdu);
        addOutgoingClusterPdu(pdu);
       
        synchronized(pdu) {
            try {
                pdu.wait();
            } catch (InterruptedException e) {
            }
        }

        if(s_logger.isDebugEnabled()) {
            s_logger.debug(getSelfPeerName() + " -> " + strPeer + "." + agentId + " completed. result: " +
                pdu.getResponseResult());
        }
       
        if(pdu.getResponseResult() != null && pdu.getResponseResult().length() > 0) {
            try {
                return _gson.fromJson(pdu.getResponseResult(), Answer[].class);
            } catch(Throwable e) {
                s_logger.error("Exception on parsing gson package from remote call to " + strPeer);
            }
        }

        return null;
    }
   
    @Override
    public String getPeerName(long agentHostId) {

        HostVO host = _hostDao.findById(agentHostId);
        if(host != null && host.getManagementServerId() != null) {
            if(getSelfPeerName().equals(Long.toString(host.getManagementServerId()))) {
                return null;
            }

            return Long.toString(host.getManagementServerId());
        }
        return null;
    }

    @Override
    public ManagementServerHostVO getPeer(String mgmtServerId) {
        return _mshostDao.findByMsid(Long.valueOf(mgmtServerId));
    }

    @Override
    public String getSelfPeerName() {
        return Long.toString(_msId);
    }

    @Override
    public String getSelfNodeIP() {
        return _clusterNodeIP;
    }

    @Override
    public void registerListener(ClusterManagerListener listener) {
        // Note : we don't check duplicates
        synchronized (_listeners) {
            s_logger.info("register cluster listener " + listener.getClass());
            _listeners.add(listener);
        }
    }

    @Override
    public void unregisterListener(ClusterManagerListener listener) {
        synchronized(_listeners) {
            s_logger.info("unregister cluster listener " + listener.getClass());
            _listeners.remove(listener);
        }
    }

    public void notifyNodeJoined(List<ManagementServerHostVO> nodeList) {
        if(s_logger.isDebugEnabled()) {
            s_logger.debug("Notify management server node join to listeners.");

            for(ManagementServerHostVO mshost : nodeList) {
                s_logger.debug("Joining node, IP: " + mshost.getServiceIP() + ", msid: " + mshost.getMsid());
            }
        }

        synchronized(_listeners) {
            for(ClusterManagerListener listener : _listeners) {
                listener.onManagementNodeJoined(nodeList, _mshostId);
            }
        }

        SubscriptionMgr.getInstance().notifySubscribers(ClusterManager.ALERT_SUBJECT, this,
                new ClusterNodeJoinEventArgs(_mshostId, nodeList));
    }

    public void notifyNodeLeft(List<ManagementServerHostVO> nodeList) {
        if(s_logger.isDebugEnabled()) {
            s_logger.debug("Notify management server node left to listeners.");
        }
       
        for(ManagementServerHostVO mshost : nodeList) {
            if(s_logger.isDebugEnabled())
                s_logger.debug("Leaving node, IP: " + mshost.getServiceIP() + ", msid: " + mshost.getMsid());
            cancelClusterRequestToPeer(String.valueOf(mshost.getMsid()));
        }

        synchronized(_listeners) {
            for(ClusterManagerListener listener : _listeners) {
                listener.onManagementNodeLeft(nodeList, _mshostId);
            }
        }

        SubscriptionMgr.getInstance().notifySubscribers(ClusterManager.ALERT_SUBJECT, this,
                new ClusterNodeLeftEventArgs(_mshostId, nodeList));
    }

    public void notifyNodeIsolated() {
        if(s_logger.isDebugEnabled())
            s_logger.debug("Notify management server node isolation to listeners");

        synchronized(_listeners) {
            for(ClusterManagerListener listener : _listeners) {
                listener.onManagementNodeIsolated();
            }
        }
    }

    public ClusterService getPeerService(String strPeer) throws RemoteException {
        synchronized(_clusterPeers) {
            if(_clusterPeers.containsKey(strPeer)) {
                return _clusterPeers.get(strPeer);
            }
        }

        ClusterService service = _currentServiceAdapter.getPeerService(strPeer);

        if(service != null) {
            synchronized(_clusterPeers) {
                // re-check the peer map again to deal with the
                // race conditions
                if(!_clusterPeers.containsKey(strPeer)) {
                    _clusterPeers.put(strPeer, service);
                }
            }
        }

        return service;
    }

    public void invalidatePeerService(String strPeer) {
        synchronized(_clusterPeers) {
            if(_clusterPeers.containsKey(strPeer)) {
                _clusterPeers.remove(strPeer);
            }
        }
    }

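    // Periodic heartbeat task: refreshes this node's timestamp in the mshost table over a
    // dedicated DB connection, runs the peer scan, and, when agent LB is enabled, schedules a
    // one-time agent rebalance once the connected-agent load crosses the configured threshold.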
    private Runnable getHeartbeatTask() {
        return new Runnable() {
            @Override
            public void run() {
                Transaction txn = Transaction.open("ClusterHeartBeat");
                try {
                    Profiler profiler = new Profiler();
                    Profiler profilerHeartbeatUpdate = new Profiler();
                    Profiler profilerPeerScan = new Profiler();
                    Profiler profilerAgentLB = new Profiler();
                   
                    try {
                        profiler.start();
                       
                        profilerHeartbeatUpdate.start();
                        txn.transitToUserManagedConnection(getHeartbeatConnection());
                        if(s_logger.isTraceEnabled()) {
                            s_logger.trace("Cluster manager heartbeat update, id:" + _mshostId);
                        }
   
                        _mshostDao.update(_mshostId, getCurrentRunId(), DateUtil.currentGMTTime());
                        profilerHeartbeatUpdate.stop();
   
                        profilerPeerScan.start();
                        if (s_logger.isTraceEnabled()) {
                            s_logger.trace("Cluster manager peer-scan, id:" + _mshostId);
                        }
   
                        if (!_peerScanInited) {
                            _peerScanInited = true;
                            initPeerScan();
                        }
                       
                        peerScan();
                        profilerPeerScan.stop();
                       
                        profilerAgentLB.start();
                        // The agent LB task is scheduled and executed only once, and only when the fraction of connected agents exceeds _connectedAgentsThreshold
                        if (_agentLBEnabled && !_agentLbHappened) {
                            SearchCriteriaService<HostVO, HostVO> sc = SearchCriteria2.create(HostVO.class);
                            sc.addAnd(sc.getEntity().getManagementServerId(), Op.NNULL);
                            sc.addAnd(sc.getEntity().getType(), Op.EQ, Host.Type.Routing);
                            List<HostVO> allManagedRoutingAgents = sc.list();
                           
                            sc = SearchCriteria2.create(HostVO.class);
                            sc.addAnd(sc.getEntity().getType(), Op.EQ, Host.Type.Routing);
                            List<HostVO> allAgents = sc.list();
                            double allHostsCount = allAgents.size();
                            double managedHostsCount = allManagedRoutingAgents.size();
                            if (allHostsCount > 0.0) {
                                double load = managedHostsCount/allHostsCount;
                                if (load >= _connectedAgentsThreshold) {
                                    s_logger.debug("Scheduling agent rebalancing task as the average agent load " + load + " is more than the threshold " + _connectedAgentsThreshold);
                                    _rebalanceService.scheduleRebalanceAgents();
                                    _agentLbHappened = true;
                                } else {
                                    s_logger.trace("Not scheduling agent rebalancing task as the averages load " + load + " is less than the threshold " + _connectedAgentsThreshold);
                                }
                            }
                        }
                        profilerAgentLB.stop();
                    } finally {
                        profiler.stop();
                       
                        if(profiler.getDuration() >= _heartbeatInterval) {
                            if(s_logger.isDebugEnabled())
                                s_logger.debug("Management server heartbeat takes too long to finish. profiler: " + profiler.toString() +
                                    ", profilerHeartbeatUpdate: " + profilerHeartbeatUpdate.toString() +
                                    ", profilerPeerScan: " + profilerPeerScan.toString() +
                                    ", profilerAgentLB: " + profilerAgentLB.toString());
                        }
                    }
                   
                } catch(CloudRuntimeException e) {
                    s_logger.error("Runtime DB exception ", e.getCause());

                    if(e.getCause() instanceof ClusterInvalidSessionException) {
                        s_logger.error("Invalid cluster session found, fence it");
                        queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeIsolated));
                    }

                    if(isRootCauseConnectionRelated(e.getCause())) {
                        s_logger.error("DB communication problem detected, fence it");
                        queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeIsolated));
                    }

                    invalidHeartbeatConnection();
                } catch(ActiveFencingException e) {
                    queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeIsolated));
                } catch (Throwable e) {
                    s_logger.error("Unexpected exception in cluster heartbeat", e);
                    if(isRootCauseConnectionRelated(e.getCause())) {
                        s_logger.error("DB communication problem detected, fence it");
                        queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeIsolated));
                    }

                    invalidHeartbeatConnection();
                } finally {
                    txn.close("ClusterHeartBeat");
                }
            }
        };
    }

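    // Walks the exception cause chain looking for a SQLRecoverableException, which indicates
    // that the DB connection itself failed rather than an individual query.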
    private boolean isRootCauseConnectionRelated(Throwable e) {
        while (e != null) {
            if (e instanceof SQLRecoverableException) {
                return true;
            }

            e = e.getCause();
        }

        return false;
    }

    private Connection getHeartbeatConnection() throws SQLException {
        if(_heartbeatConnection == null) {
            Connection conn = Transaction.getStandaloneConnectionWithException();
            _heartbeatConnection = new ConnectionConcierge("ClusterManagerHeartBeat", conn, false);
        }

        return _heartbeatConnection.conn();
    }

    private void invalidHeartbeatConnection() {
        if(_heartbeatConnection != null) {
            Connection conn = Transaction.getStandaloneConnection();
            if (conn != null) {
                _heartbeatConnection.reset(Transaction.getStandaloneConnection());
            }
        }
    }

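    // Notification worker: drains queued ClusterManagerMessages and fans out node-joined,
    // node-left and node-isolated events to registered listeners and subscribers.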
    private Runnable getNotificationTask() {
        return new Runnable() {
            @Override
            public void run() {
                while(true) {
                    synchronized(_notificationMsgs) {
                        try {
                            _notificationMsgs.wait(1000);
                        } catch (InterruptedException e) {
                        }
                    }

                    ClusterManagerMessage msg = null;
                    while((msg = getNextNotificationMessage()) != null) {
                        try {
                            switch(msg.getMessageType()) {
                            case nodeAdded:
                                if(msg.getNodes() != null && msg.getNodes().size() > 0) {
                                    Profiler profiler = new Profiler();
                                    profiler.start();

                                    notifyNodeJoined(msg.getNodes());

                                    profiler.stop();
                                    if(profiler.getDuration() > 1000) {
                                        s_logger.warn("Notifying management server join event took " + profiler.getDuration() + " ms");
                                    } else if(s_logger.isDebugEnabled()) {
                                        s_logger.debug("Notifying management server join event took " + profiler.getDuration() + " ms");
                                    }
                                }
                                break;

                            case nodeRemoved:
                                if(msg.getNodes() != null && msg.getNodes().size() > 0) {
                                    Profiler profiler = new Profiler();
                                    profiler.start();

                                    notifyNodeLeft(msg.getNodes());

                                    profiler.stop();
                                    if(profiler.getDuration() > 1000) {
                                        s_logger.warn("Notifying management server leave event took " + profiler.getDuration() + " ms");
                                    } else if(s_logger.isDebugEnabled()) {
                                        s_logger.debug("Notifying management server leave event took " + profiler.getDuration() + " ms");
                                    }
                                }
                                break;

                            case nodeIsolated:
                                notifyNodeIsolated();
                                break;

                            default :
                                assert(false);
                                break;
                            }

                        } catch (Throwable e) {
                            s_logger.warn("Unexpected exception during cluster notification. ", e);
                        }
                    }

                    try { Thread.sleep(1000); } catch (InterruptedException e) {}
                }
            }
        };
    }

    private void queueNotification(ClusterManagerMessage msg) {
        synchronized(this._notificationMsgs) {
            this._notificationMsgs.add(msg);
            this._notificationMsgs.notifyAll();
        }
       
        switch(msg.getMessageType()) {
        case nodeAdded:
            {
                List<ManagementServerHostVO> l = msg.getNodes();
                if(l != null && l.size() > 0) {
                    for(ManagementServerHostVO mshost: l) {
                        _mshostPeerDao.updatePeerInfo(_mshostId, mshost.getId(), mshost.getRunid(), ManagementServerHost.State.Up);
                    }
                }
            }
            break;
           
        case nodeRemoved:
            {
                List<ManagementServerHostVO> l = msg.getNodes();
                if(l != null && l.size() > 0) {
                    for(ManagementServerHostVO mshost: l) {
                        _mshostPeerDao.updatePeerInfo(_mshostId, mshost.getId(), mshost.getRunid(), ManagementServerHost.State.Down);
                    }
                }
            }
            break;
           
        default :
            break;
       
        }
    }

    private ClusterManagerMessage getNextNotificationMessage() {
        synchronized(this._notificationMsgs) {
            if(this._notificationMsgs.size() > 0) {
                return this._notificationMsgs.remove(0);
            }
        }

        return null;
    }

    private void initPeerScan() {
        // Upon startup, also send a notification for all inactive management server nodes seen
        // at startup time, to help the upper layer perform missed cleanup
        Date cutTime = DateUtil.currentGMTTime();
        List<ManagementServerHostVO> inactiveList = _mshostDao.getInactiveList(new Date(cutTime.getTime() - _heartbeatThreshold));
      
        // We don't have foreign key constraints to enforce mgmt_server_id integrity in the host table, so when users manually
        // remove records from the mshost table, this will leave orphan mgmt_server_id references in the host table.
        List<Long> orphanList = _mshostDao.listOrphanMsids();
        if(orphanList.size() > 0) {
            for(Long orphanMsid : orphanList) {
                // construct fake ManagementServerHostVO based on orphan MSID
                s_logger.info("Add orphan management server msid found in host table to initial clustering notification, orphan msid: " + orphanMsid);
                inactiveList.add(new ManagementServerHostVO(orphanMsid, 0, "orphan", 0, new Date()));
            }
        } else {
            s_logger.info("We are good, no orphan management server msid in host table is found");
        }

        if(inactiveList.size() > 0) {
            if(s_logger.isInfoEnabled()) {
                s_logger.info("Found " + inactiveList.size() + " inactive management server nodes based on timestamp");
                for(ManagementServerHostVO host : inactiveList)
                    s_logger.info("management server node msid: " + host.getMsid() + ", name: " + host.getName() + ", service ip: " + host.getServiceIP() + ", version: " + host.getVersion());
            }

            List<ManagementServerHostVO> downHostList = new ArrayList<ManagementServerHostVO>();
            for(ManagementServerHostVO host : inactiveList) {
                if(!pingManagementNode(host)) {
                    s_logger.warn("Management node " + host.getId() + " is detected inactive by timestamp and also not pingable");
                    downHostList.add(host);
                }
            }

            if(downHostList.size() > 0)
                this.queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeRemoved, downHostList));
        } else {
            s_logger.info("No inactive management server node found");
        }
    }

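    // Compares the active peer list from the DB against the in-memory view: missing or
    // invalidated peers trigger nodeRemoved notifications (after a ping double-check), newly
    // seen peers trigger nodeAdded, and finding ourselves reported as Down by a peer raises an
    // ActiveFencingException to avoid a split-brain situation.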
    private void peerScan() throws ActiveFencingException {
        Date cutTime = DateUtil.currentGMTTime();

        Profiler profiler = new Profiler();
        profiler.start();
       
        Profiler profilerQueryActiveList = new Profiler();
        profilerQueryActiveList.start();
        List<ManagementServerHostVO> currentList = _mshostDao.getActiveList(new Date(cutTime.getTime() - _heartbeatThreshold));
        profilerQueryActiveList.stop();

        Profiler profilerSyncClusterInfo = new Profiler();
        profilerSyncClusterInfo.start();
        List<ManagementServerHostVO> removedNodeList = new ArrayList<ManagementServerHostVO>();
        List<ManagementServerHostVO> invalidatedNodeList = new ArrayList<ManagementServerHostVO>();

        if(_mshostId != null) {
           
            if(_mshostPeerDao.countStateSeenInPeers(_mshostId, _runId, ManagementServerHost.State.Down) > 0) {
                String msg = "We have detected that at least one management server peer reports that this management server is down, perform active fencing to avoid split-brain situation";
                s_logger.error(msg);
                throw new ActiveFencingException(msg);
            }
           
            // only if we have already attached to cluster, will we start to check leaving nodes
            for(Map.Entry<Long, ManagementServerHostVO>  entry : _activePeers.entrySet()) {

                ManagementServerHostVO current = getInListById(entry.getKey(), currentList);
                if(current == null) {
                    if(entry.getKey().longValue() != _mshostId.longValue()) {
                        if(s_logger.isDebugEnabled()) {
                            s_logger.debug("Detected management node left, id:" + entry.getKey() + ", nodeIP:" + entry.getValue().getServiceIP());
                        }
                        removedNodeList.add(entry.getValue());
                    }
                } else {
                    if(current.getRunid() == 0) {
                        if(entry.getKey().longValue() != _mshostId.longValue()) {
                            if(s_logger.isDebugEnabled()) {
                                s_logger.debug("Detected management node left because of invalidated session, id:" + entry.getKey() + ", nodeIP:" + entry.getValue().getServiceIP());
                            }
                            invalidatedNodeList.add(entry.getValue());
                        }
                    } else {
                        if(entry.getValue().getRunid() != current.getRunid()) {
                            if(s_logger.isDebugEnabled()) {
                                s_logger.debug("Detected management node left and rejoined quickly, id:" + entry.getKey() + ", nodeIP:" + entry.getValue().getServiceIP());
                            }

                            entry.getValue().setRunid(current.getRunid());
                        }
                    }
                }
            }
        }
        profilerSyncClusterInfo.stop();
       
        Profiler profilerInvalidatedNodeList = new Profiler();
        profilerInvalidatedNodeList.start();
        // process invalidated node list
        if(invalidatedNodeList.size() > 0) {
            for(ManagementServerHostVO mshost : invalidatedNodeList) {
                _activePeers.remove(mshost.getId());
                try {
                    JmxUtil.unregisterMBean("ClusterManager", "Node " + mshost.getId());
                } catch(Exception e) {
                    s_logger.warn("Unable to deregiester cluster node from JMX monitoring due to exception " + e.toString());
                }
            }

            this.queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeRemoved, invalidatedNodeList));
        }
        profilerInvalidatedNodeList.stop();

        Profiler profilerRemovedList = new Profiler();
        profilerRemovedList.start();
        // process removed node list
        Iterator<ManagementServerHostVO> it = removedNodeList.iterator();
        while(it.hasNext()) {
            ManagementServerHostVO mshost = it.next();
            if(!pingManagementNode(mshost)) {
                s_logger.warn("Management node " + mshost.getId() + " is detected inactive by timestamp and also not pingable");
                _activePeers.remove(mshost.getId());
                try {
                    JmxUtil.unregisterMBean("ClusterManager", "Node " + mshost.getId());
                } catch(Exception e) {
                    s_logger.warn("Unable to deregiester cluster node from JMX monitoring due to exception " + e.toString());
                }
            } else {
                s_logger.info("Management node " + mshost.getId() + " is detected inactive by timestamp but is pingable");
                it.remove();
            }
        }

        if(removedNodeList.size() > 0) {
            this.queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeRemoved, removedNodeList));
        }
        profilerRemovedList.stop();

        List<ManagementServerHostVO> newNodeList = new ArrayList<ManagementServerHostVO>();
        for(ManagementServerHostVO mshost : currentList) {
            if(!_activePeers.containsKey(mshost.getId())) {
                _activePeers.put(mshost.getId(), mshost);

                if(s_logger.isDebugEnabled()) {
                    s_logger.debug("Detected management node joined, id:" + mshost.getId() + ", nodeIP:" + mshost.getServiceIP());
                }
                newNodeList.add(mshost);

                try {
                    JmxUtil.registerMBean("ClusterManager", "Node " + mshost.getId(), new ClusterManagerMBeanImpl(this, mshost));
                } catch(Exception e) {
                    s_logger.warn("Unable to regiester cluster node into JMX monitoring due to exception " + ExceptionUtil.toString(e));
                }
            }
        }

        if(newNodeList.size() > 0) {
            this.queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeAdded, newNodeList));
        }
       
        profiler.stop();
       
        if(profiler.getDuration() >= this._heartbeatInterval) {
            if(s_logger.isDebugEnabled())
                s_logger.debug("Peer scan takes too long to finish. profiler: " + profiler.toString()
                  + ", profilerQueryActiveList: " + profilerQueryActiveList.toString()
                  + ", profilerSyncClusterInfo: " + profilerSyncClusterInfo.toString()
                  + ", profilerInvalidatedNodeList: " + profilerInvalidatedNodeList.toString()
                  + ", profilerRemovedList: " + profilerRemovedList.toString());
        }
    }

    private static ManagementServerHostVO getInListById(Long id, List<ManagementServerHostVO> l) {
        for(ManagementServerHostVO mshost : l) {
            if(mshost.getId() == id) {
                return mshost;
            }
        }
        return null;
    }

    @Override
    public String getName() {
        return _name;
    }

    @Override @DB
    public boolean start() {
        if(s_logger.isInfoEnabled()) {
            s_logger.info("Starting cluster manager, msid : " + _msId);
        }

        Transaction txn = Transaction.currentTxn();
        try {
            txn.start();

            final Class<?> c = this.getClass();
            String version = c.getPackage().getImplementationVersion();

            ManagementServerHostVO mshost = _mshostDao.findByMsid(_msId);
            if (mshost == null) {
                mshost = new ManagementServerHostVO();
                mshost.setMsid(_msId);
                mshost.setRunid(this.getCurrentRunId());
                mshost.setName(NetUtils.getHostName());
                mshost.setVersion(version);
                mshost.setServiceIP(_clusterNodeIP);
                mshost.setServicePort(_currentServiceAdapter.getServicePort());
                mshost.setLastUpdateTime(DateUtil.currentGMTTime());
                mshost.setRemoved(null);
                mshost.setAlertCount(0);
                mshost.setState(ManagementServerHost.State.Up);
                _mshostDao.persist(mshost);

                if (s_logger.isInfoEnabled()) {
                    s_logger.info("New instance of management server msid " + _msId + " is being started");
                }
            } else {
                if (s_logger.isInfoEnabled()) {
                    s_logger.info("Management server " + _msId + " is being started");
                }

                _mshostDao.update(mshost.getId(), getCurrentRunId(), NetUtils.getHostName(), version, _clusterNodeIP, _currentServiceAdapter.getServicePort(), DateUtil.currentGMTTime());
            }

            txn.commit();

            _mshostId = mshost.getId();
            if (s_logger.isInfoEnabled()) {
                s_logger.info("Management server (host id : " + _mshostId + ") is being started at " + _clusterNodeIP + ":" + _currentServiceAdapter.getServicePort());
            }
           
            _mshostPeerDao.clearPeerInfo(_mshostId);

            // use a separate thread for heartbeat updates
            _heartbeatScheduler.scheduleAtFixedRate(getHeartbeatTask(), _heartbeatInterval, _heartbeatInterval, TimeUnit.MILLISECONDS);
            _notificationExecutor.submit(getNotificationTask());

        } catch (Throwable e) {
            s_logger.error("Unexpected exception : ", e);
            txn.rollback();

            throw new CloudRuntimeException("Unable to initialize cluster info into database");
        }

        if (s_logger.isInfoEnabled()) {
            s_logger.info("Cluster manager was started successfully");
        }

        return true;
    }

    @Override @DB
    public boolean stop() {
        if(_mshostId != null) {
            ManagementServerHostVO mshost = _mshostDao.findByMsid(_msId);
            mshost.setState(ManagementServerHost.State.Down);
            _mshostDao.update(_mshostId, mshost);
        }

        _heartbeatScheduler.shutdownNow();
        _executor.shutdownNow();

        try {
            _heartbeatScheduler.awaitTermination(EXECUTOR_SHUTDOWN_TIMEOUT, TimeUnit.MILLISECONDS);
            _executor.awaitTermination(EXECUTOR_SHUTDOWN_TIMEOUT, TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
        }

        if(s_logger.isInfoEnabled()) {
            s_logger.info("Cluster manager is stopped");
        }

        return true;
    }

    @Override
    public boolean configure(String name, Map<String, Object> params) throws ConfigurationException {
        if(s_logger.isInfoEnabled()) {
            s_logger.info("Start configuring cluster manager : " + name);
        }
        _name = name;

        ComponentLocator locator = ComponentLocator.getCurrentLocator();
        _agentMgr = locator.getManager(AgentManager.class);
        if (_agentMgr == null) {
            throw new ConfigurationException("Unable to get " + AgentManager.class.getName());
        }

        _mshostDao = locator.getDao(ManagementServerHostDao.class);
        if (_mshostDao == null) {
            throw new ConfigurationException("Unable to get " + ManagementServerHostDao.class.getName());
        }
       
        _mshostPeerDao = locator.getDao(ManagementServerHostPeerDao.class);
        if (_mshostPeerDao == null) {
            throw new ConfigurationException("Unable to get " + ManagementServerHostPeerDao.class.getName());
        }
       
        _hostDao = locator.getDao(HostDao.class);
        if (_hostDao == null) {
            throw new ConfigurationException("Unable to get " + HostDao.class.getName());
        }

        _hostTransferDao = locator.getDao(HostTransferMapDao.class);
        if (_hostTransferDao == null) {
            throw new ConfigurationException("Unable to get agent transfer map dao");
        }

        ConfigurationDao configDao = locator.getDao(ConfigurationDao.class);
        if (configDao == null) {
            throw new ConfigurationException("Unable to get the configuration dao.");
        }

        Map<String, String> configs = configDao.getConfiguration("management-server", params);

        String value = configs.get("cluster.heartbeat.interval");
        if (value != null) {
            _heartbeatInterval = NumbersUtil.parseInt(value, ClusterManager.DEFAULT_HEARTBEAT_INTERVAL);
        }

        value = configs.get("cluster.heartbeat.threshold");
        if (value != null) {
            _heartbeatThreshold = NumbersUtil.parseInt(value, ClusterManager.DEFAULT_HEARTBEAT_THRESHOLD);
        }

        File dbPropsFile = PropertiesUtil.findConfigFile("db.properties");
        Properties dbProps = new Properties();
        try {
            dbProps.load(new FileInputStream(dbPropsFile));
        } catch (FileNotFoundException e) {
            throw new ConfigurationException("Unable to find db.properties");
        } catch (IOException e) {
            throw new ConfigurationException("Unable to load db.properties content");
        }
        _clusterNodeIP = dbProps.getProperty("cluster.node.IP");
        if (_clusterNodeIP == null) {
            _clusterNodeIP = "127.0.0.1";
        }
        _clusterNodeIP = _clusterNodeIP.trim();

        if(s_logger.isInfoEnabled()) {
            s_logger.info("Cluster node IP : " + _clusterNodeIP);
        }

        if(!NetUtils.isLocalAddress(_clusterNodeIP)) {
            throw new ConfigurationException("cluster node IP should be a valid local address where the server is running, please check your configuration");
        }

        for(int i = 0; i < DEFAULT_OUTGOING_WORKERS; i++)
            _executor.execute(getClusterPduSendingTask());

        // the notification task itself in turn works as a task dispatcher
        _executor.execute(getClusterPduNotificationTask());

        Adapters<ClusterServiceAdapter> adapters = locator.getAdapters(ClusterServiceAdapter.class);
        if (adapters == null || !adapters.isSet()) {
            throw new ConfigurationException("Unable to get cluster service adapters");
        }
        Enumeration<ClusterServiceAdapter> it = adapters.enumeration();
        if(it.hasMoreElements()) {
            _currentServiceAdapter = it.nextElement();
        }

        if(_currentServiceAdapter == null) {
            throw new ConfigurationException("Unable to set current cluster service adapter");
        }

        _agentLBEnabled = Boolean.valueOf(configDao.getValue(Config.AgentLbEnable.key()));
       
        String connectedAgentsThreshold = configs.get("agent.load.threshold");
       
        if (connectedAgentsThreshold != null) {
            _connectedAgentsThreshold = Double.parseDouble(connectedAgentsThreshold);
        }

        this.registerListener(new LockMasterListener(_msId));

        checkConflicts();

        if(s_logger.isInfoEnabled()) {
            s_logger.info("Cluster manager is configured.");
        }
        return true;
    }

    @Override
    public long getManagementNodeId() {
        return _msId;
    }

    @Override
    public long getCurrentRunId() {
        return _runId;
    }

    @Override
    public boolean isManagementNodeAlive(long msid) {
        ManagementServerHostVO mshost = _mshostDao.findByMsid(msid);
        if(mshost != null) {
            if(mshost.getLastUpdateTime().getTime() >=  DateUtil.currentGMTTime().getTime() - _heartbeatThreshold) {
                return true;
            }
        }

        return false;
    }

    @Override
    public boolean pingManagementNode(long msid) {
        ManagementServerHostVO mshost = _mshostDao.findByMsid(msid);
        if(mshost == null) {
            return false;
        }

        return pingManagementNode(mshost);
    }

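    // Liveness double-check: attempts a plain TCP connect to the peer's cluster service
    // endpoint, retrying with a one-second pause between attempts.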
    private boolean pingManagementNode(ManagementServerHostVO mshost) {

        String targetIp = mshost.getServiceIP();
        if("127.0.0.1".equals(targetIp) || "0.0.0.0".equals(targetIp)) {
            s_logger.info("ping management node cluster service can not be performed on self");
            return false;
        }
    
        int retry = 10;
        while (--retry > 0) {
            SocketChannel sch = null;
            try {
                s_logger.info("Trying to connect to " + targetIp);
                sch = SocketChannel.open();
                sch.configureBlocking(true);
                sch.socket().setSoTimeout(5000);

                InetSocketAddress addr = new InetSocketAddress(targetIp, mshost.getServicePort());
                sch.connect(addr);
                return true;
            } catch (IOException e) {
                if (e instanceof ConnectException) {
                    s_logger.error("Unable to ping management server at " + targetIp + ":" + mshost.getServicePort() + " due to ConnectException", e);
                    return false;
                }
            } finally {
                if (sch != null) {
                    try {
                        sch.close();
                    } catch (IOException e) {
                    }
                }
            }

            try {
                Thread.sleep(1000);
            } catch (InterruptedException ex) {
            }
        }
       
        s_logger.error("Unable to ping management server at " + targetIp + ":" + mshost.getServicePort() + " after retries");
        return false;
    }


    @Override
    public int getHeartbeatThreshold() {
        return this._heartbeatThreshold;
    }

    public int getHeartbeatInterval() {
        return this._heartbeatInterval;
    }

    public void setHeartbeatThreshold(int threshold) {
        _heartbeatThreshold = threshold;
    }

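    // Startup sanity check: refuses to start if another management server registered with the
    // same service IP (including the localhost case) is still pingable.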
    private void checkConflicts() throws ConfigurationException {
        Date cutTime = DateUtil.currentGMTTime();
        List<ManagementServerHostVO> peers = _mshostDao.getActiveList(new Date(cutTime.getTime() - _heartbeatThreshold));
        for(ManagementServerHostVO peer : peers) {
            String peerIP = peer.getServiceIP().trim();
            if(_clusterNodeIP.equals(peerIP)) {
                if("127.0.0.1".equals(_clusterNodeIP)) {
                    if(pingManagementNode(peer.getMsid())) {
                        String msg = "Detected another management node with localhost IP is already running, please check your cluster configuration";
                        s_logger.error(msg);
                        throw new ConfigurationException(msg);
                    } else {
                        String msg = "Detected another management node with localhost IP is considered as running in DB, however it is not pingable, we will continue cluster initialization with this management server node";
                        s_logger.info(msg);
                    }
                } else {
                    if(pingManagementNode(peer.getMsid())) {
                        String msg = "Detected that another management node with the same IP " + peer.getServiceIP() + " is already running, please check your cluster configuration";
                        s_logger.error(msg);
                        throw new ConfigurationException(msg);
                    } else {
                        String msg = "Detected that another management node with the same IP " + peer.getServiceIP()
                                + " is considered as running in DB, however it is not pingable, we will continue cluster initialization with this management server node";
                        s_logger.info(msg);
                    }
                }
            }
        }
    }

    @Override
    public boolean rebalanceAgent(long agentId, Event event, long currentOwnerId, long futureOwnerId) throws AgentUnavailableException, OperationTimedoutException {
        return _rebalanceService.executeRebalanceRequest(agentId, currentOwnerId, futureOwnerId, event);
    }

    @Override
    public  boolean isAgentRebalanceEnabled() {
        return _agentLBEnabled;
    }
   
    @Override
    public Boolean propagateResourceEvent(long agentId, ResourceState.Event event) throws AgentUnavailableException {
        final String msPeer = getPeerName(agentId);
        if (msPeer == null) {
            return null;
        }

        if (s_logger.isDebugEnabled()) {
            s_logger.debug("Propagating agent change request event:" + event.toString() + " to agent:" + agentId);
        }
        Command[] cmds = new Command[1];
        cmds[0] = new PropagateResourceEventCommand(agentId, event);

        Answer[] answers = execute(msPeer, agentId, cmds, true);
        if (answers == null) {
            throw new AgentUnavailableException(agentId);
        }

        if (s_logger.isDebugEnabled()) {
            s_logger.debug("Result for agent change is " + answers[0].getResult());
        }

        return answers[0].getResult();
    }
   
    @Override
    public boolean executeResourceUserRequest(long hostId, ResourceState.Event event) throws AgentUnavailableException {
        return _resourceMgr.executeUserRequest(hostId, event);
    }
}
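
// Illustrative usage sketch (not part of the original source): a component with an injected
// ClusterManager could propagate an agent status event to the peer that currently owns the
// agent. The method name below is a hypothetical placeholder.
//
//     @Inject
//     private ClusterManager _clusterMgr;
//
//     void requestAgentEvent(long agentId, com.cloud.host.Status.Event event) throws AgentUnavailableException {
//         Boolean result = _clusterMgr.propagateAgentEvent(agentId, event);
//         if (result == null) {
//             // a null result means no remote peer owns this agent; handle the event locally
//         }
//     }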