Source Code of com.linkedin.databus.cluster.DatabusCluster$DatabusClusterException

package com.linkedin.databus.cluster;

/*
*
* Copyright 2013 LinkedIn Corp. All rights reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/

import java.net.InetAddress;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Vector;

import org.apache.helix.ExternalViewChangeListener;
import org.apache.helix.HelixException;
import org.apache.helix.HelixManager;
import org.apache.helix.HelixManagerFactory;
import org.apache.helix.InstanceType;
import org.apache.helix.LiveInstanceChangeListener;
import org.apache.helix.NotificationContext;
import org.apache.helix.controller.HelixControllerMain;
import org.apache.helix.manager.zk.ZKHelixAdmin;
import org.apache.helix.manager.zk.ZNRecordSerializer;
import org.apache.helix.manager.zk.ZkClient;
import org.apache.helix.model.ExternalView;
import org.apache.helix.model.IdealState;
import org.apache.helix.model.IdealState.IdealStateModeProperty;
import org.apache.helix.model.InstanceConfig;
import org.apache.helix.model.LiveInstance;
import org.apache.helix.model.StateModelDefinition;
import org.apache.helix.participant.StateMachineEngine;
import org.apache.helix.tools.StateModelConfigGenerator;
import org.apache.log4j.Logger;

import com.linkedin.databus.client.registration.ClusterRegistrationStaticConfig;

public class DatabusCluster
{
    public static final String MODULE = DatabusCluster.class.getName();
    public static final Logger LOG = Logger.getLogger(MODULE);

    public static final String DEFAULT_STATE_MODEL = "OnlineOffline";
    public static final String DEFAULT_RESOURCE_NAME = "default-resource";
    // default time in ms that we wait for the cluster to be set up
    public static final int DEFAULT_CLUSTER_CREATE_WAIT_MS = 1000;

    private static final String HELIX_MANAGER_ZK_SESSION_TIMEOUT_KEY = "zk.session.timeout";

    protected final ZkClient _zkClient;
    protected final ZKHelixAdmin _admin;
    protected final String _clusterName;
    protected final String _zkAddr;
    protected final int _quorum;
    protected final int _numPartitions;
    protected final HashSet<DatabusClusterDataNotifier> _dataNotifiers;

    // populated by the external watcher
    protected final DatabusHelixWatcher _watcher;

    private final int _zkConnectionTimeoutMs;
    private final int _zkSessionTimeoutMs;

    public DatabusCluster(ClusterRegistrationStaticConfig config) throws Exception
    {
        _zkAddr = config.getZkAddr();
        _clusterName = config.getClusterName();
        _quorum = (int)(config.getQuorum());
        _numPartitions = (int)(config.getNumPartitions());

        _zkSessionTimeoutMs = config.getZkSessionTimeoutMs();
        _zkConnectionTimeoutMs = config.getZkConnectionTimeoutMs();

        updateHelixManagerZkSessionTimeout(_zkSessionTimeoutMs);

        _zkClient = new ZkClient(_zkAddr, _zkSessionTimeoutMs, _zkConnectionTimeoutMs, new ZNRecordSerializer());
        _admin = new ZKHelixAdmin(_zkClient);
        _dataNotifiers = new HashSet<DatabusClusterDataNotifier>(5);

        //attempt to create a cluster
        int part = create(_admin, _zkClient, _clusterName, _numPartitions);

        //at this stage a cluster and resources should have been created, either by this instance or someone else
        if (part >= 0)
        {
            if (_numPartitions != part)
            {
                String msg = "Cannot create DatabusCluster!  Cluster exists with num partitions="
                        + part
                        + ". Tried to join with "
                        + _numPartitions
                        + " partitions";
                throw new DatabusClusterException(msg);
            }
        }
        else
        {
            throw new DatabusClusterException("Cluster " + _clusterName + " could not be accessed. Num partitions returned -1");
        }

        // initialize watcher after creating a cluster
        _watcher = new DatabusHelixWatcher();
    }

    /**
     * Updates the zk.session.timeout system property used for ZK connections made by the Helix
     * manager. An existing value is kept if it is already at least as large as the requested timeout.
     */
    private static void updateHelixManagerZkSessionTimeout(int timeoutMs)
    {
      String timeoutStr = System.getProperty(HELIX_MANAGER_ZK_SESSION_TIMEOUT_KEY);
      if (null != timeoutStr)
      {
        try
        {
          int envTimeoutMs = Integer.parseInt(timeoutStr);
          if (envTimeoutMs >= timeoutMs)
          {
            // the existing timeout is at least as large as ours, so keep it as is
            return;
          }
        }
        catch (NumberFormatException e)
        {
          LOG.warn("invalid existing value for " + HELIX_MANAGER_ZK_SESSION_TIMEOUT_KEY + ": " + timeoutStr);
        }
      }

      System.setProperty(HELIX_MANAGER_ZK_SESSION_TIMEOUT_KEY, Integer.toString(timeoutMs));
    }
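
    // A minimal sketch (not part of the original source) of how the precedence above plays out,
    // assuming no other code touches the property concurrently:
    //
    //   System.setProperty("zk.session.timeout", "30000");
    //   updateHelixManagerZkSessionTimeout(10000);  // property stays "30000" (existing value is larger)
    //   updateHelixManagerZkSessionTimeout(60000);  // property becomes "60000"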

    /**
     * Used mainly as a sanity check that the requested number of partitions matches the
     * number of partitions in the existing resource. To resize, use a new cluster name.
     *
     * @return the number of partitions in this resource; 0 if the resource cannot be found,
     *         -1 if it cannot be determined (no admin connection)
     */
    static protected int getNumPartitionsInResource(ZKHelixAdmin admin, String clusterName,String resourceName)
    {
        if (admin != null)
        {
            try
            {
                IdealState idealState = admin.getResourceIdealState(clusterName,
                        resourceName);
                if (idealState != null)
                {
                    return idealState.getNumPartitions();
                }
                else
                {
                    return 0;
                }
            }
            catch (Exception e)
            {
                LOG.warn("Resource " + resourceName + " not found in " + clusterName);
                return 0;
            }
        }
        return -1;
    }

    /**
     * Create a cluster with a partitioned resource.
     * If successful, returns a non-zero number indicating the number of partitions created in the cluster.
     * This is meant to be atomic: if more than one thread/instance attempts to create the same cluster with a
     * different number of partitions, only one of them will win. If the cluster already exists, however, the
     * number returned will be the same in all those instances.
     * Returns 0 if the cluster could not be reached (retry possible) and -1 on other errors (retry not possible).
     */
    static public int create(ZKHelixAdmin admin, ZkClient zkClient, String clusterName,
            int numPartitions)
    {
        boolean clusterAdded = true;
        // add cluster
        try
        {
          /**
           * TODO: HACK!! Copying this logic from the old Helix library to mimic the old behavior.
           *
           * Please see DDSDBUS-2579/HELIX-137.
           * The Helix addCluster() API (in 0.6.2.3) has a new problem: callers cannot differentiate between the
           * case where a new cluster is created and the case where it was created by some other client. That
           * distinction is needed so that the follow-up step of adding the state model (a non-idempotent
           * operation) is done only by the client that created the cluster.
           * Both the old (0.6.1.3) and the new (0.6.2.3) Helix libraries have the following issue:
           *   (a) No atomicity in the face of ZK client disconnects, which can leave the cluster only partly
           *       initialized and unusable. This has been noticed in the PCL/PCS environment.
           *
           * To work around the backwards incompatibility between the two Helix versions, we reproduce part of
           * the old addCluster() implementation below to get the same behavior as 0.6.1.3. Problem (a) still
           * exists.
           */
          if (zkClient.exists("/" + clusterName))
          {
            throw new Exception("Cluster already exists !!");
          }

          clusterAdded = admin.addCluster(clusterName, false);

          if ( ! clusterAdded )
          {
            LOG.error("Problem creating cluster (" + clusterName + ")");
          }
        }
        catch (Exception e)
        {
            LOG.warn("Warn! Cluster might already exist! " + clusterName + " Exception="  + e.getMessage());
            clusterAdded = false;
        }

        if (clusterAdded)
        {
             //LOG.warn("Added new cluster " + clusterName
             //      + " . Creating resource " + DEFAULT_RESOURCE_NAME);
            // add state model definition
            try
            {
                admin.addStateModelDef(
                        clusterName,
                        DEFAULT_STATE_MODEL,
                        new StateModelDefinition(StateModelConfigGenerator
                                .generateConfigForOnlineOffline()));

                admin.addResource(clusterName, DEFAULT_RESOURCE_NAME,
                        numPartitions, DEFAULT_STATE_MODEL,
                        IdealStateModeProperty.AUTO_REBALANCE.toString());

                admin.rebalance(clusterName, DEFAULT_RESOURCE_NAME, 1);
            }
            catch (Exception e)
            {
                LOG.warn("Resource addition incomplete. May have been completed by another instance: " + e.getMessage());
                clusterAdded = false;
            }
        }

        // Ensure that the cluster is fully set up
        int part = getNumPartitionsInResource(admin, clusterName, DEFAULT_RESOURCE_NAME);
        if (part == 0)
        {
            long startTimeMs = System.currentTimeMillis();
            try
            {
                do
                {
                    Thread.sleep(100);
                    part = getNumPartitionsInResource(admin, clusterName, DEFAULT_RESOURCE_NAME);
                }
                while (part==0 && ((System.currentTimeMillis()-startTimeMs) < DEFAULT_CLUSTER_CREATE_WAIT_MS) );
            }
            catch (InterruptedException e)
            {
                LOG.warn("Cluster create wait interrupted for cluster=" + clusterName + " exception= " + e.getMessage());
            }
        }
        return part;
    }
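
    // A minimal usage sketch (not part of the original source); the ZK address, timeouts, cluster name
    // and partition count below are hypothetical:
    //
    //   ZkClient zkClient = new ZkClient("localhost:2181", 30000, 10000, new ZNRecordSerializer());
    //   ZKHelixAdmin admin = new ZKHelixAdmin(zkClient);
    //   int parts = DatabusCluster.create(admin, zkClient, "my-databus-cluster", 8);
    //   if (parts <= 0) { /* 0: cluster not reachable (retry possible), -1: other error */ }
    //   else if (parts != 8) { /* cluster already exists with a different partition count */ }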


    public void start()
    {

        if (_watcher != null)
        {
            _watcher.start();
        }
    }

    public void shutdown()
    {
        if (_watcher != null)
        {
            _watcher.stop();
        }
        if (_dataNotifiers != null)
        {
            _dataNotifiers.clear();
        }
        if (_zkClient != null)
        {
            _zkClient.close();
        }
    }

    public DatabusClusterMember addMember(String id)
    {
        return addMember(id, null);
    }

    public DatabusClusterMember addMember(String id,
            DatabusClusterNotifier notifier)
    {
        try
        {
            if (_admin != null)
            {
                InstanceConfig config = null;
                try
                {
                    config = _admin.getInstanceConfig(_clusterName, id);
                }
                catch (HelixException e)
                {
                    // the instance doesn't exist, so a new config will be added below
                }
                if (config != null)
                {
                    LOG.warn("Member id already exists! Overwriting instance for id="
                            + id);
                    _admin.dropInstance(_clusterName, config);
                    config = null;
                }
                config = new InstanceConfig(id);
                config.setHostName(InetAddress.getLocalHost()
                        .getCanonicalHostName());
                config.setInstanceEnabled(true);
                _admin.addInstance(_clusterName, config);
                return new DatabusClusterMember(id, notifier);
            }
        }
        catch (Exception e)
        {
            LOG.error("Error creating databus cluster member " + id
                    + " exception:" + e);
        }
        return null;
    }
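
    // A minimal usage sketch (not part of the original source); the config object, member id and
    // notifier are hypothetical, and the notifier may be null if no callbacks are needed:
    //
    //   DatabusCluster cluster = new DatabusCluster(clusterRegistrationStaticConfig);
    //   cluster.start();
    //   DatabusClusterMember member = cluster.addMember("relay-1", notifier);
    //   if (member != null && member.join()) { /* this participant is now part of the cluster */ }
    //   ...
    //   member.leave();
    //   cluster.shutdown();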

    /**
     * Add a cluster data notifier to receive notifications on cluster
     * metadata (live instances and partition ownership)
     */
    synchronized public void addDataNotifier(DatabusClusterDataNotifier notifier)
    {
        if (notifier != null)
        {
            _dataNotifiers.add(notifier);
        }
        else
        {
            LOG.warn("Add failed. Attempting to add null DatabusClusterDataNotifier!");
        }
    }
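
    // A sketch (not part of the original source) of a data notifier; the method names follow the
    // call sites in DatabusHelixWatcher below, but the exact parameter types declared by the
    // DatabusClusterDataNotifier interface may differ:
    //
    //   cluster.addDataNotifier(new DatabusClusterDataNotifier()
    //   {
    //     public void onInstanceChange(List<String> activeNodes)
    //     {
    //       LOG.info("live instances: " + activeNodes);
    //     }
    //     public void onPartitionMappingChange(Map<Integer, String> partitionOwners)
    //     {
    //       LOG.info("partition -> owner: " + partitionOwners);
    //     }
    //   });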

    synchronized public void removeDataNotifier(
            DatabusClusterDataNotifier notifier)
    {
        if (notifier != null)
        {
            _dataNotifiers.remove(notifier);
        }
    }

    public int getNumPartitions()
    {
        return _numPartitions;
    }

    public int getNumActiveMembers()
    {
        return _watcher.getNumActiveInstances();
    }

    public int getNumActivePartitions()
    {
        return _watcher.getNumActivePartitions();
    }

    public HashMap<Integer, String> getActivePartitions()
    {
        return _watcher.getPartitions();
    }

    public String getPartitionOwner(int partition)
    {
        return _watcher.getOwnerId(partition);
    }

    public String getClusterName()
    {
        return _clusterName;
    }

    public int getQuorum()
    {
        return _quorum;
    }

    /**
     * A cluster member who has the ability to join and leave the cluster
     **/
    public class DatabusClusterMember
    {

        private final HelixManager _manager;
        private final String _id;
        private DatabusClusterNotifier _notifier = null;

        DatabusClusterMember(String id, DatabusClusterNotifier notifier)
                throws Exception
        {
            _id = id;
            _manager = HelixManagerFactory.getZKHelixManager(_clusterName, _id,
                    InstanceType.PARTICIPANT, _zkAddr);
            _notifier = notifier;
            registerNotifier();
        }

        void registerNotifier()
        {
            StateMachineEngine stateMach = _manager.getStateMachineEngine();
            DatabusClusterNotifierFactory modelFactory = new DatabusClusterNotifierFactory(
                    _notifier);
            stateMach.registerStateModelFactory(DEFAULT_STATE_MODEL,
                    modelFactory);
        }

        public boolean join()
        {
            if (_manager != null)
            {
                if (!_manager.isConnected())
                {
                    try
                    {
                        _manager.connect();
                        return true;
                    }
                    catch (Exception e)
                    {
                        LOG.error("Member " + _id + " could not connect! " + e);
                    }
                }
                else
                {
                    LOG.warn("Member " + _id + " cannot join. Already joined! ");
                    return true;
                }
            }
            return false;
        }

        public boolean leave()
        {
            if (_manager != null)
            {
                try
                {
                    _manager.disconnect();
                }
                catch (Exception e)
                {
                    LOG.error("Member " + _id + " could not disconnect! " + e);
                }
            }
            // ensure that whitelisted instance configs go away
            if (_admin != null)
            {
                try
                {
                    _admin.dropInstance(_clusterName,
                            _admin.getInstanceConfig(_clusterName, _id));
                }
                catch (HelixException e)
                {
                    LOG.warn("Drop instance failed for id= " + _id
                            + " exception" + e);
                }
            }
            return true;
        }

        /** Used for unit-testing **/
        protected DatabusClusterMember()
        {
            _manager = null;
            _id = null;
        }

    }

    @SuppressWarnings("serial")
    static public class DatabusClusterException extends Exception
    {
        public DatabusClusterException(String msg)
        {
            super(msg);
        }

    }

    private class DatabusHelixWatcher implements LiveInstanceChangeListener,
            ExternalViewChangeListener
    {

        final private HelixManager _manager;
        private int _numActiveInstances = -1;
        final private Random _random = new Random(System.currentTimeMillis());
        final private HashMap<Integer, String> _partitionMap;

        // State to help control helix assignment - enable/disable cluster
        private HelixManager _helixManager = null;
        // has Cluster been paused?
        private boolean _paused = false;
        private final int _id;

        public DatabusHelixWatcher() throws Exception
        {

            _id = _random.nextInt();
            _manager = HelixManagerFactory.getZKHelixManager(_clusterName,
                    "watcher_" + _clusterName + "_" + _id,
                    InstanceType.SPECTATOR, _zkAddr);
            _partitionMap = new HashMap<Integer, String>(_numPartitions);

        }

        public void start()
        {
            if (_manager != null)
            {
                try
                {
                    if (!_manager.isConnected())
                    {
                        _manager.connect();
                        _manager.addLiveInstanceChangeListener(this);
                        _manager.addExternalViewChangeListener(this);
                    }
                }
                catch (Exception e)
                {
                    LOG.error("Cannot start HelixWatcher! " + e);
                }
            }
        }

        public void stop()
        {
            if (_manager != null)
            {
                _manager.disconnect();
            }
            stopHelixController();
        }

        @Override
        public synchronized void onLiveInstanceChange(
                List<LiveInstance> liveInstances,
                NotificationContext changeContext)
        {
            _numActiveInstances = liveInstances.size();
            boolean quorumReached = (_numActiveInstances >= _quorum);
            if (quorumReached)
            {
                if (_helixManager == null)
                {
                    LOG.warn("Quorum Reached! numNodes=" + _numActiveInstances
                            + " quorum=" + _quorum);
                    // controller needs to be started
                    startHelixController();
                }
                else if (_paused)
                {
                    resumeHelixController();
                    _paused = false;
                }
            }
            else
            {
                LOG.warn("Number of nodes inadequate=" + _numActiveInstances
                        + " Need at least:" + _quorum);
                if (_helixManager != null && !_paused)
                {
                    // controller has started; but pauseCluster
                    pauseHelixController();
                    _paused = true;
                }
            }

            // perform user-specified callback
            if (!_dataNotifiers.isEmpty())
            {
                Vector<String> nodeList = new Vector<String>(
                        liveInstances.size());
                for (LiveInstance i : liveInstances)
                {
                    nodeList.add(i.getInstanceName());
                }
                for (DatabusClusterDataNotifier notifier : _dataNotifiers)
                {
                    notifier.onInstanceChange(nodeList);
                }
            }
        }

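        /**
         * Extracts the numeric partition id from a Helix partition name of the form
         * "resourceName_partitionId" (e.g. "default-resource_12" yields 12);
         * returns -1 if the name has no underscore-separated suffix.
         */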
        private Integer getPartition(String partition)
        {
            String[] ps = partition.split("_");
            if (ps.length >= 2)
            {
                return Integer.parseInt(ps[ps.length - 1]);
            }
            return -1;
        }

        @Override
        public synchronized void onExternalViewChange(
                List<ExternalView> externalViewList,
                NotificationContext changeContext)
        {
            _partitionMap.clear();
            for (ExternalView v : externalViewList)
            {
                if (v.getResourceName().equals(DEFAULT_RESOURCE_NAME))
                {
                    for (String k : v.getPartitionSet())
                    {
                        Map<String, String> map = v.getStateMap(k);
                        if (map != null)
                        {
                            for (Map.Entry<String, String> mkPair : map
                                    .entrySet())
                            {
                                String value = mkPair.getValue();
                                if (value != null)
                                {
                                    Integer partition = getPartition(k);
                                    if (value.equals("ONLINE"))
                                    {
                                        _partitionMap.put(partition,
                                                mkPair.getKey());
                                    }
                                }
                            }
                        }
                    }
                }
            }

            // external call
            if (!_dataNotifiers.isEmpty())
            {
                HashMap<Integer, String> pmap = getPartitions();
                for (DatabusClusterDataNotifier notifier : _dataNotifiers)
                {
                    notifier.onPartitionMappingChange(pmap);
                }
            }
        }

        synchronized public int getNumActiveInstances()
        {
            return _numActiveInstances;
        }

        synchronized public int getNumActivePartitions()
        {
            return _partitionMap.size();
        }

        synchronized public String getOwnerId(int numPartition)
        {
            return _partitionMap.get(numPartition);
        }

        synchronized public HashMap<Integer, String> getPartitions()
        {
            HashMap<Integer, String> map = new HashMap<Integer, String>(
                    _partitionMap.size());
            map.putAll(_partitionMap);
            return map;
        }

        /** methods to control helix's assignment of partitions **/

        void stopHelixController()
        {
            if (_helixManager != null)
            {
                LOG.warn("Shutting down cluster : "
                        + _helixManager.getClusterName() + " instance:"
                        + _helixManager.getInstanceName());
                _helixManager.disconnect();
            }
        }

        void pauseHelixController()
        {
            if (_admin != null)
            {
                LOG.warn("Pausing  cluster : " + _clusterName);
                _admin.enableCluster(_clusterName, false);
            }
        }

        void resumeHelixController()
        {
            if (_admin != null)
            {
                LOG.warn("Resuming  cluster : " + _clusterName);
                _admin.enableCluster(_clusterName, true);
            }
        }

        void startHelixController()
        {
            try
            {
                String controllerId = "controller_" + _id;
                LOG.info("Starting cluster controller for cluster="
                        + _clusterName + " with id = " + controllerId);
                _helixManager = HelixControllerMain.startHelixController(
                        _zkAddr, _clusterName, controllerId,
                        HelixControllerMain.STANDALONE);
                if (_admin != null)
                {
                    _admin.enableCluster(_helixManager.getClusterName(), true);
                }
            }
            catch (Exception e)
            {
                LOG.error("Cannot start cluster controller for cluster="
                        + _clusterName + e);
            }
        }

    }

    /**
     * Used only for unit-testing
     */
    protected DatabusCluster()
    {
        _admin = null;
        _zkAddr = null;
        _watcher = null;
        _zkClient = null;
        _clusterName = null;
        _quorum = 0;
        _numPartitions = 0;
        _dataNotifiers = null;
        _zkConnectionTimeoutMs = ZkClient.DEFAULT_CONNECTION_TIMEOUT;
        _zkSessionTimeoutMs = ZkClient.DEFAULT_SESSION_TIMEOUT;
    }

  @Override
  public String toString() {
    return "DatabusCluster [_zkClient=" + _zkClient + ", _admin=" + _admin
        + ", _clusterName=" + _clusterName + ", _zkAddr=" + _zkAddr
        + ", _quorum=" + _quorum + ", _numPartitions=" + _numPartitions
        + ", _dataNotifiers=" + _dataNotifiers + ", _watcher="
        + _watcher + ", _zkConnectionTimeoutMs="
        + _zkConnectionTimeoutMs + ", _zkSessionTimeoutMs="
        + _zkSessionTimeoutMs + "]";
  }
}