// Package com.taobao.top.analysis.node.component
//
// Source code of com.taobao.top.analysis.node.component.JobManager

/**
*
*/
package com.taobao.top.analysis.node.component;

import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.zookeeper.ZooKeeper;
import org.jboss.netty.channel.Channel;

import com.taobao.top.analysis.config.MasterConfig;
import com.taobao.top.analysis.exception.AnalysisException;
import com.taobao.top.analysis.node.IJobBuilder;
import com.taobao.top.analysis.node.IJobExporter;
import com.taobao.top.analysis.node.IJobManager;
import com.taobao.top.analysis.node.IJobResultMerger;
import com.taobao.top.analysis.node.event.GetTaskRequestEvent;
import com.taobao.top.analysis.node.event.SendResultsRequestEvent;
import com.taobao.top.analysis.node.job.Job;
import com.taobao.top.analysis.node.job.JobMergedResult;
import com.taobao.top.analysis.node.job.JobTask;
import com.taobao.top.analysis.node.job.JobTaskExecuteInfo;
import com.taobao.top.analysis.node.job.JobTaskResult;
import com.taobao.top.analysis.node.job.JobTaskStatus;
import com.taobao.top.analysis.node.operation.JobDataOperation;
import com.taobao.top.analysis.node.operation.MergeJobOperation;
import com.taobao.top.analysis.util.AnalysisConstants;
import com.taobao.top.analysis.util.AnalyzerZKWatcher;
import com.taobao.top.analysis.util.MasterDataRecoverWorker;
import com.taobao.top.analysis.util.NamedThreadFactory;
import com.taobao.top.analysis.util.ReportUtil;
import com.taobao.top.analysis.util.ZKUtil;


/**
 * JobManager is invoked by MasterNode in a single-threaded fashion.
 * Note that all the built-in Builder, Exporter, ResultMerger and
 * ServerConnector implementations must themselves guarantee adequate
 * processing speed.
 *
 * @author fangweng
 * @Email fangweng@taobao.com 2011-11-28
 *
 */
public class JobManager implements IJobManager {

    private static final Log logger = LogFactory.getLog(JobManager.class);

    private IJobBuilder jobBuilder;
    private IJobExporter jobExporter;
    private IJobResultMerger jobResultMerger;
    private MasterConfig config;
    private MasterNode masterNode;
    /**
     * The set of jobs this manager is responsible for, keyed by job name.
     */
    private Map<String, Job> jobs;

    /**
     * Result data returned by slaves, one queue per job name.
     */
    private Map<String, BlockingQueue<JobTaskResult>> jobTaskResultsQueuePool;
    /**
     * Task pool, keyed by task id.
     * The current allocation approach may distribute tasks unevenly;
     * it is not an ideal allocation strategy.
     */
    private ConcurrentMap<String, JobTask> jobTaskPool;
   
    /**
     * Queue of tasks that have not been handed out yet.
     */
    private BlockingQueue<JobTask> undoTaskQueue;
   
    /**
     * Status of each task, keyed by task id.
     */
    private ConcurrentMap<String, JobTaskStatus> statusPool;
    /**
     * Intermediate results that have not been merged yet, one queue per job name.
     */
    private Map<String, BlockingQueue<JobMergedResult>> branchResultQueuePool;

    /**
     * Thread pool used to process events (async echo, merge, export).
     */
    private ThreadPoolExecutor eventProcessThreadPool;

    /**
     * Merges historical data in the background; when the master fails,
     * slaves record some data locally which is later used for recovery.
     */
    private MasterDataRecoverWorker masterDataRecoverWorker;
   
   
    /**
     * Shutdown flag, set to true on restart/shutdown.
     * Once true, no new tasks are allocated; pending merges are awaited
     * and the intermediate results are exported.
     */
    private volatile boolean stopped = false;
   
    // ZooKeeper handle; null unless config.getZkServer() is set (see init()).
    ZooKeeper zk = null;

    @Override
    public void init() throws AnalysisException {
        // 获得任务数量
        jobBuilder.setConfig(config);
        jobExporter.setConfig(config);
        jobResultMerger.setConfig(config);

        jobBuilder.init();
        jobExporter.init();
        jobResultMerger.init();

        jobs = jobBuilder.build();
        for(Job job : jobs.values()) {
            job.reset(null);
        }

        if (jobs == null || (jobs != null && jobs.size() == 0))
            throw new AnalysisException("jobs should not be empty!");

        jobTaskPool = new ConcurrentHashMap<String, JobTask>();
        undoTaskQueue = new LinkedBlockingDeque<JobTask>();
        statusPool = new ConcurrentHashMap<String, JobTaskStatus>();
        jobTaskResultsQueuePool = new HashMap<String, BlockingQueue<JobTaskResult>>();
        branchResultQueuePool = new HashMap<String, BlockingQueue<JobMergedResult>>();

        for (String jobName : jobs.keySet()) {
            jobTaskResultsQueuePool.put(jobName, new LinkedBlockingQueue<JobTaskResult>());
            branchResultQueuePool.put(jobName, new LinkedBlockingQueue<JobMergedResult>());
        }

        eventProcessThreadPool =
                new ThreadPoolExecutor(this.config.getMaxJobEventWorker(), this.config.getMaxJobEventWorker(), 0,
                    TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(), new NamedThreadFactory(
                        "jobManagerEventProcess_worker"));

        masterDataRecoverWorker =
                new MasterDataRecoverWorker(config.getMasterName(), config.getTempStoreDataDir(), jobs, this.config);
        masterDataRecoverWorker.start();

        addJobsToPool();
       
        if (StringUtils.isNotEmpty(config.getZkServer()))
    {
      try
      {
        AnalyzerZKWatcher<MasterConfig> analyzerZKWatcher =
            new AnalyzerZKWatcher<MasterConfig>(config);
        zk = new ZooKeeper(config.getZkServer(),3000,analyzerZKWatcher);
        analyzerZKWatcher.setZk(zk);
       
        ZKUtil.createGroupNodesIfNotExist(zk,config.getGroupId());
      }
      catch(Exception ex)
      {
        logger.error("zk init error!",ex);
      }
    }

        if (logger.isInfoEnabled())
            logger.info("jobManager init end, MaxJobEventWorker size : " + config.getMaxJobEventWorker());

    }


    /**
     * Releases all resources on shutdown: waits briefly for in-flight tasks,
     * performs a final trunk merge and data export per job, stops the worker
     * pool and recover worker, then clears every pool and releases the
     * builder/exporter/merger components.
     */
    @Override
    public void releaseResource() {
        // Stop allocating new tasks from now on (checked in getUnDoJobTasks).
        stopped = true;

        try {
         // Export all results; intermediate data is not exported here for now.
            // NOTE: without exporting intermediate results, some data may be lost.
            // Wait (up to ~60s overall) for tasks still in DOING state.
            long start = System.currentTimeMillis();
            for(JobTask jobTask : this.jobTaskPool.values()) {
                while(JobTaskStatus.DOING.equals(jobTask.getStatus())) {
                    Thread.sleep(10000);
                    if(System.currentTimeMillis() - start > 60000)
                        break;
                }
            }
           
            if (jobs != null)
                for (Job j : jobs.values()) {
                    // Try to take the trunk write lock; when another thread is
                    // already merging, skip the final merge for this job.
                    boolean gotIt = j.getTrunkLock().writeLock().tryLock();

                    if (gotIt) {
                        try {
                            if (!j.isMerged().get()) {
                                List<Map<String, Map<String, Object>>> mergeResults =
                                        new ArrayList<Map<String, Map<String, Object>>>();
                                // Final trunk merge, run synchronously on this thread.
                                new MergeJobOperation(j, 0, mergeResults, config, branchResultQueuePool.get(j
                                    .getJobName()), true).run();

                                j.isMerged().set(true);
                                logger.warn("job is timeout, last merge trunk success!");
                            }
                        }
                        finally {
                            j.getTrunkLock().writeLock().unlock();
                        }

                    }
                    // Export the job's data synchronously so it can be recovered later.
                    JobDataOperation jobd =
                            new JobDataOperation(j, AnalysisConstants.JOBMANAGER_EVENT_EXPORTDATA, this.config);
                    jobd.run();
                    logger.info("releaseResouce now, export job : " + j.getJobName());
//                    while(!j.getTrunkExported().get())
//                        Thread.sleep(3000);
//                    if (!j.isExported().get()) {
//                        jobExporter.exportReport(j, false);
//                        logger.info("releaseResouce now, export job : " + j.getJobName());
//                    }
                }
            if (eventProcessThreadPool != null)
                eventProcessThreadPool.shutdown();

            if (masterDataRecoverWorker != null)
                masterDataRecoverWorker.stopWorker();
        } catch (Throwable e) {
            logger.error("error when stop the node", e);
        }
        finally {
            // Unconditionally clear all pools/queues and release sub-components,
            // even when the export phase above failed.
            if (jobs != null)
                jobs.clear();

            if (jobTaskPool != null)
                jobTaskPool.clear();
            if(undoTaskQueue != null)
                undoTaskQueue.clear();

            if (statusPool != null)
                statusPool.clear();

            if (jobTaskResultsQueuePool != null)
                jobTaskResultsQueuePool.clear();

            if (branchResultQueuePool != null)
                branchResultQueuePool.clear();

            if (jobBuilder != null)
                jobBuilder.releaseResource();

            if (jobExporter != null)
                jobExporter.releaseResource();

            if (jobResultMerger != null)
                jobResultMerger.releaseResource();

            logger.info("jobManager releaseResource end");

        }
       
    }


    // 分配任务和结果提交处理由于是单线程处理,
    // 因此本身不用做状态池并发控制,将消耗较多的发送操作交给ServerConnector多线程操作
    @Override
    public void getUnDoJobTasks(GetTaskRequestEvent requestEvent) {

        String jobName = requestEvent.getJobName();
        int jobCount = requestEvent.getRequestJobCount();
        final List<JobTask> jobTasks = new ArrayList<JobTask>();

        //如果关闭,则直接返回一个空的JobTask的list给slave
        if(this.stopped) {
            masterNode.echoGetJobTasks(requestEvent.getSequence(), jobTasks, requestEvent.getChannel());
            return;
        }
        // 指定job
        if (jobName != null && jobs.containsKey(jobName)) {
            Job job = jobs.get(jobName);

            List<JobTask> tasks = job.getJobTasks();

            for (JobTask jobTask : tasks) {
                if (jobTask.getStatus().equals(JobTaskStatus.UNDO)) {
                    if (statusPool.replace(jobTask.getTaskId(), JobTaskStatus.UNDO, JobTaskStatus.DOING)) {
                        this.allocateTask(jobTask);
                        jobTasks.add(jobTask);

                        if (jobTasks.size() == jobCount)
                            break;
                    }
                }
            }
        }
        else {
            Iterator<JobTask> taskIter = undoTaskQueue.iterator();

            while (taskIter.hasNext()) {
//                String taskId = taskIds.next();
//                JobTask jobTask = jobTaskPool.get(taskId);
                JobTask jobTask = taskIter.next();
                if (!jobTaskPool.keySet().contains(jobTask.getTaskId())
                        || jobs.get(jobTask.getJobName()).getEpoch().get() > jobTask.getJobEpoch()
                        || jobs.get(jobTask.getJobName()).getJobTimeOut().get()) {
                    taskIter.remove();
                    continue;
                }
               
                if (jobs.get(jobTask.getJobName()).getJobConfig().getSlaveIpCondition() != null) {
                    try {
                        Channel channel = (Channel) requestEvent.getChannel();
                        if (!channel.getRemoteAddress().toString()
                            .matches(jobs.get(jobTask.getJobName()).getJobConfig().getSlaveIpCondition())) {
                            continue;
                        }
                    }
                    catch (Throwable e) {
                        logger.error(e);
                    }
                }

                if (statusPool.get(jobTask.getTaskId()).equals(JobTaskStatus.UNDO)) {
                    if (statusPool.replace(jobTask.getTaskId(), JobTaskStatus.UNDO, JobTaskStatus.DOING)) {
                        this.allocateTask(jobTask);
                        jobTasks.add(jobTask);
                        taskIter.remove();

                        if (jobTasks.size() >= jobCount)
                            break;
                    }
                } else
                    taskIter.remove();
            }
        }

        // 是否需要用异步方式发送,减少对jobManager事件处理延时
        if (config.isUseAsynModeToSendResponse()) {
            final String sequence = requestEvent.getSequence();
            final Object channel = requestEvent.getChannel();

            // 由于该操作比较慢,开线程执行,保证速度
            eventProcessThreadPool.execute(new Runnable() {
                public void run() {
                    try {
                        masterNode.echoGetJobTasks(sequence, jobTasks, channel);
                    }
                    catch (Throwable e) {
                        logger.error(e);
                    }
                }
            });
        }
        else
            masterNode.echoGetJobTasks(requestEvent.getSequence(), jobTasks, requestEvent.getChannel());

    }


    private void allocateTask(JobTask jobTask) {
        jobTask.setStatus(JobTaskStatus.DOING);
        jobTask.setStartTime(System.currentTimeMillis());
    }


    // Task allocation and result submission are driven by a single thread,
    // so the status pool needs no extra concurrency control here; the costly
    // send operations are delegated to the multi-threaded ServerConnector.
    /**
     * Accepts a batch of task results from a slave: rejects stale/epoch-mismatched
     * results, queues valid results for merging, transitions task statuses to
     * DONE, logs per-task execution metrics (and mirrors them to ZooKeeper when
     * configured), then acknowledges the slave.
     */
    @Override
    public void addTaskResultToQueue(SendResultsRequestEvent jobResponseEvent) {

        JobTaskResult jobTaskResult = jobResponseEvent.getJobTaskResult();

        if (jobTaskResult.getTaskIds() != null && jobTaskResult.getTaskIds().size() > 0) {
            // Detect stale data from old tasks by comparing the epoch of the
            // task and of the result.
            // Possible extension: when current epoch < result epoch, this node
            // may be a subordinate master doing reduce work that cannot keep up.
            if(jobTaskPool.get(jobTaskResult.getTaskIds().get(0)) == null) {
                logger.error("jobTask is null " + jobTaskResult.getTaskIds().get(0));
                masterNode.echoSendJobTaskResults(jobResponseEvent.getSequence(), "success", jobResponseEvent.getChannel());
                return;
            }
            if (jobTaskResult.getJobEpoch() != jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobEpoch() && this.config.getDispatchMaster()) {
             
              // Result is stale; most likely the task timed out and was re-dispatched.
                if (jobTaskResult.getJobEpoch() < jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobEpoch()) {
                    logger.error("old task result will be discard! job:" + jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName() + ",epoch:" + jobTaskResult.getJobEpoch() + ",slave:" + jobResponseEvent.getChannel());
                    masterNode.echoSendJobTaskResults(jobResponseEvent.getSequence(), "success", jobResponseEvent.getChannel());
                    return;
                }
                else {
                    // Allow a tolerance window for the job reset (15s in code;
                    // the original comment said 5s — TODO confirm which is intended).
                    jobs.get(jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName()).blockToResetJob(15000);
                   
                    // Somewhat questionable: under what circumstances can this occur?
                    if (jobTaskResult.getJobEpoch() > jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobEpoch()) {
                        logger.error("otherMaster can't merge in time!job:" + jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName() + ",taskResult epoch:" + jobTaskResult.getJobEpoch() + ", task epoch:" + jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobEpoch());
                        masterNode.echoSendJobTaskResults(jobResponseEvent.getSequence(), "success", jobResponseEvent.getChannel());
                        if(!this.config.getDispatchMaster()) {
                            jobs.get(jobTaskResult.getJobName()).reset(this);
                        } else {
                            return;
                        }
                    }
                }
            }

            if (logger.isWarnEnabled()) {
                StringBuilder ts =
                        new StringBuilder("Receive slave analysis result, jobTaskIds : ")
                            .append(jobTaskResult.toString()).append(", ").append(jobTaskResult.getTaskIds().size());
                logger.warn(ts.toString());
            }
            // Once the job has been merged, late results are simply acknowledged.
            if(jobs.get(jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName()).isMerged().get()) {
                masterNode.echoSendJobTaskResults(jobResponseEvent.getSequence(), "success", jobResponseEvent.getChannel());
                return;
            }

            // Queue the result first to guard against a low-probability
            // multi-threading race.
            jobTaskResultsQueuePool.get(jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName()).offer(
                jobTaskResult);
            if(logger.isInfoEnabled()) {
                StringBuilder sb = new StringBuilder("add result [");
                for(String s : jobTaskResult.getTaskIds()) {
                    sb.append(s).append(",");
                }
                sb.append("] to queue:").append(jobTaskPool.get(jobTaskResult.getTaskIds().get(0)).getJobName());
                logger.info(sb.toString());
            }

            // Transition each reported task to DONE and record its metrics.
            Iterator<String> iter = jobTaskResult.getTaskIds().iterator();
            while (iter.hasNext()) {
                String taskId = iter.next();
                JobTask jobTask = jobTaskPool.get(taskId);
               
                if (jobTask == null)
                {    
                  logger.error(new StringBuilder("taskId :").append(taskId).append("not exist!").toString());
                  continue;
                }
               
                Job job = jobs.get(jobTask.getJobName());
                if(job == null) {
                    logger.error(new StringBuilder("job :").append(jobTask.getJobName()).append("not exist!").toString());
                    continue;
                }

                if (statusPool.replace(taskId, JobTaskStatus.DOING, JobTaskStatus.DONE)
                        || statusPool.replace(taskId, JobTaskStatus.UNDO, JobTaskStatus.DONE)) {
                    logger.info("task " + jobTask.getJobName() + " of job " + job.getJobName() + " done");
                    jobTask.setStatus(JobTaskStatus.DONE);
                    jobTask.getTailCursor().compareAndSet(true, false);
                    jobTask.setEndTime(System.currentTimeMillis());
                    jobTask.setLastMergedEpoch(job.getEpoch().get());
                    job.getCompletedTaskCount().incrementAndGet();
                } else {
                    // Non-dispatching master: force the DONE transition and drop
                    // the task id from this result batch.
                    if(!this.config.getDispatchMaster()) {
                        jobTask.setStatus(JobTaskStatus.DONE);
                        jobTask.getTailCursor().compareAndSet(true, false);
                        jobTask.setEndTime(System.currentTimeMillis());
                        jobTask.setLastMergedEpoch(job.getEpoch().get());
                        statusPool.put(taskId, JobTaskStatus.DONE);
                        iter.remove();
                    }
                }
               
                // Instrument the jobTask's execution result for cluster logging.
                StringBuilder log = new StringBuilder(ReportUtil.SLAVE_LOG).append(",timeStamp=")
                          .append(System.currentTimeMillis()).append(",epoch=")
                          .append(job.getEpoch()).append(",jobName=");
                log.append(jobTask.getJobName()).append(",taskId=")
                  .append(jobTask.getTaskId()).append(",recycleCounter=")
                  .append(jobTask.getRecycleCounter().get()).append(",slaveIp=")
                  .append(jobTaskResult.getSlaveIp()).append(",efficiency=")
                  .append(jobTaskResult.getEfficiency()).append(",");
              
                JobTaskExecuteInfo executeInfo = jobTaskResult.getTaskExecuteInfos().get(jobTask.getTaskId());
               
                if (executeInfo != null) {
                    log.append("analysisConsume=").append(executeInfo.getAnalysisConsume()).append(",")
                        .append("jobDataSize=").append(executeInfo.getJobDataSize()).append(",").append("totalLine=")
                        .append(executeInfo.getTotalLine()).append(",").append("errorLine=")
                        .append(executeInfo.getErrorLine()).append(",").append("emptyLine=")
                        .append(executeInfo.getEmptyLine()).append(",fileBegin=").append(executeInfo.getFileBegin())
                        .append(",fileLength=").append(executeInfo.getFileLength());
                    // "hub:" inputs carry a source timestamp and a read cursor
                    // that the job must advance.
                    if(jobTask.getInput().startsWith("hub:")) {
                        jobTask.setJobSourceTimeStamp(executeInfo.getTimestamp());
                        job.updateCursor(jobTask.getUrl(), executeInfo.getFileBegin(), executeInfo.getFileLength(), executeInfo.getTimestamp());
                    }
                }
                else
                  logger.error(new StringBuilder().append("taskId : ").
                      append(jobTask.getTaskId()).append(" executeInfo is null!").toString());
               
                ReportUtil.clusterLog(log.toString());
               
               
                // ZooKeeper support: mirror the per-task runtime log into ZK.
            if (StringUtils.isNotEmpty(config.getZkServer()) && zk != null)
            {
              try
              {            
                ZKUtil.updateOrCreateNode(zk,new StringBuilder()
                      .append(ZKUtil.getGroupMasterZKPath(config.getGroupId()))
                      .append("/").append(config.getMasterName())
                      .append("/runtime/").append(job.getEpoch())
                      .append("/").append(jobTask.getJobName())
                      .append("/").append(jobTask.getTaskId()).toString(),log.toString().getBytes("UTF-8"));
               
              }
              catch(Exception ex)
              {
                logger.error("log to zk error!",ex);
              }
             
            }
               
            }

        }

        // Optionally respond asynchronously to keep event handling latency low.
        if (config.isUseAsynModeToSendResponse()) {
            final String sequence = jobResponseEvent.getSequence();
            final Object channel = jobResponseEvent.getChannel();

            eventProcessThreadPool.execute(new Runnable() {
                public void run() {
                    try {
                        masterNode.echoSendJobTaskResults(sequence, "success", channel);
                    }
                    catch (Throwable e) {
                        logger.error(e);
                    }
                }
            });
        }
        else
            masterNode.echoSendJobTaskResults(jobResponseEvent.getSequence(), "success", jobResponseEvent.getChannel());
    }


    @Override
    public void exportJobData(String jobName) {

        if (jobs.containsKey(jobName)) {
            jobExporter.exportEntryData(jobs.get(jobName));
        }
        else {
            logger.error("exportJobData do nothing, jobName " + jobName + " not exist!");
        }

    }


    @Override
    public void loadJobData(String jobName) {
        if (jobs.containsKey(jobName)) {
            jobExporter.loadEntryData(jobs.get(jobName));
        }
        else {
            logger.error("exportJobData do nothing, jobName " + jobName + " not exist!");
        }
    }


    /**
     * 从某一个备份载入job的临时数据开始恢复
     *
     * @param jobName
     * @param epoch
     */
    @Override
    public void loadJobBackupData(String jobName, String bckPrefix) {
        if (jobs.containsKey(jobName)) {
            jobExporter.loadJobBackupData(jobs.get(jobName), bckPrefix);
        }
        else {
            logger.error("loadJobBackupData do nothing, jobName " + jobName + " not exist!");
        }
    }


    @Override
    public void loadJobDataToTmp(String jobName) {
        if (jobs.containsKey(jobName)) {
            jobExporter.loadEntryDataToTmp(jobs.get(jobName));
        }
        else {
            logger.error("exportJobData do nothing, jobName " + jobName + " not exist!");
        }
    }


    @Override
    public void clearJobData(String jobName) {

        Job job = jobs.get(jobName);

        if (job != null) {
            job.getJobResult().clear();

            if (logger.isWarnEnabled())
                logger.warn("clear job :" + job.getJobName() + " data.");
        }
    }


    @Override
    public synchronized void checkJobStatus() throws AnalysisException {

        // 通过外部事件激发重新载入配置
        if (jobBuilder.isNeedRebuild()) {
            if(logger.isInfoEnabled()) {
                logger.info("check job status need to rebuild");
            }
            jobs = jobBuilder.rebuild(jobs);

            if (jobs == null || (jobs != null && jobs.size() == 0))
                throw new AnalysisException("jobs should not be empty!");
        }

        try {
            if(this.config.getDispatchMaster())
                checkTaskStatus();
        } catch (Throwable e) {
            logger.error("checkTaskStatus Error", e);
        }
       
        // 合并任务,并导出报表
        try {
            mergeAndExportJobs();
        } catch (Throwable e) {
            logger.error("mergeAndExport Error", e);
        }
       
        //任务全部完成并且没有新加任务的情况下,休息1s
        for(Job job : jobs.values()) {
            if(!job.isExported().get() || job.getRebuildTag() == 2) {
                return;
            } else {
                try {
                    Thread.sleep(1000);
                }
                catch (InterruptedException e) {
                    logger.error(e);
                }
            }
        }
       
        // 打点观察Direct Memory区域的大小
        try {
            Class<?> c = Class.forName("java.nio.Bits");
            Field maxMemory = c.getDeclaredField("maxMemory");
            maxMemory.setAccessible(true);
            Field reservedMemory = c.getDeclaredField("reservedMemory");
            reservedMemory.setAccessible(true);
            synchronized (c) {
                Long maxMemoryValue = (Long) maxMemory.get(null);
                Long reservedMemoryValue = (Long) reservedMemory.get(null);
                if (logger.isInfoEnabled()) {
                    logger.info("now the maxMemory is " + String.valueOf(maxMemoryValue)
                            + " and the reservedMemory is " + String.valueOf(reservedMemoryValue));
                }
            }
        }
        catch (Throwable e) {
            logger.error("trying to get java.nio.Bits class failed");
        }

    }


    // 重新增加任务到任务池中
    protected void addJobsToPool() {
        for (Job job : jobs.values()) {
            List<JobTask> tasks = job.getJobTasks();

            for (JobTask task : tasks) {
                jobTaskPool.put(task.getTaskId(), task);
                statusPool.put(task.getTaskId(), task.getStatus());
                undoTaskQueue.offer(task);
            }
           
            if(jobTaskResultsQueuePool.get(job.getJobName()) == null)
                jobTaskResultsQueuePool.put(job.getJobName(), new LinkedBlockingQueue<JobTaskResult>());
            if(branchResultQueuePool.get(job.getJobName()) == null)
                branchResultQueuePool.put(job.getJobName(), new LinkedBlockingQueue<JobMergedResult>());
        }

    }


    // 做合并和导出,重置任务的检查操作
    //所有任务一起来轮询,对Master来讲,有点资源浪费
    //可以通过以下几种方式改进:
    //1、针对job的属性设置监听器,Listener模式
    //2、使用Observer模式
    protected void mergeAndExportJobs() {
        Iterator<Map.Entry<String, Job>> iter = jobs.entrySet().iterator();
        while(iter.hasNext()) {
          Job job = iter.next().getValue();
          if(job.getRebuildTag() == 2) {
              job.rebuild(0, null, this);
              continue;
          }
          if (!job.getJobTimeOut().get())
          {
            // 需要合并该job的task
            if (!job.isMerging().get() && job.needMerge()) {
                logger.warn("job " + job.getJobName()
                     + " complete tasks:" + job.getCompletedTaskCount().get() + ", merged tasks :" + job.getMergedTaskCount().get());
                final Job j = job;
                final BlockingQueue<JobMergedResult> branchResultQueue = branchResultQueuePool.get(j.getJobName());
                final BlockingQueue<JobTaskResult> jobTaskResultsQueue = jobTaskResultsQueuePool.get(j.getJobName());
   
                if (j.isMerging().compareAndSet(false, true))
                    eventProcessThreadPool.execute(new Runnable() {
                        public void run() {
                            try {
                                jobResultMerger.merge(j, branchResultQueue, jobTaskResultsQueue, true);
                            } catch (Throwable e) {
                                logger.error(e);
                            }
                            finally {
                                j.isMerging().set(false);
                            }
                        }
                    });
            }
          }
          else
          {
            // Job超时了, 尝试做一次主干merge
            //判断是否还有和主干合并的线程,如果没有可以设置完成标识
            boolean gotIt = job.getTrunkLock().writeLock().tryLock();
           
            if (gotIt)
            {
              try
              if(!job.isMerged().get())
                {
                  List<Map<String, Map<String, Object>>> mergeResults = new ArrayList<Map<String, Map<String, Object>>>();
                  new MergeJobOperation(job,0,mergeResults,config,branchResultQueuePool.get(job.getJobName())).run();
               
                  job.isMerged().set(true);
                  logger.warn("job is timeout, last merge trunk success!");
                }
              }
              finally
              {
                job.getTrunkLock().writeLock().unlock();
              }
             
            }
           
          }

            // 需要导出该job的数据
            if (!job.isExporting().get() && job.needExport()) {
                final Job j = job;

                if (j.isExporting().compareAndSet(false, true))
                    eventProcessThreadPool.execute(new Runnable() {
                        public void run() {
                            try {
                                // 虽然是多线程,但还是阻塞模式来做
                                jobExporter.exportReport(j, false);
                                j.isExported().set(true);
                            } catch (Throwable e) {
                                logger.error(e);
                            }
                            finally {
                                j.isExporting().set(false);
                            }

                            // 判断是否需要开始导出中间结果,放在外部不妨碍下一次的处理
                            exportOrCleanTrunk(j);
                        }
                    });
            }
           
            //做一次任务处理时间判断,如果超时将设置job的超时状态位置
            if(this.config.getDispatchMaster())
                job.checkJobTimeOut();

            // 任务是否需要被重置
            if (job.needReset() || (!this.config.getDispatchMaster() && job.isExported().get()) ) {
                if(logger.isWarnEnabled())
                    logger.warn("job " + job.getJobName() + " be reset now.");
               
                //检查任务是否需要重新build
                if(job.getRebuildTag() == -1) {
                    job.rebuild(0, null, this);
                    iter.remove();
                }
                if(job.getRebuildTag() == 1) {
                    job.rebuild(0, null, this);
                }
               
              StringBuilder sb = new StringBuilder(ReportUtil.MASTER_LOG).append(",timeStamp=")
                          .append(System.currentTimeMillis()).append(",epoch=");
              sb.append(job.getEpoch()).append(",jobName=")
                .append(job.getJobName()).append(",timeConsume=")
                .append(System.currentTimeMillis() - job.getStartTime()).append(",jobMergeTime=")
                .append(job.getJobMergeTime().get()).append(",jobExportTime=")
                .append(job.getJobExportTime()).append(",taskCount=")
                .append(job.getTaskCount()).append(",completedTaskCount=")
                .append(job.getCompletedTaskCount().get()).append(",mergedTaskCount=")
                .append(job.getMergedTaskCount().get()).append(",jobMergeBranchCount=")
                .append(job.getJobMergeBranchCount().get());
              ReportUtil.clusterLog(sb.toString());
               
              //增加一块对于zookeeper的支持
            if (StringUtils.isNotEmpty(config.getZkServer()) && zk != null)
            {
              try
              {            
                ZKUtil.updateOrCreateNode(zk,new StringBuilder()
                      .append(ZKUtil.getGroupMasterZKPath(config.getGroupId()))
                      .append("/").append(config.getMasterName())
                      .append("/runtime/").append(job.getEpoch())
                      .append("/").append(job.getJobName()).toString(),sb.toString().getBytes("UTF-8"));
               
              }
              catch(Exception ex)
              {
                logger.error("log to zk error!",ex);
              }
             
            }
             

                job.reset(this);
               
                if (logger.isInfoEnabled()) {
                    sb = new StringBuilder("jobManager:{jobs:").append(jobs.size()).append(
                                ",jobTaskPool:" + jobTaskPool.size());
                    sb.append(",statusPool:").append(statusPool.size()).append(",undoTasks:")
                        .append(undoTaskQueue.size()).append("}");
                    logger.info(sb.toString());
                }

                List<JobTask> tasks = job.getJobTasks();

                for (JobTask task : tasks) {
                    statusPool.put(task.getTaskId(), task.getStatus());
                }
            }
        }
    }


    /**
     * 在导出数据以后,判断是否需要清空主干,是否需要导出主干
     *
     * @param job
     */
    protected void exportOrCleanTrunk(Job job) {
        boolean needToSetJobResultNull = false;

        // 判断是否到了报表的有效时间段,支持小时,日,月三种方式
        if (job.getJobConfig().getReportPeriodDefine().equals(AnalysisConstants.REPORT_PERIOD_DAY)) {
            Calendar calendar = Calendar.getInstance();
            int now = calendar.get(Calendar.DAY_OF_MONTH);

            if (job.getReportPeriodFlag() != -1 && now != job.getReportPeriodFlag())
                needToSetJobResultNull = true;

            job.setReportPeriodFlag(now);
        }
        else {
            if (job.getJobConfig().getReportPeriodDefine().equals(AnalysisConstants.REPORT_PERIOD_HOUR)) {
                Calendar calendar = Calendar.getInstance();
                int now = calendar.get(Calendar.HOUR_OF_DAY);

                if (job.getReportPeriodFlag() != -1 && now != job.getReportPeriodFlag())
                    needToSetJobResultNull = true;

                job.setReportPeriodFlag(now);
            }
            else {
                if (job.getJobConfig().getReportPeriodDefine().equals(AnalysisConstants.REPORT_PERIOD_MONTH)) {
                    Calendar calendar = Calendar.getInstance();
                    int now = calendar.get(Calendar.MONTH);

                    if (job.getReportPeriodFlag() != -1 && now != job.getReportPeriodFlag())
                        needToSetJobResultNull = true;

                    job.setReportPeriodFlag(now);
                }
            }
        }

        if (needToSetJobResultNull) {
            job.setJobResult(null);
            job.getEpoch().set(0);

            // 删除临时文件,防止重复载入使得清空不生效
            if (config.getSaveTmpResultToFile()) {
                JobDataOperation jobDataOperation =
                        new JobDataOperation(job, AnalysisConstants.JOBMANAGER_EVENT_DEL_DATAFILE, this.config);
                jobDataOperation.run();
            }
           
            if(logger.isWarnEnabled())
                logger.warn("job " + job.getJobName() + " report data be reset.it's a new start. ");
        }

        // 清除主干数据,到时候自然会载入
        if (config.getSaveTmpResultToFile() && (job.getJobConfig().getSaveTmpResultToFile() == null || job.getJobConfig().getSaveTmpResultToFile())) {
            logger.warn("@disk2Mem mode: start " + job.getJobName() + " store trunk to disk now .");

            JobDataOperation jobDataOperation =
                    new JobDataOperation(job, AnalysisConstants.JOBMANAGER_EVENT_SETNULL_EXPORTDATA, this.config);
            jobDataOperation.run();

        }
        else {
            if (job.getLastExportTime() == 0
                    || System.currentTimeMillis() - job.getLastExportTime() >= config.getExportInterval() || stopped) {
                logger.warn("export job: " + job.getJobName() + " trunk to disk.");

                JobDataOperation jobDataOperation =
                        new JobDataOperation(job, AnalysisConstants.JOBMANAGER_EVENT_EXPORTDATA, this.config);
                jobDataOperation.run();
            }
        }
    }


    // 重置在指定时间内未完成的任务
    protected void checkTaskStatus() {
        Iterator<String> taskIds = statusPool.keySet().iterator();

        while (taskIds.hasNext()) {
            String taskId = taskIds.next();

            JobTaskStatus taskStatus = statusPool.get(taskId);
            JobTask jobTask = jobTaskPool.get(taskId);

            if (taskStatus == JobTaskStatus.DOING && jobTask.getStartTime() != 0
                    && System.currentTimeMillis() - jobTask.getStartTime() >= jobTask.getTaskRecycleTime() * 1000) {
                if (statusPool.replace(taskId, JobTaskStatus.DOING, JobTaskStatus.UNDO)) {
                    jobTask.setStatus(JobTaskStatus.UNDO);
                    undoTaskQueue.offer(jobTask);
                    jobTask.getRecycleCounter().incrementAndGet();

                    if (logger.isWarnEnabled())
                        logger.warn("Task : " + jobTask.getTaskId() + " can't complete in time, it be recycle.");
                }
            }
        }
    }


    /** Returns the master-side configuration this manager runs with. */
    @Override
    public MasterConfig getConfig() {
        return config;
    }


    /** Injects the master-side configuration. */
    @Override
    public void setConfig(MasterConfig config) {
        this.config = config;
    }


    /** Returns the managed jobs (key presumably the job name — confirm against callers). */
    @Override
    public Map<String, Job> getJobs() {
        return jobs;
    }


    /** Replaces the managed-jobs map. */
    @Override
    public void setJobs(Map<String, Job> jobs) {
        this.jobs = jobs;
    }


    /** Returns the component that builds {@link Job} instances from configuration. */
    @Override
    public IJobBuilder getJobBuilder() {
        return jobBuilder;
    }


    /** Injects the job builder. */
    @Override
    public void setJobBuilder(IJobBuilder jobBuilder) {
        this.jobBuilder = jobBuilder;
    }


    /** Returns the component that exports job results. */
    @Override
    public IJobExporter getJobExporter() {
        return jobExporter;
    }


    /** Injects the job exporter. */
    @Override
    public void setJobExporter(IJobExporter jobExporter) {
        this.jobExporter = jobExporter;
    }


    /** Returns the component that merges task results into the job trunk. */
    @Override
    public IJobResultMerger getJobResultMerger() {
        return jobResultMerger;
    }


    /** Injects the job result merger. */
    @Override
    public void setJobResultMerger(IJobResultMerger jobResultMerger) {
        this.jobResultMerger = jobResultMerger;
    }


    /** Wires in the owning master node. */
    @Override
    public void setMasterNode(MasterNode masterNode) {
        this.masterNode = masterNode;
    }


    /**
     * Returns the pool of queues holding task results awaiting merge
     * (one queue per job — key presumably the job name; confirm against callers).
     */
    public Map<String, BlockingQueue<JobTaskResult>> getJobTaskResultsQueuePool() {
        return jobTaskResultsQueuePool;
    }


    /** Replaces the task-result queue pool. */
    public void setJobTaskResultsQueuePool(Map<String, BlockingQueue<JobTaskResult>> jobTaskResultsQueuePool) {
        this.jobTaskResultsQueuePool = jobTaskResultsQueuePool;
    }


    /**
     * Returns the pool of queues holding branch-merged results
     * (one queue per job — key presumably the job name; confirm against callers).
     */
    public Map<String, BlockingQueue<JobMergedResult>> getBranchResultQueuePool() {
        return branchResultQueuePool;
    }


    /** Replaces the branch-result queue pool. */
    public void setBranchResultQueuePool(Map<String, BlockingQueue<JobMergedResult>> branchResultQueuePool) {
        this.branchResultQueuePool = branchResultQueuePool;
    }


    /**
     * Returns the pool of all known job tasks, keyed by task id.
     *
     * @return the jobTaskPool
     */
    public ConcurrentMap<String, JobTask> getJobTaskPool() {
        return jobTaskPool;
    }


    /**
     * Returns the per-task status map, keyed by task id; updated via CAS
     * ({@code replace}) during task recycling.
     *
     * @return the statusPool
     */
    public ConcurrentMap<String, JobTaskStatus> getStatusPool() {
        return statusPool;
    }


    /**
     * Returns the queue of tasks waiting to be (re-)dispatched to workers.
     *
     * @return the undoTaskQueue
     */
    public BlockingQueue<JobTask> getUndoTaskQueue() {
        return undoTaskQueue;
    }

}
TOP

Related Classes of com.taobao.top.analysis.node.component.JobManager

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.